[Feature]: add ignore missing field to text chunking processors (#907)

* feat: add ignore missing field to text chunking processor Signed-off-by: Ian Menendez <ianfmenendezd@gmail.com> Co-authored-by: Ian Menendez <ian.menendez@upstartcommerce.com> (cherry picked from commit 00e622e)
opensearch-project · Oct 3, 2024 · 8a786fe · 8a786fe
1 parent b157213
commit 8a786fe
Show file tree

Hide file tree

Showing 4 changed files with 57 additions and 3 deletions.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -15,6 +15,7 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 ## [Unreleased 2.x](https://github.com/opensearch-project/neural-search/compare/2.17...2.x)
 ### Features
 ### Enhancements
+- Implement `ignore_missing` field in text chunking processors ([#907](https://github.com/opensearch-project/neural-search/pull/907))
 ### Bug Fixes
 ### Infrastructure
 ### Documentation

diff --git a/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java b/src/main/java/org/opensearch/neuralsearch/processor/TextChunkingProcessor.java
@@ -46,10 +46,13 @@ public final class TextChunkingProcessor extends AbstractProcessor {
     public static final String FIELD_MAP_FIELD = "field_map";
     public static final String ALGORITHM_FIELD = "algorithm";
     private static final String DEFAULT_ALGORITHM = FixedTokenLengthChunker.ALGORITHM_NAME;
+    public static final String IGNORE_MISSING = "ignore_missing";
+    public static final boolean DEFAULT_IGNORE_MISSING = false;
 
     private int maxChunkLimit;
     private Chunker chunker;
     private final Map<String, Object> fieldMap;
+    private final boolean ignoreMissing;
     private final ClusterService clusterService;
     private final AnalysisRegistry analysisRegistry;
     private final Environment environment;
@@ -59,12 +62,14 @@ public TextChunkingProcessor(
         final String description,
         final Map<String, Object> fieldMap,
         final Map<String, Object> algorithmMap,
+        final boolean ignoreMissing,
         final Environment environment,
         final ClusterService clusterService,
         final AnalysisRegistry analysisRegistry
     ) {
         super(tag, description);
         this.fieldMap = fieldMap;
+        this.ignoreMissing = ignoreMissing;
         this.environment = environment;
         this.clusterService = clusterService;
         this.analysisRegistry = analysisRegistry;
@@ -75,6 +80,11 @@ public String getType() {
         return TYPE;
     }
 
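+    // Decides whether a leaf value is chunked: always when ignore_missing is false (a null field then yields an empty list); only for non-null values when ignore_missing is true, so a null field is skipped and no target field is written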
+    private boolean shouldProcessChunk(Object chunkObject) {
+        return !ignoreMissing || Objects.nonNull(chunkObject);
+    }
+
     @SuppressWarnings("unchecked")
     private void parseAlgorithmMap(final Map<String, Object> algorithmMap) {
         if (algorithmMap.size() > 1) {
@@ -250,8 +260,11 @@ private void chunkMapType(
             } else {
                 // chunk the object when target key is of leaf type (null, string and list of string)
                 Object chunkObject = sourceAndMetadataMap.get(originalKey);
-                List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
-                sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
+
+                if (shouldProcessChunk(chunkObject)) {
+                    List<String> chunkedResult = chunkLeafType(chunkObject, runtimeParameters);
+                    sourceAndMetadataMap.put(String.valueOf(targetKey), chunkedResult);
+                }
             }
         }
     }

diff --git a/...main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java b/...main/java/org/opensearch/neuralsearch/processor/factory/TextChunkingProcessorFactory.java
@@ -14,7 +14,10 @@
 import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
 import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
 import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.DEFAULT_IGNORE_MISSING;
 import static org.opensearch.ingest.ConfigurationUtils.readMap;
+import static org.opensearch.ingest.ConfigurationUtils.readBooleanProperty;
 
 /**
  * Factory for chunking ingest processor for ingestion pipeline.
@@ -45,6 +48,16 @@ public TextChunkingProcessor create(
     ) throws Exception {
         Map<String, Object> fieldMap = readMap(TYPE, processorTag, config, FIELD_MAP_FIELD);
         Map<String, Object> algorithmMap = readMap(TYPE, processorTag, config, ALGORITHM_FIELD);
-        return new TextChunkingProcessor(processorTag, description, fieldMap, algorithmMap, environment, clusterService, analysisRegistry);
+        boolean ignoreMissing = readBooleanProperty(TYPE, processorTag, config, IGNORE_MISSING, DEFAULT_IGNORE_MISSING);
+        return new TextChunkingProcessor(
+            processorTag,
+            description,
+            fieldMap,
+            algorithmMap,
+            ignoreMissing,
+            environment,
+            clusterService,
+            analysisRegistry
+        );
     }
 }
diff --git a/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java b/src/test/java/org/opensearch/neuralsearch/processor/TextChunkingProcessorTests.java
@@ -42,6 +42,7 @@
 import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.TYPE;
 import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.FIELD_MAP_FIELD;
 import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.ALGORITHM_FIELD;
+import static org.opensearch.neuralsearch.processor.TextChunkingProcessor.IGNORE_MISSING;
 import static org.opensearch.neuralsearch.processor.chunker.Chunker.MAX_CHUNK_LIMIT_FIELD;
 
 public class TextChunkingProcessorTests extends OpenSearchTestCase {
@@ -181,6 +182,20 @@ private TextChunkingProcessor createDelimiterInstance() {
         return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
     }
 
+    @SneakyThrows
+    private TextChunkingProcessor createIgnoreMissingInstance() {
+        Map<String, Object> config = new HashMap<>();
+        Map<String, Object> fieldMap = new HashMap<>();
+        Map<String, Object> algorithmMap = new HashMap<>();
+        algorithmMap.put(DelimiterChunker.ALGORITHM_NAME, createDelimiterParameters());
+        fieldMap.put(INPUT_FIELD, OUTPUT_FIELD);
+        config.put(FIELD_MAP_FIELD, fieldMap);
+        config.put(ALGORITHM_FIELD, algorithmMap);
+        config.put(IGNORE_MISSING, true);
+        Map<String, Processor.Factory> registry = new HashMap<>();
+        return textChunkingProcessorFactory.create(registry, PROCESSOR_TAG, DESCRIPTION, config);
+    }
+
     public void testCreate_whenAlgorithmFieldMissing_thenFail() {
         Map<String, Object> config = new HashMap<>();
         Map<String, Object> fieldMap = new HashMap<>();
@@ -945,4 +960,16 @@ public void testExecute_withDelimiter_andSourceDataString_thenSucceed() {
         expectedPassages.add(" The document contains a single paragraph, two sentences and 24 tokens by standard tokenizer in OpenSearch.");
         assertEquals(expectedPassages, passages);
     }
+
+    @SneakyThrows
+    public void testExecute_withIgnoreMissing_thenSucceed() {
+        Map<String, Object> sourceAndMetadata = new HashMap<>();
+        sourceAndMetadata.put("text_field", "");
+        sourceAndMetadata.put(IndexFieldMapper.NAME, INDEX_NAME);
+        IngestDocument ingestDocument = new IngestDocument(sourceAndMetadata, new HashMap<>());
+
+        TextChunkingProcessor processor = createIgnoreMissingInstance();
+        IngestDocument document = processor.execute(ingestDocument);
+        assertFalse(document.getSourceAndMetadata().containsKey(OUTPUT_FIELD));
+    }
 }