feat: refactor ndjson input format. (#14943)
youngsofun authored Mar 29, 2024
1 parent 29f6bde commit 7452ffc
Showing 19 changed files with 683 additions and 55 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock


4 changes: 2 additions & 2 deletions src/query/formats/src/field_decoder/json_ast.rs
@@ -63,12 +63,12 @@ impl FieldDecoder for FieldJsonAstDecoder {
 }
 
 impl FieldJsonAstDecoder {
-    pub fn create(options: &FileFormatOptionsExt, rounding_mode: bool) -> Self {
+    pub fn create(options: &FileFormatOptionsExt) -> Self {
         FieldJsonAstDecoder {
             timezone: options.timezone,
             ident_case_sensitive: options.ident_case_sensitive,
             is_select: options.is_select,
-            is_rounding_mode: rounding_mode,
+            is_rounding_mode: options.is_rounding_mode,
         }
     }
@@ -179,10 +179,7 @@ impl InputFormatTextBase for InputFormatNDJson {
         _params: &FileFormatParams,
         options: &FileFormatOptionsExt,
     ) -> Arc<dyn FieldDecoder> {
-        Arc::new(FieldJsonAstDecoder::create(
-            options,
-            options.is_rounding_mode,
-        ))
+        Arc::new(FieldJsonAstDecoder::create(options))
     }
 
     fn deserialize(builder: &mut BlockBuilder<Self>, batch: RowBatch) -> Result<()> {
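Together with the json_ast.rs hunk above, this moves the rounding-mode flag into FileFormatOptionsExt itself, so decoder construction takes a single argument. A sketch of the call-site shape, assuming options: &FileFormatOptionsExt is in scope:

    // before: the flag was threaded separately even though it lived on options
    // let decoder = FieldJsonAstDecoder::create(options, options.is_rounding_mode);
    // after: one source of truth
    let decoder = FieldJsonAstDecoder::create(options);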
3 changes: 3 additions & 0 deletions src/query/storages/stage/Cargo.toml
@@ -28,12 +28,15 @@ databend-common-storages-parquet = { path = "../parquet" }
 
 async-backtrace = { workspace = true }
 async-trait = { workspace = true }
+bstr = "1.9.1"
 csv-core = "0.1.11"
 dashmap = { workspace = true }
+enum-as-inner = "0.6.0"
 log = { workspace = true }
 opendal = { workspace = true }
 serde = { workspace = true }
+serde_json = { workspace = true }
 typetag = { workspace = true }
 uuid = { workspace = true }
2 changes: 2 additions & 0 deletions src/query/storages/stage/src/lib.rs
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#![allow(internal_features)]
+#![feature(core_intrinsics)]
 #![feature(impl_trait_in_assoc_type)]
 #![feature(box_patterns)]
 #![allow(clippy::uninlined_format_args)]
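These two attributes enable the nightly core_intrinsics feature (and silence the internal_features lint it triggers) for std::intrinsics::unlikely, which the new NDJSON row iterator calls on its hot path. A minimal standalone sketch of the pattern:

    #![allow(internal_features)]
    #![feature(core_intrinsics)]

    use std::intrinsics::unlikely;

    // Behaves exactly like `end_index < 0`; `unlikely` is only a codegen
    // hint that the branch is rarely taken.
    fn tail_pending(end_index: i32) -> bool {
        unlikely(end_index < 0)
    }

    fn main() {
        assert!(!tail_pending(0));
    }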
157 changes: 143 additions & 14 deletions src/query/storages/stage/src/read/row_based/batch.rs
@@ -12,9 +12,39 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::intrinsics::unlikely;
+
 use databend_common_expression::BlockMetaInfo;
+use enum_as_inner::EnumAsInner;
+use serde::Deserialize;
+use serde::Serialize;
 
-#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct Position {
+    pub path: String,
+    pub rows: usize,
+    pub offset: usize,
+}
+
+impl Position {
+    pub fn new(path: String) -> Self {
+        Self {
+            path,
+            rows: 0,
+            offset: 0,
+        }
+    }
+
+    pub fn from_bytes_batch(batch: &BytesBatch, start_row_id: usize) -> Self {
+        Self {
+            path: batch.path.clone(),
+            rows: start_row_id,
+            offset: batch.offset,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
 pub struct BytesBatch {
     pub data: Vec<u8>,
@@ -45,32 +75,131 @@ impl BlockMetaInfo for BytesBatch {
     }
 }
 
+#[derive(serde::Serialize, serde::Deserialize, Debug)]
+pub struct RowBatchWithPosition {
+    pub data: RowBatch,
+    pub start_pos: Position,
+}
+
+impl RowBatchWithPosition {
+    pub fn new(data: RowBatch, start_pos: Position) -> Self {
+        Self { data, start_pos }
+    }
+}
+
+#[derive(serde::Serialize, serde::Deserialize, Debug, EnumAsInner)]
+pub enum RowBatch {
+    Csv(CSVRowBatch),
+    NDJson(NdjsonRowBatch),
+}
+
+impl RowBatch {
+    pub fn rows(&self) -> usize {
+        match self {
+            RowBatch::Csv(b) => b.rows(),
+            RowBatch::NDJson(b) => b.rows(),
+        }
+    }
+
+    pub fn size(&self) -> usize {
+        match self {
+            RowBatch::Csv(b) => b.size(),
+            RowBatch::NDJson(b) => b.size(),
+        }
+    }
+}
+
 #[derive(serde::Serialize, serde::Deserialize, Debug, Default)]
-pub struct RowBatch {
+pub struct CSVRowBatch {
     /// row[i] starts at row_ends[i-1] and ends at row_ends[i],
     /// and has num_fields[i] fields;
    /// within a row, field[j] starts at field_ends[j-1] and ends at field_ends[j]
     pub data: Vec<u8>,
     pub row_ends: Vec<usize>,
     pub field_ends: Vec<usize>,
     pub num_fields: Vec<usize>,
-
-    pub path: String,
-    pub offset: usize,
-    // start from 0
-    pub start_row_id: usize,
 }
 
+#[derive(serde::Serialize, serde::Deserialize, Debug, Default)]
+pub struct NdjsonRowBatch {
+    // tail of the previous batch, used as the first row of this batch
+    pub tail_of_last_batch: Option<Vec<u8>>,
+
+    // reuses the Vec of a BytesBatch without realloc;
+    // data[..start] should be ignored
+    pub data: Vec<u8>,
+    pub start: usize,
+    pub row_ends: Vec<usize>,
+}
+
+pub struct NdJsonRowBatchIter<'a> {
+    first_row: &'a [u8],
+    data: &'a [u8],
+    row_ends: &'a [usize],
+    end_index: i32,
+    start: usize,
+}
+
-impl RowBatch {
-    pub fn new(raw: &BytesBatch, start_row_id: usize) -> Self {
-        Self {
-            path: raw.path.clone(),
-            offset: raw.offset,
-            start_row_id,
-            ..Default::default()
-        }
-    }
+impl<'a> NdjsonRowBatch {
+    pub fn iter(&'a self) -> NdJsonRowBatchIter<'a> {
+        let (end_index, first_row) = if let Some(row) = &self.tail_of_last_batch {
+            (-1, row)
+        } else {
+            (0, &self.data)
+        };
+        NdJsonRowBatchIter {
+            first_row,
+            end_index,
+            data: &self.data,
+            row_ends: &self.row_ends,
+            start: self.start,
+        }
+    }
+}
+
+impl<'a> Iterator for NdJsonRowBatchIter<'a> {
+    type Item = &'a [u8];
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if unlikely(self.end_index < 0) {
+            self.end_index = 0;
+            Some(self.first_row)
+        } else {
+            let end_index = self.end_index as usize;
+            if end_index >= self.row_ends.len() {
+                None
+            } else {
+                let end = self.row_ends[end_index];
+                let start = self.start;
+                self.start = end;
+                self.end_index += 1;
+                Some(&self.data[start..end])
+            }
+        }
+    }
+}
+
+impl NdjsonRowBatch {
+    pub fn rows(&self) -> usize {
+        self.row_ends.len()
+            + if self.tail_of_last_batch.is_none() {
+                0
+            } else {
+                1
+            }
+    }
+
+    pub fn size(&self) -> usize {
+        self.data.len()
+            + self
+                .tail_of_last_batch
+                .as_ref()
+                .map(|v| v.len())
+                .unwrap_or(0)
+            - self.start
+    }
+}
+
+impl CSVRowBatch {
+    pub fn rows(&self) -> usize {
+        self.row_ends.len()
+    }
@@ -81,7 +210,7 @@ impl RowBatch {
 }
 
 #[typetag::serde(name = "row_batch")]
-impl BlockMetaInfo for RowBatch {
+impl BlockMetaInfo for RowBatchWithPosition {
     fn equals(&self, _info: &Box<dyn BlockMetaInfo>) -> bool {
         unreachable!("RowBatch as BlockMetaInfo is not expected to be compared.")
     }
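To make the new layout concrete, here is a small walk-through (toy values, not from the commit) of how NdjsonRowBatch yields rows. The carried-over tail, when present, is emitted first via the end_index == -1 sentinel; the remaining rows are slices of data between start and successive row_ends offsets:

    let batch = NdjsonRowBatch {
        // last row of the previous BytesBatch, completed by this one
        tail_of_last_batch: Some(b"{\"a\":1}\n".to_vec()),
        // the first 3 bytes belong to data already consumed upstream
        data: b"xxx{\"a\":2}\n{\"a\":3}\n".to_vec(),
        start: 3,
        row_ends: vec![11, 19],
    };
    let rows: Vec<&[u8]> = batch.iter().collect();
    assert_eq!(rows.len(), 3); // agrees with batch.rows()
    assert_eq!(rows[0], b"{\"a\":1}\n"); // the carried-over tail
    assert_eq!(rows[1], b"{\"a\":2}\n"); // data[3..11]
    assert_eq!(batch.size(), 24); // 19 + 8 - 3 bytes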
13 changes: 9 additions & 4 deletions src/query/storages/stage/src/read/row_based/format.rs
@@ -21,18 +21,22 @@ use databend_common_meta_app::principal::FileFormatParams;
 use databend_common_storage::FileStatus;
 
 use super::batch::BytesBatch;
-use super::batch::RowBatch;
+use super::batch::RowBatchWithPosition;
 use super::processors::BlockBuilderState;
 use crate::read::load_context::LoadContext;
 use crate::read::row_based::formats::CsvInputFormat;
+use crate::read::row_based::formats::NdJsonInputFormat;
 
 pub trait SeparatorState: Send + Sync {
-    fn append(&mut self, batch: BytesBatch) -> Result<(Vec<RowBatch>, FileStatus)>;
+    fn append(&mut self, batch: BytesBatch) -> Result<(Vec<RowBatchWithPosition>, FileStatus)>;
 }
 
 pub trait RowDecoder: Send + Sync {
-    fn add(&self, block_builder: &mut BlockBuilderState, batch: RowBatch)
-        -> Result<Vec<DataBlock>>;
+    fn add(
+        &self,
+        block_builder: &mut BlockBuilderState,
+        batch: RowBatchWithPosition,
+    ) -> Result<Vec<DataBlock>>;
 
     fn flush(&self, columns: Vec<Column>, _num_rows: usize) -> Vec<Column> {
         columns
@@ -51,6 +55,7 @@ pub trait RowBasedFileFormat: Sync + Send {
 pub fn create_row_based_file_format(params: &FileFormatParams) -> Arc<dyn RowBasedFileFormat> {
     match params {
         FileFormatParams::Csv(p) => Arc::new(CsvInputFormat { params: p.clone() }),
+        FileFormatParams::NdJson(p) => Arc::new(NdJsonInputFormat { params: p.clone() }),
         _ => {
             unreachable!("Unsupported row based file format")
         }
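With this arm in place, NDJSON stages are routed through the same row-based reader pipeline as CSV. A sketch of the dispatch (ndjson_params is an illustrative value, not from the commit):

    let format = create_row_based_file_format(&FileFormatParams::NdJson(ndjson_params));
    // format is an Arc<dyn RowBasedFileFormat> backed by NdJsonInputFormat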
@@ -27,7 +27,7 @@ use databend_common_pipeline_sources::input_formats::error_utils::get_decode_err
 use databend_common_storage::FileParseError;
 
 use crate::read::load_context::LoadContext;
-use crate::read::row_based::batch::RowBatch;
+use crate::read::row_based::batch::RowBatchWithPosition;
 use crate::read::row_based::format::RowDecoder;
 use crate::read::row_based::formats::csv::CsvInputFormat;
 use crate::read::row_based::processors::BlockBuilderState;
@@ -157,24 +157,29 @@ impl CsvDecoder {
 }
 
 impl RowDecoder for CsvDecoder {
-    fn add(&self, state: &mut BlockBuilderState, batch: RowBatch) -> Result<Vec<DataBlock>> {
+    fn add(
+        &self,
+        state: &mut BlockBuilderState,
+        batch: RowBatchWithPosition,
+    ) -> Result<Vec<DataBlock>> {
+        let data = batch.data.into_csv().unwrap();
         let columns = &mut state.mutable_columns;
         let mut start = 0usize;
         let mut field_end_idx = 0;
-        for (i, end) in batch.row_ends.iter().enumerate() {
-            let num_fields = batch.num_fields[i];
-            let buf = &batch.data[start..*end];
+        for (i, end) in data.row_ends.iter().enumerate() {
+            let num_fields = data.num_fields[i];
+            let buf = &data.data[start..*end];
             if let Err(e) = self.read_row(
                 buf,
                 columns,
-                &batch.field_ends[field_end_idx..field_end_idx + num_fields],
+                &data.field_ends[field_end_idx..field_end_idx + num_fields],
             ) {
                 self.load_context.error_handler.on_error(
                     e,
                     Some((columns, state.num_rows)),
                     &mut state.file_status,
-                    &batch.path,
-                    i + batch.start_row_id,
+                    &batch.start_pos.path,
+                    i + batch.start_pos.rows,
                 )?
             } else {
                 state.num_rows += 1;
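Two details of the reworked add() are worth noting. First, batch.data is now the RowBatch enum, and the EnumAsInner derive on it supplies the into_csv() accessor used above; the unwrap() is safe as long as the CSV separator only ever emits Csv variants. Second, the file path and absolute row id reported on a parse error come from the batch's starting Position rather than from per-batch fields. A toy illustration (values assumed):

    let mut start_pos = Position::new("stage/data.csv".to_string());
    start_pos.rows = 1000; // this batch starts at row 1000 of the file
    // the batch's 3rd row (i == 2) is reported as row 1002:
    assert_eq!(2 + start_pos.rows, 1002);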
@@ -24,11 +24,6 @@ use crate::read::row_based::format::SeparatorState;
 use crate::read::row_based::formats::csv::block_builder::CsvDecoder;
 use crate::read::row_based::formats::csv::separator::CsvReader;
 
-pub struct Position {
-    pub path: String,
-    pub rows: usize,
-    pub offset: usize,
-}
 #[derive(Clone)]
 pub struct CsvInputFormat {
     pub(crate) params: CsvFileFormatParams,
