feat: refactor ndjson input format. (#14943)
youngsofun authored Mar 29, 2024
1 parent 29f6bde commit 7452ffc
Showing 19 changed files with 683 additions and 55 deletions.
3 changes: 3 additions & 0 deletions Cargo.lock


4 changes: 2 additions & 2 deletions src/query/formats/src/field_decoder/json_ast.rs
@@ -63,12 +63,12 @@ impl FieldDecoder for FieldJsonAstDecoder {
 }
 
 impl FieldJsonAstDecoder {
-    pub fn create(options: &FileFormatOptionsExt, rounding_mode: bool) -> Self {
+    pub fn create(options: &FileFormatOptionsExt) -> Self {
         FieldJsonAstDecoder {
             timezone: options.timezone,
             ident_case_sensitive: options.ident_case_sensitive,
             is_select: options.is_select,
-            is_rounding_mode: rounding_mode,
+            is_rounding_mode: options.is_rounding_mode,
         }
     }
@@ -179,10 +179,7 @@ impl InputFormatTextBase for InputFormatNDJson {
         _params: &FileFormatParams,
         options: &FileFormatOptionsExt,
     ) -> Arc<dyn FieldDecoder> {
-        Arc::new(FieldJsonAstDecoder::create(
-            options,
-            options.is_rounding_mode,
-        ))
+        Arc::new(FieldJsonAstDecoder::create(options))
     }
 
     fn deserialize(builder: &mut BlockBuilder<Self>, batch: RowBatch) -> Result<()> {
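Together with the json_ast.rs hunk above, this moves the rounding-mode flag into FileFormatOptionsExt itself, so decoder construction takes a single argument. A sketch of the call-site shape, assuming options: &FileFormatOptionsExt is in scope:

    // before: the flag was threaded separately even though it lived on options
    // let decoder = FieldJsonAstDecoder::create(options, options.is_rounding_mode);
    // after: one source of truth
    let decoder = FieldJsonAstDecoder::create(options);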
3 changes: 3 additions & 0 deletions src/query/storages/stage/Cargo.toml
@@ -28,12 +28,15 @@ databend-common-storages-parquet = { path = "../parquet" }
 
 async-backtrace = { workspace = true }
 async-trait = { workspace = true }
+bstr = "1.9.1"
 csv-core = "0.1.11"
 dashmap = { workspace = true }
+enum-as-inner = "0.6.0"
 log = { workspace = true }
 opendal = { workspace = true }
 serde = { workspace = true }
+serde_json = { workspace = true }
 typetag = { workspace = true }
 uuid = { workspace = true }
2 changes: 2 additions & 0 deletions src/query/storages/stage/src/lib.rs
@@ -12,6 +12,8 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+#![allow(internal_features)]
+#![feature(core_intrinsics)]
 #![feature(impl_trait_in_assoc_type)]
 #![feature(box_patterns)]
 #![allow(clippy::uninlined_format_args)]
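These two attributes enable the nightly core_intrinsics feature (and silence the internal_features lint it triggers) for std::intrinsics::unlikely, which the new NDJSON row iterator calls on its hot path. A minimal standalone sketch of the pattern:

    #![allow(internal_features)]
    #![feature(core_intrinsics)]

    use std::intrinsics::unlikely;

    // Behaves exactly like `end_index < 0`; `unlikely` is only a codegen
    // hint that the branch is rarely taken.
    fn tail_pending(end_index: i32) -> bool {
        unlikely(end_index < 0)
    }

    fn main() {
        assert!(!tail_pending(0));
    }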
157 changes: 143 additions & 14 deletions src/query/storages/stage/src/read/row_based/batch.rs
@@ -12,9 +12,39 @@
 // See the License for the specific language governing permissions and
 // limitations under the License.
 
+use std::intrinsics::unlikely;
+
 use databend_common_expression::BlockMetaInfo;
+use enum_as_inner::EnumAsInner;
+use serde::Deserialize;
+use serde::Serialize;
 
-#[derive(serde::Serialize, serde::Deserialize, Debug)]
+#[derive(Serialize, Deserialize, Debug, Clone)]
+pub struct Position {
+    pub path: String,
+    pub rows: usize,
+    pub offset: usize,
+}
+
+impl Position {
+    pub fn new(path: String) -> Self {
+        Self {
+            path,
+            rows: 0,
+            offset: 0,
+        }
+    }
+
+    pub fn from_bytes_batch(batch: &BytesBatch, start_row_id: usize) -> Self {
+        Self {
+            path: batch.path.clone(),
+            rows: start_row_id,
+            offset: batch.offset,
+        }
+    }
+}
+
+#[derive(Serialize, Deserialize, Debug)]
 pub struct BytesBatch {
     pub data: Vec<u8>,
@@ -45,32 +75,131 @@ impl BlockMetaInfo for BytesBatch {
     }
 }
 
+#[derive(serde::Serialize, serde::Deserialize, Debug)]
+pub struct RowBatchWithPosition {
+    pub data: RowBatch,
+    pub start_pos: Position,
+}
+
+impl RowBatchWithPosition {
+    pub fn new(data: RowBatch, start_pos: Position) -> Self {
+        Self { data, start_pos }
+    }
+}
+
+#[derive(serde::Serialize, serde::Deserialize, Debug, EnumAsInner)]
+pub enum RowBatch {
+    Csv(CSVRowBatch),
+    NDJson(NdjsonRowBatch),
+}
+
+impl RowBatch {
+    pub fn rows(&self) -> usize {
+        match self {
+            RowBatch::Csv(b) => b.rows(),
+            RowBatch::NDJson(b) => b.rows(),
+        }
+    }
+
+    pub fn size(&self) -> usize {
+        match self {
+            RowBatch::Csv(b) => b.size(),
+            RowBatch::NDJson(b) => b.size(),
+        }
+    }
+}
+
 #[derive(serde::Serialize, serde::Deserialize, Debug, Default)]
-pub struct RowBatch {
+pub struct CSVRowBatch {
     /// row[i] starts at row_ends[i-1] and ends at row_ends[i],
     /// and has num_fields[i] fields;
    /// within a row, field[j] starts at field_ends[j-1] and ends at field_ends[j]
     pub data: Vec<u8>,
     pub row_ends: Vec<usize>,
     pub field_ends: Vec<usize>,
     pub num_fields: Vec<usize>,
-
-    pub path: String,
-    pub offset: usize,
-    // start from 0
-    pub start_row_id: usize,
 }
 
+#[derive(serde::Serialize, serde::Deserialize, Debug, Default)]
+pub struct NdjsonRowBatch {
+    // tail of the previous batch, used as the first row of this batch
+    pub tail_of_last_batch: Option<Vec<u8>>,
+
+    // reuses the Vec of a BytesBatch without realloc;
+    // data[..start] should be ignored
+    pub data: Vec<u8>,
+    pub start: usize,
+    pub row_ends: Vec<usize>,
+}
+
+pub struct NdJsonRowBatchIter<'a> {
+    first_row: &'a [u8],
+    data: &'a [u8],
+    row_ends: &'a [usize],
+    end_index: i32,
+    start: usize,
+}
+
-impl RowBatch {
-    pub fn new(raw: &BytesBatch, start_row_id: usize) -> Self {
-        Self {
-            path: raw.path.clone(),
-            offset: raw.offset,
-            start_row_id,
-            ..Default::default()
-        }
-    }
+impl<'a> NdjsonRowBatch {
+    pub fn iter(&'a self) -> NdJsonRowBatchIter<'a> {
+        let (end_index, first_row) = if let Some(row) = &self.tail_of_last_batch {
+            (-1, row)
+        } else {
+            (0, &self.data)
+        };
+        NdJsonRowBatchIter {
+            first_row,
+            end_index,
+            data: &self.data,
+            row_ends: &self.row_ends,
+            start: self.start,
+        }
+    }
+}
+
+impl<'a> Iterator for NdJsonRowBatchIter<'a> {
+    type Item = &'a [u8];
+
+    fn next(&mut self) -> Option<Self::Item> {
+        if unlikely(self.end_index < 0) {
+            self.end_index = 0;
+            Some(self.first_row)
+        } else {
+            let end_index = self.end_index as usize;
+            if end_index >= self.row_ends.len() {
+                None
+            } else {
+                let end = self.row_ends[end_index];
+                let start = self.start;
+                self.start = end;
+                self.end_index += 1;
+                Some(&self.data[start..end])
+            }
+        }
+    }
+}
+
+impl NdjsonRowBatch {
+    pub fn rows(&self) -> usize {
+        self.row_ends.len()
+            + if self.tail_of_last_batch.is_none() {
+                0
+            } else {
+                1
+            }
+    }
+
+    pub fn size(&self) -> usize {
+        self.data.len()
+            + self
+                .tail_of_last_batch
+                .as_ref()
+                .map(|v| v.len())
+                .unwrap_or(0)
+            - self.start
+    }
+}
+
+impl CSVRowBatch {
+    pub fn rows(&self) -> usize {
+        self.row_ends.len()
+    }
@@ -81,7 +210,7 @@ impl RowBatch {
 }
 
 #[typetag::serde(name = "row_batch")]
-impl BlockMetaInfo for RowBatch {
+impl BlockMetaInfo for RowBatchWithPosition {
     fn equals(&self, _info: &Box<dyn BlockMetaInfo>) -> bool {
         unreachable!("RowBatch as BlockMetaInfo is not expected to be compared.")
     }
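To make the new layout concrete, here is a small walk-through (toy values, not from the commit) of how NdjsonRowBatch yields rows. The carried-over tail, when present, is emitted first via the end_index == -1 sentinel; the remaining rows are slices of data between start and successive row_ends offsets:

    let batch = NdjsonRowBatch {
        // last row of the previous BytesBatch, completed by this one
        tail_of_last_batch: Some(b"{\"a\":1}\n".to_vec()),
        // the first 3 bytes belong to data already consumed upstream
        data: b"xxx{\"a\":2}\n{\"a\":3}\n".to_vec(),
        start: 3,
        row_ends: vec![11, 19],
    };
    let rows: Vec<&[u8]> = batch.iter().collect();
    assert_eq!(rows.len(), 3); // agrees with batch.rows()
    assert_eq!(rows[0], b"{\"a\":1}\n"); // the carried-over tail
    assert_eq!(rows[1], b"{\"a\":2}\n"); // data[3..11]
    assert_eq!(batch.size(), 24); // 19 + 8 - 3 bytes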
13 changes: 9 additions & 4 deletions src/query/storages/stage/src/read/row_based/format.rs
@@ -21,18 +21,22 @@ use databend_common_meta_app::principal::FileFormatParams;
 use databend_common_storage::FileStatus;
 
 use super::batch::BytesBatch;
-use super::batch::RowBatch;
+use super::batch::RowBatchWithPosition;
 use super::processors::BlockBuilderState;
 use crate::read::load_context::LoadContext;
 use crate::read::row_based::formats::CsvInputFormat;
+use crate::read::row_based::formats::NdJsonInputFormat;
 
 pub trait SeparatorState: Send + Sync {
-    fn append(&mut self, batch: BytesBatch) -> Result<(Vec<RowBatch>, FileStatus)>;
+    fn append(&mut self, batch: BytesBatch) -> Result<(Vec<RowBatchWithPosition>, FileStatus)>;
 }
 
 pub trait RowDecoder: Send + Sync {
-    fn add(&self, block_builder: &mut BlockBuilderState, batch: RowBatch)
-        -> Result<Vec<DataBlock>>;
+    fn add(
+        &self,
+        block_builder: &mut BlockBuilderState,
+        batch: RowBatchWithPosition,
+    ) -> Result<Vec<DataBlock>>;
 
     fn flush(&self, columns: Vec<Column>, _num_rows: usize) -> Vec<Column> {
         columns
@@ -51,6 +55,7 @@ pub trait RowBasedFileFormat: Sync + Send {
 pub fn create_row_based_file_format(params: &FileFormatParams) -> Arc<dyn RowBasedFileFormat> {
     match params {
         FileFormatParams::Csv(p) => Arc::new(CsvInputFormat { params: p.clone() }),
+        FileFormatParams::NdJson(p) => Arc::new(NdJsonInputFormat { params: p.clone() }),
         _ => {
             unreachable!("Unsupported row based file format")
         }
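With this arm in place, NDJSON stages are routed through the same row-based reader pipeline as CSV. A sketch of the dispatch (ndjson_params is an illustrative value, not from the commit):

    let format = create_row_based_file_format(&FileFormatParams::NdJson(ndjson_params));
    // format is an Arc<dyn RowBasedFileFormat> backed by NdJsonInputFormat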
@@ -27,7 +27,7 @@ use databend_common_pipeline_sources::input_formats::error_utils::get_decode_err
 use databend_common_storage::FileParseError;
 
 use crate::read::load_context::LoadContext;
-use crate::read::row_based::batch::RowBatch;
+use crate::read::row_based::batch::RowBatchWithPosition;
 use crate::read::row_based::format::RowDecoder;
 use crate::read::row_based::formats::csv::CsvInputFormat;
 use crate::read::row_based::processors::BlockBuilderState;
@@ -157,24 +157,29 @@ impl CsvDecoder {
 }
 
 impl RowDecoder for CsvDecoder {
-    fn add(&self, state: &mut BlockBuilderState, batch: RowBatch) -> Result<Vec<DataBlock>> {
+    fn add(
+        &self,
+        state: &mut BlockBuilderState,
+        batch: RowBatchWithPosition,
+    ) -> Result<Vec<DataBlock>> {
+        let data = batch.data.into_csv().unwrap();
         let columns = &mut state.mutable_columns;
         let mut start = 0usize;
         let mut field_end_idx = 0;
-        for (i, end) in batch.row_ends.iter().enumerate() {
-            let num_fields = batch.num_fields[i];
-            let buf = &batch.data[start..*end];
+        for (i, end) in data.row_ends.iter().enumerate() {
+            let num_fields = data.num_fields[i];
+            let buf = &data.data[start..*end];
             if let Err(e) = self.read_row(
                 buf,
                 columns,
-                &batch.field_ends[field_end_idx..field_end_idx + num_fields],
+                &data.field_ends[field_end_idx..field_end_idx + num_fields],
             ) {
                 self.load_context.error_handler.on_error(
                     e,
                     Some((columns, state.num_rows)),
                     &mut state.file_status,
-                    &batch.path,
-                    i + batch.start_row_id,
+                    &batch.start_pos.path,
+                    i + batch.start_pos.rows,
                 )?
             } else {
                 state.num_rows += 1;
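Two details of the reworked add() are worth noting. First, batch.data is now the RowBatch enum, and the EnumAsInner derive on it supplies the into_csv() accessor used above; the unwrap() is safe as long as the CSV separator only ever emits Csv variants. Second, the file path and absolute row id reported on a parse error come from the batch's starting Position rather than from per-batch fields. A toy illustration (values assumed):

    let mut start_pos = Position::new("stage/data.csv".to_string());
    start_pos.rows = 1000; // this batch starts at row 1000 of the file
    // the batch's 3rd row (i == 2) is reported as row 1002:
    assert_eq!(2 + start_pos.rows, 1002);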
@@ -24,11 +24,6 @@ use crate::read::row_based::format::SeparatorState;
 use crate::read::row_based::formats::csv::block_builder::CsvDecoder;
 use crate::read::row_based::formats::csv::separator::CsvReader;
 
-pub struct Position {
-    pub path: String,
-    pub rows: usize,
-    pub offset: usize,
-}
 #[derive(Clone)]
 pub struct CsvInputFormat {
     pub(crate) params: CsvFileFormatParams,
