Lower parquet row group size for image datasets #833
@@ -14,10 +14,12 @@
import datasets
import datasets.config
import datasets.info
import numpy as np
import requests
from datasets import (
    DownloadConfig,
    get_dataset_config_info,
    get_dataset_config_names,
    get_dataset_infos,
    load_dataset_builder,
@@ -650,6 +652,25 @@ def raise_if_too_big_from_external_data_files(
    ) from error


def get_writer_batch_size(ds_config_info: datasets.info.DatasetInfo) -> Optional[int]:
    """
    Get the writer_batch_size that defines the maximum row group size in the parquet files.
    The default in `datasets` is 1,000 but we lower it to 100 for image datasets.
    This allows optimizing random access to the parquet files, since accessing one row requires
    reading its entire row group.

    Args:
        ds_config_info (`datasets.info.DatasetInfo`):
            Dataset info from `datasets`.

    Returns:
        writer_batch_size (`Optional[int]`):
            Writer batch size to pass to a dataset builder.
            If `None`, then it will use the `datasets` default.
    """
    return 100 if "Image(" in str(ds_config_info.features) else None


def compute_parquet_and_dataset_info_response(
    dataset: str,
    hf_endpoint: str,
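For context, a minimal usage sketch of the helper added above (not part of the diff; it assumes `get_writer_batch_size` is defined as in the hunk, and uses feature classes from the `datasets` library; the expected values follow the docstring):

```python
import datasets
import datasets.info

# An image dataset: the features repr contains "Image(", so the helper returns 100.
image_info = datasets.info.DatasetInfo(
    features=datasets.Features({"image": datasets.Image(), "label": datasets.Value("int64")})
)
# A text dataset: no image feature, so the helper returns None and the
# `datasets` default of 1,000 rows per row group is kept.
text_info = datasets.info.DatasetInfo(features=datasets.Features({"text": datasets.Value("string")}))

assert get_writer_batch_size(image_info) == 100
assert get_writer_batch_size(text_info) is None
```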
@@ -774,12 +795,17 @@ def compute_parquet_and_dataset_info_response(
    parquet_files: List[ParquetFile] = []
    dataset_info: dict[str, Any] = {}
    for config in config_names:
        ds_config_info = get_dataset_config_info(
            path=dataset, config_name=config, revision=source_revision, use_auth_token=hf_token
        )
        writer_batch_size = get_writer_batch_size(ds_config_info)
        builder = load_dataset_builder(path=dataset, name=config, revision=source_revision, use_auth_token=hf_token)
        raise_if_too_big_from_external_data_files(
            builder=builder,
            max_dataset_size=max_dataset_size,
            max_external_data_files=max_external_data_files,
            hf_token=hf_token,
            writer_batch_size=writer_batch_size,
        )
        builder.download_and_prepare(file_format="parquet")  # the parquet files are stored in the cache dir
        dataset_info[config] = asdict(builder.info)  # type: ignore

Review comment (on the `get_dataset_config_info` call): Also: it can rely on streaming and fail if the dataset does not support streaming, right? In that case, it defeats the purpose of downloading the dataset. Maybe handle the case with a try/except block?
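A minimal sketch of the try/except handling suggested in the comment above (an assumption about how it could look, not the merged fix): if fetching the config info fails, for example because the dataset does not support streaming, fall back to the `datasets` default instead of failing the whole job.

```python
try:
    ds_config_info = get_dataset_config_info(
        path=dataset, config_name=config, revision=source_revision, use_auth_token=hf_token
    )
    writer_batch_size = get_writer_batch_size(ds_config_info)
except Exception:
    # e.g. the dataset does not support streaming: keep the `datasets` default row group size
    writer_batch_size = None
```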
@@ -860,7 +886,7 @@ def get_job_type() -> str:

    @staticmethod
    def get_version() -> str:
-        return "1.1.0"
+        return "1.2.0"

    def __init__(
        self,
Review comment: Can we define a constant for `100`?

Reply: we can add a constants.py file as in libcommon.
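A rough sketch of what that could look like (module and constant names are assumptions for illustration, following the "constants.py as in libcommon" suggestion, not the actual change):

```python
# constants.py (hypothetical module in the worker package)
PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS = 100


# The helper would then read the named constant instead of a magic number:
from typing import Optional

import datasets.info

# from .constants import PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS  # hypothetical import path


def get_writer_batch_size(ds_config_info: datasets.info.DatasetInfo) -> Optional[int]:
    return PARQUET_ROW_GROUP_SIZE_FOR_IMAGE_DATASETS if "Image(" in str(ds_config_info.features) else None
```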