Commit

Merge branch 'main' into test-hfh-rc-0.14
testbot committed Apr 24, 2023
2 parents 0c51b8e + 649d5a3 commit 515196a
Showing 8 changed files with 180 additions and 99 deletions.
18 changes: 16 additions & 2 deletions src/datasets/arrow_dataset.py
@@ -381,6 +381,20 @@ def to_tf_dataset(
         else:
             raise ImportError("Called a Tensorflow-specific function but Tensorflow is not installed.")

+        if (isinstance(columns, list) and len(columns) == 1) or (
+            isinstance(label_cols, list) and len(label_cols) == 1
+        ):
+            warnings.warn(
+                "The output of `to_tf_dataset` will change when a passing single element list for `labels` or "
+                "`columns` in the next datasets version. To return a tuple structure rather than dict, pass a "
+                "single string.\n"
+                "Old behaviour: columns=['a'], labels=['labels'] -> (tf.Tensor, tf.Tensor) \n"
+                "             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) \n"
+                "New behaviour: columns=['a'],labels=['labels'] -> ({'a': tf.Tensor}, {'labels': tf.Tensor}) \n"
+                "             : columns='a', labels='labels' -> (tf.Tensor, tf.Tensor) ",
+                FutureWarning,
+            )
+
         if isinstance(tf.distribute.get_strategy(), tf.distribute.TPUStrategy):
             logger.warning(
                 "Note that to_tf_dataset() loads the data with a generator rather than a full tf.data "
@@ -1437,8 +1451,8 @@ def save_to_disk(
                         else:
                             pbar.update(content)
         else:
-            for kwargs in kwargs_per_job:
-                with pbar:
+            with pbar:
+                for kwargs in kwargs_per_job:
                     for job_id, done, content in Dataset._save_to_disk_single(**kwargs):
                         if done:
                             shards_done += 1
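For orientation, the new FutureWarning above announces a change in what `to_tf_dataset` returns when `columns` or `label_cols` is a single-element list. A standalone sketch, not from this diff (the toy columns and the TensorFlow requirement are assumptions):

from datasets import Dataset

ds = Dataset.from_dict({"tokens": [[1, 2], [3, 4]], "labels": [0, 1]})

# A single-element list currently behaves like a bare string:
tf_ds = ds.to_tf_dataset(columns=["tokens"], label_cols=["labels"], batch_size=2)
# today            -> (tf.Tensor, tf.Tensor) batches
# after the change -> ({"tokens": tf.Tensor}, {"labels": tf.Tensor}) batches

# To keep the tuple output once the change lands, pass plain strings:
tf_ds = ds.to_tf_dataset(columns="tokens", label_cols="labels", batch_size=2)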
46 changes: 24 additions & 22 deletions src/datasets/builder.py
@@ -1485,13 +1485,14 @@ def _prepare_split(
             result = None
             gen_kwargs = split_generator.gen_kwargs
             job_id = 0
-            for job_id, done, content in self._prepare_split_single(
-                gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
-            ):
-                if done:
-                    result = content
-                else:
-                    pbar.update(content)
+            with pbar:
+                for job_id, done, content in self._prepare_split_single(
+                    gen_kwargs=gen_kwargs, job_id=job_id, **_prepare_split_args
+                ):
+                    if done:
+                        result = content
+                    else:
+                        pbar.update(content)
             # wrapping everything into lists for consistency with the multiprocessed code path
             assert result is not None, "Failed to retrieve results from prepare_split"
             examples_per_job, bytes_per_job, features_per_job, shards_per_job, shard_lengths_per_job = [
@@ -1513,21 +1514,22 @@ def _prepare_split(
             shard_lengths_per_job = [None] * num_jobs

             with Pool(num_proc) as pool:
-                for job_id, done, content in iflatmap_unordered(
-                    pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
-                ):
-                    if done:
-                        # the content is the result of the job
-                        (
-                            examples_per_job[job_id],
-                            bytes_per_job[job_id],
-                            features_per_job[job_id],
-                            shards_per_job[job_id],
-                            shard_lengths_per_job[job_id],
-                        ) = content
-                    else:
-                        # the content is the number of examples progress update
-                        pbar.update(content)
+                with pbar:
+                    for job_id, done, content in iflatmap_unordered(
+                        pool, self._prepare_split_single, kwargs_iterable=kwargs_per_job
+                    ):
+                        if done:
+                            # the content is the result of the job
+                            (
+                                examples_per_job[job_id],
+                                bytes_per_job[job_id],
+                                features_per_job[job_id],
+                                shards_per_job[job_id],
+                                shard_lengths_per_job[job_id],
+                            ) = content
+                        else:
+                            # the content is the number of examples progress update
+                            pbar.update(content)

             assert (
                 None not in examples_per_job
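The recurring edit in this file (and in `arrow_dataset.py` above) wraps the job loops in `with pbar:` so the progress bar is entered once and closed exactly once, even if a job raises. A minimal sketch of the pattern with plain `tqdm`; the library routes this through its own logging utilities, so the names below are illustrative:

from tqdm import tqdm

kwargs_per_job = [{"job_id": i} for i in range(3)]  # stand-in for the real per-job kwargs
pbar = tqdm(total=len(kwargs_per_job), desc="jobs")

with pbar:  # closes the bar on normal exit and on exceptions
    for kwargs in kwargs_per_job:
        # ... run the job ...
        pbar.update(1)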
107 changes: 59 additions & 48 deletions src/datasets/features/features.py
Expand Up @@ -634,6 +634,9 @@ def __init__(self, shape: tuple, dtype: str):
raise ValueError("You must instantiate an array type with a value for dim that is > 1")
if len(shape) != self.ndims:
raise ValueError(f"shape={shape} and ndims={self.ndims} don't match")
for dim in range(1, self.ndims):
if shape[dim] is None:
raise ValueError(f"Support only dynamic size on first dimension. Got: {shape}")
self.shape = tuple(shape)
self.value_type = dtype
self.storage_dtype = self._generate_dtype(self.value_type)
@@ -711,53 +714,54 @@ def __getitem__(self, i):

     def to_numpy(self, zero_copy_only=True):
         storage: pa.ListArray = self.storage
-        size = 1
-
-        null_indices = np.arange(len(storage))[storage.is_null().to_numpy(zero_copy_only=False)]
-
-        for i in range(self.type.ndims):
-            size *= self.type.shape[i]
-            storage = storage.flatten()
-        numpy_arr = storage.to_numpy(zero_copy_only=zero_copy_only)
-        numpy_arr = numpy_arr.reshape(len(self) - len(null_indices), *self.type.shape)
-
-        if len(null_indices):
-            numpy_arr = np.insert(numpy_arr.astype(np.float64), null_indices, np.nan, axis=0)
-
-        return numpy_arr
-
-    def to_list_of_numpy(self, zero_copy_only=True):
-        storage: pa.ListArray = self.storage
-        shape = self.type.shape
-        ndims = self.type.ndims
-
-        for dim in range(1, ndims):
-            if shape[dim] is None:
-                raise ValueError(f"Support only dynamic size on first dimension. Got: {shape}")
-
-        arrays = []
-        first_dim_offsets = np.array([off.as_py() for off in storage.offsets])
-        for i, is_null in enumerate(storage.is_null().to_numpy(zero_copy_only=False)):
-            if is_null:
-                arrays.append(np.nan)
-            else:
-                storage_el = storage[i : i + 1]
-                first_dim = first_dim_offsets[i + 1] - first_dim_offsets[i]
-                # flatten storage
-                for _ in range(ndims):
-                    storage_el = storage_el.flatten()
-
-                numpy_arr = storage_el.to_numpy(zero_copy_only=zero_copy_only)
-                arrays.append(numpy_arr.reshape(first_dim, *shape[1:]))
-
-        return arrays
+        if self.type.shape[0] is not None:
+            size = 1
+            null_indices = np.arange(len(storage))[storage.is_null().to_numpy(zero_copy_only=False)]
+
+            for i in range(self.type.ndims):
+                size *= self.type.shape[i]
+                storage = storage.flatten()
+            numpy_arr = storage.to_numpy(zero_copy_only=zero_copy_only)
+            numpy_arr = numpy_arr.reshape(len(self) - len(null_indices), *self.type.shape)
+
+            if len(null_indices):
+                numpy_arr = np.insert(numpy_arr.astype(np.float64), null_indices, np.nan, axis=0)
+
+        else:
+            shape = self.type.shape
+            ndims = self.type.ndims
+            arrays = []
+            first_dim_offsets = np.array([off.as_py() for off in storage.offsets])
+            for i, is_null in enumerate(storage.is_null().to_numpy(zero_copy_only=False)):
+                if is_null:
+                    arrays.append(np.nan)
+                else:
+                    storage_el = storage[i : i + 1]
+                    first_dim = first_dim_offsets[i + 1] - first_dim_offsets[i]
+                    # flatten storage
+                    for _ in range(ndims):
+                        storage_el = storage_el.flatten()
+
+                    numpy_arr = storage_el.to_numpy(zero_copy_only=zero_copy_only)
+                    arrays.append(numpy_arr.reshape(first_dim, *shape[1:]))
+
+            if len(np.unique(np.diff(first_dim_offsets))) > 1:
+                # ragged
+                numpy_arr = np.empty(len(arrays), dtype=object)
+                numpy_arr[:] = arrays
+            else:
+                numpy_arr = np.array(arrays)
+
+        return numpy_arr

     def to_pylist(self):
         zero_copy_only = _is_zero_copy_only(self.storage.type, unnest=True)
-        if self.type.shape[0] is None:
-            return self.to_list_of_numpy(zero_copy_only=zero_copy_only)
+        numpy_arr = self.to_numpy(zero_copy_only=zero_copy_only)
+        if self.type.shape[0] is None and numpy_arr.dtype == object:
+            return [arr.tolist() for arr in numpy_arr.tolist()]
         else:
-            return self.to_numpy(zero_copy_only=zero_copy_only).tolist()
+            return numpy_arr.tolist()


 class PandasArrayExtensionDtype(PandasExtensionDtype):
@@ -767,16 +771,10 @@ def __init__(self, value_type: Union["PandasArrayExtensionDtype", np.dtype]):
         self._value_type = value_type

     def __from_arrow__(self, array: Union[pa.Array, pa.ChunkedArray]):
-        if array.type.shape[0] is None:
-            raise NotImplementedError(
-                "Dynamic first dimension is not supported for "
-                f"PandasArrayExtensionDtype, dimension: {array.type.shape}"
-            )
-        zero_copy_only = _is_zero_copy_only(array.type, unnest=True)
         if isinstance(array, pa.ChunkedArray):
-            numpy_arr = np.vstack([chunk.to_numpy(zero_copy_only=zero_copy_only) for chunk in array.chunks])
-        else:
-            numpy_arr = array.to_numpy(zero_copy_only=zero_copy_only)
+            array = array.type.wrap_array(pa.concat_arrays([chunk.storage for chunk in array.chunks]))
+        zero_copy_only = _is_zero_copy_only(array.storage.type, unnest=True)
+        numpy_arr = array.to_numpy(zero_copy_only=zero_copy_only)
         return PandasArrayExtensionArray(numpy_arr)

     @classmethod
@@ -832,12 +830,25 @@ def copy(self, deep: bool = False) -> "PandasArrayExtensionArray":
     def _from_sequence(
         cls, scalars, dtype: Optional[PandasArrayExtensionDtype] = None, copy: bool = False
     ) -> "PandasArrayExtensionArray":
-        data = np.array(scalars, dtype=dtype if dtype is None else dtype.value_type, copy=copy)
+        if len(scalars) > 1 and all(
+            isinstance(x, np.ndarray) and x.shape == scalars[0].shape and x.dtype == scalars[0].dtype for x in scalars
+        ):
+            data = np.array(scalars, dtype=dtype if dtype is None else dtype.value_type, copy=copy)
+        else:
+            data = np.empty(len(scalars), dtype=object)
+            data[:] = scalars
         return cls(data, copy=copy)

     @classmethod
     def _concat_same_type(cls, to_concat: Sequence_["PandasArrayExtensionArray"]) -> "PandasArrayExtensionArray":
-        data = np.vstack([va._data for va in to_concat])
+        if len(to_concat) > 1 and all(
+            va._data.shape == to_concat[0]._data.shape and va._data.dtype == to_concat[0]._data.dtype
+            for va in to_concat
+        ):
+            data = np.vstack([va._data for va in to_concat])
+        else:
+            data = np.empty(len(to_concat), dtype=object)
+            data[:] = [va._data for va in to_concat]
         return cls(data, copy=False)

     @property
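A note on the ragged case handled above: when the first dimension varies between rows, the rows cannot be stacked into one rectangular array, so they are packed into a 1-D object array instead. A standalone sketch of that NumPy idiom (toy data, not the library code):

import numpy as np

rows = [np.arange(4).reshape(2, 2), np.arange(6).reshape(3, 2)]  # ragged first dimension

ragged = np.empty(len(rows), dtype=object)  # 1-D container of Python objects
ragged[:] = rows  # slice assignment; np.array(rows) would reject ragged input on recent NumPy
print(ragged.shape)               # (2,)
print([a.shape for a in ragged])  # [(2, 2), (3, 2)]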
18 changes: 4 additions & 14 deletions src/datasets/formatting/formatting.py
@@ -168,16 +168,9 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
             if isinstance(pa_array.type, _ArrayXDExtensionType):
                 # don't call to_pylist() to preserve dtype of the fixed-size array
                 zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
-                if pa_array.type.shape[0] is None:
-                    array: List = [
-                        row
-                        for chunk in pa_array.chunks
-                        for row in chunk.to_list_of_numpy(zero_copy_only=zero_copy_only)
-                    ]
-                else:
-                    array: List = [
-                        row for chunk in pa_array.chunks for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
-                    ]
+                array: List = [
+                    row for chunk in pa_array.chunks for row in chunk.to_numpy(zero_copy_only=zero_copy_only)
+                ]
             else:
                 zero_copy_only = _is_zero_copy_only(pa_array.type) and all(
                     not _is_array_with_nulls(chunk) for chunk in pa_array.chunks
@@ -189,10 +182,7 @@ def _arrow_array_to_numpy(self, pa_array: pa.Array) -> np.ndarray:
             if isinstance(pa_array.type, _ArrayXDExtensionType):
                 # don't call to_pylist() to preserve dtype of the fixed-size array
                 zero_copy_only = _is_zero_copy_only(pa_array.type.storage_dtype, unnest=True)
-                if pa_array.type.shape[0] is None:
-                    array: List = pa_array.to_list_of_numpy(zero_copy_only=zero_copy_only)
-                else:
-                    array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only)
+                array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only)
             else:
                 zero_copy_only = _is_zero_copy_only(pa_array.type) and not _is_array_with_nulls(pa_array)
                 array: List = pa_array.to_numpy(zero_copy_only=zero_copy_only).tolist()
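The branches removed here special-cased dynamic first dimensions; with the features.py changes above, `to_numpy` covers both cases, and the comment's point about preserving the fixed-size dtype still holds. A small end-to-end sketch using the public API, assuming a fixed-shape Array2D column (the column name and values are made up):

from datasets import Array2D, Dataset, Features

features = Features({"m": Array2D(shape=(2, 2), dtype="int32")})
ds = Dataset.from_dict({"m": [[[1, 2], [3, 4]], [[5, 6], [7, 8]]]}, features=features)

ds = ds.with_format("numpy")
print(ds[0]["m"].shape, ds[0]["m"].dtype)  # (2, 2) int32 -- no to_pylist() round-trip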
4 changes: 3 additions & 1 deletion src/datasets/load.py
@@ -701,7 +701,9 @@ def __init__(
         increase_load_count(name, resource_type="dataset")

     def get_module(self) -> DatasetModule:
-        base_path = str(Path(self.data_dir).resolve()) if self.data_dir is not None else str(Path().resolve())
+        base_path = (
+            str(Path(self.data_dir).expanduser().resolve()) if self.data_dir is not None else str(Path().resolve())
+        )
         patterns = (
             sanitize_patterns(self.data_files) if self.data_files is not None else get_data_patterns_locally(base_path)
         )
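The added `expanduser()` matters for a `data_dir` like `~/data`: `Path.resolve()` alone does not expand the tilde, so the path would be treated as a literal `~` directory under the current working directory. A quick standalone illustration:

from pathlib import Path

p = Path("~/my_data")
print(p.resolve())               # <cwd>/~/my_data  -- tilde kept literally
print(p.expanduser().resolve())  # e.g. /home/<user>/my_data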
9 changes: 6 additions & 3 deletions src/datasets/packaged_modules/json/json.py
@@ -84,10 +84,11 @@ def _generate_tables(self, files):

             # We accept two format: a list of dicts or a dict of lists
             if isinstance(dataset, (list, tuple)):
-                mapping = {col: [dataset[i][col] for i in range(len(dataset))] for col in dataset[0].keys()}
+                keys = set().union(*[row.keys() for row in dataset])
+                mapping = {col: [row.get(col) for row in dataset] for col in keys}
             else:
                 mapping = dataset
-            pa_table = pa.Table.from_pydict(mapping=mapping)
+            pa_table = pa.Table.from_pydict(mapping)
             yield file_idx, self._cast_table(pa_table)

             # If the file has one json object per line
@@ -137,7 +138,9 @@ def _generate_tables(self, files):
                         # If possible, parse the file as a list of json objects and exit the loop
                         if isinstance(dataset, list):  # list is the only sequence type supported in JSON
                             try:
-                                pa_table = pa.Table.from_pylist(dataset)
+                                keys = set().union(*[row.keys() for row in dataset])
+                                mapping = {col: [row.get(col) for row in dataset] for col in keys}
+                                pa_table = pa.Table.from_pydict(mapping)
                             except (pa.ArrowInvalid, AttributeError) as e:
                                 logger.error(f"Failed to read file '{file}' with error {type(e)}: {e}")
                                 raise ValueError(f"Not able to read records in the JSON file at {file}.") from None
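Both JSON changes build the table from the union of keys across objects, so rows may omit fields (missing values become nulls) instead of every row having to match the first row's keys. A standalone sketch of the same construction with pyarrow (toy rows):

import pyarrow as pa

rows = [{"a": 1, "b": "x"}, {"a": 2}]  # the second object has no "b" field

keys = set().union(*[row.keys() for row in rows])
mapping = {col: [row.get(col) for row in rows] for col in keys}
pa_table = pa.Table.from_pydict(mapping)
print(pa_table.to_pydict())  # {'a': [1, 2], 'b': ['x', None]} (column order may vary)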

1 comment on commit 515196a

@github-actions

Show benchmarks

PyArrow==8.0.0

Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.006983 / 0.011353 (-0.004369) 0.004999 / 0.011008 (-0.006009) 0.096240 / 0.038508 (0.057732) 0.035838 / 0.023109 (0.012729) 0.341877 / 0.275898 (0.065979) 0.369595 / 0.323480 (0.046115) 0.005317 / 0.007986 (-0.002668) 0.005049 / 0.004328 (0.000721) 0.072605 / 0.004250 (0.068354) 0.042662 / 0.037052 (0.005610) 0.348376 / 0.258489 (0.089887) 0.378950 / 0.293841 (0.085109) 0.034685 / 0.128546 (-0.093861) 0.011782 / 0.075646 (-0.063864) 0.331215 / 0.419271 (-0.088056) 0.050371 / 0.043533 (0.006839) 0.340836 / 0.255139 (0.085697) 0.366519 / 0.283200 (0.083319) 0.099752 / 0.141683 (-0.041931) 1.429070 / 1.452155 (-0.023084) 1.496852 / 1.492716 (0.004135)

Benchmark: benchmark_getitem_100B.json

metric                         new / old (diff)
get_batch_of_1024_random_rows  0.211981 / 0.018006 (0.193975)
get_batch_of_1024_rows         0.443783 / 0.000490 (0.443293)
get_first_row                  0.003308 / 0.000200 (0.003108)
get_last_row                   0.000079 / 0.000054 (0.000025)

Benchmark: benchmark_indices_mapping.json

metric            new / old (diff)
select            0.026999 / 0.037411 (-0.010412)
shard             0.105175 / 0.014526 (0.090649)
shuffle           0.117171 / 0.176557 (-0.059386)
sort              0.176597 / 0.737135 (-0.560538)
train_test_split  0.122451 / 0.296338 (-0.173887)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.396380 / 0.215209 (0.181171) 3.973747 / 2.077655 (1.896092) 1.788963 / 1.504120 (0.284843) 1.598681 / 1.541195 (0.057486) 1.631392 / 1.468490 (0.162902) 0.700018 / 4.584777 (-3.884758) 3.737008 / 3.745712 (-0.008705) 2.073801 / 5.269862 (-3.196061) 1.433344 / 4.565676 (-3.132332) 0.085575 / 0.424275 (-0.338700) 0.012118 / 0.007607 (0.004511) 0.494828 / 0.226044 (0.268783) 4.952871 / 2.268929 (2.683943) 2.252335 / 55.444624 (-53.192289) 1.916730 / 6.876477 (-4.959747) 2.033176 / 2.142072 (-0.108896) 0.833034 / 4.805227 (-3.972194) 0.166629 / 6.500664 (-6.334035) 0.063304 / 0.075469 (-0.012165)

Benchmark: benchmark_map_filter.json

metric                        new / old (diff)
filter                        1.171178 / 1.841788 (-0.670609)
map fast-tokenizer batched    14.561896 / 8.074308 (6.487588)
map identity                  14.481285 / 10.191392 (4.289893)
map identity batched          0.164320 / 0.680424 (-0.516104)
map no-op batched             0.017807 / 0.534201 (-0.516394)
map no-op batched numpy       0.423918 / 0.579283 (-0.155366)
map no-op batched pandas      0.426620 / 0.434364 (-0.007744)
map no-op batched pytorch     0.488252 / 0.540337 (-0.052085)
map no-op batched tensorflow  0.583960 / 1.386936 (-0.802976)
PyArrow==latest
Show updated benchmarks!

Benchmark: benchmark_array_xd.json

metric read_batch_formatted_as_numpy after write_array2d read_batch_formatted_as_numpy after write_flattened_sequence read_batch_formatted_as_numpy after write_nested_sequence read_batch_unformated after write_array2d read_batch_unformated after write_flattened_sequence read_batch_unformated after write_nested_sequence read_col_formatted_as_numpy after write_array2d read_col_formatted_as_numpy after write_flattened_sequence read_col_formatted_as_numpy after write_nested_sequence read_col_unformated after write_array2d read_col_unformated after write_flattened_sequence read_col_unformated after write_nested_sequence read_formatted_as_numpy after write_array2d read_formatted_as_numpy after write_flattened_sequence read_formatted_as_numpy after write_nested_sequence read_unformated after write_array2d read_unformated after write_flattened_sequence read_unformated after write_nested_sequence write_array2d write_flattened_sequence write_nested_sequence
new / old (diff) 0.007499 / 0.011353 (-0.003854) 0.005187 / 0.011008 (-0.005822) 0.075486 / 0.038508 (0.036978) 0.033966 / 0.023109 (0.010857) 0.336030 / 0.275898 (0.060132) 0.377033 / 0.323480 (0.053553) 0.006003 / 0.007986 (-0.001983) 0.004140 / 0.004328 (-0.000188) 0.075070 / 0.004250 (0.070819) 0.045750 / 0.037052 (0.008697) 0.334974 / 0.258489 (0.076485) 0.387718 / 0.293841 (0.093877) 0.035543 / 0.128546 (-0.093004) 0.012201 / 0.075646 (-0.063445) 0.086935 / 0.419271 (-0.332336) 0.054980 / 0.043533 (0.011448) 0.331399 / 0.255139 (0.076260) 0.362834 / 0.283200 (0.079635) 0.112511 / 0.141683 (-0.029172) 1.445385 / 1.452155 (-0.006769) 1.522921 / 1.492716 (0.030204)

Benchmark: benchmark_getitem_100B.json

metric                         new / old (diff)
get_batch_of_1024_random_rows  0.260105 / 0.018006 (0.242099)
get_batch_of_1024_rows         0.443679 / 0.000490 (0.443189)
get_first_row                  0.009552 / 0.000200 (0.009352)
get_last_row                   0.000097 / 0.000054 (0.000042)

Benchmark: benchmark_indices_mapping.json

metric            new / old (diff)
select            0.029773 / 0.037411 (-0.007639)
shard             0.110785 / 0.014526 (0.096259)
shuffle           0.125409 / 0.176557 (-0.051147)
sort              0.176518 / 0.737135 (-0.560618)
train_test_split  0.128222 / 0.296338 (-0.168116)

Benchmark: benchmark_iterating.json

metric read 5000 read 50000 read_batch 50000 10 read_batch 50000 100 read_batch 50000 1000 read_formatted numpy 5000 read_formatted pandas 5000 read_formatted tensorflow 5000 read_formatted torch 5000 read_formatted_batch numpy 5000 10 read_formatted_batch numpy 5000 1000 shuffled read 5000 shuffled read 50000 shuffled read_batch 50000 10 shuffled read_batch 50000 100 shuffled read_batch 50000 1000 shuffled read_formatted numpy 5000 shuffled read_formatted_batch numpy 5000 10 shuffled read_formatted_batch numpy 5000 1000
new / old (diff) 0.421865 / 0.215209 (0.206656) 4.218994 / 2.077655 (2.141339) 1.990595 / 1.504120 (0.486475) 1.803292 / 1.541195 (0.262097) 1.879167 / 1.468490 (0.410677) 0.710071 / 4.584777 (-3.874706) 3.832449 / 3.745712 (0.086737) 2.119593 / 5.269862 (-3.150268) 1.343408 / 4.565676 (-3.222269) 0.086740 / 0.424275 (-0.337535) 0.012299 / 0.007607 (0.004692) 0.521463 / 0.226044 (0.295418) 5.216671 / 2.268929 (2.947742) 2.444842 / 55.444624 (-52.999782) 2.136350 / 6.876477 (-4.740127) 2.295467 / 2.142072 (0.153394) 0.847802 / 4.805227 (-3.957425) 0.170957 / 6.500664 (-6.329707) 0.064908 / 0.075469 (-0.010561)

Benchmark: benchmark_map_filter.json

metric                        new / old (diff)
filter                        1.256369 / 1.841788 (-0.585419)
map fast-tokenizer batched    14.950106 / 8.074308 (6.875797)
map identity                  14.158535 / 10.191392 (3.967143)
map identity batched          0.158599 / 0.680424 (-0.521824)
map no-op batched             0.017299 / 0.534201 (-0.516902)
map no-op batched numpy       0.419499 / 0.579283 (-0.159784)
map no-op batched pandas      0.418593 / 0.434364 (-0.015771)
map no-op batched pytorch     0.492269 / 0.540337 (-0.048069)
map no-op batched tensorflow  0.590274 / 1.386936 (-0.796662)
