Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

REF: simplify internals.construction #38400

Merged
merged 5 commits into from
Dec 11, 2020
Merged
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 44 additions & 44 deletions pandas/core/internals/construction.py
Original file line number Diff line number Diff line change
Expand Up @@ -524,59 +524,49 @@ def to_arrays(
if columns is not None:
return [[]] * len(columns), columns
return [], [] # columns if columns is not None else []
if isinstance(data[0], (list, tuple)):
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
elif isinstance(data[0], abc.Mapping):
return _list_of_dict_to_arrays(
data, columns, coerce_float=coerce_float, dtype=dtype
)
elif isinstance(data[0], ABCSeries):
return _list_of_series_to_arrays(
data, columns, coerce_float=coerce_float, dtype=dtype
)

elif isinstance(data[0], Categorical):
if columns is None:
columns = ibase.default_index(len(data))
return data, columns
elif (
isinstance(data, (np.ndarray, ABCSeries, Index))
and data.dtype.names is not None
):

elif isinstance(data, np.ndarray) and data.dtype.names is not None:
# e.g. recarray
columns = list(data.dtype.names)
arrays = [data[k] for k in columns]
return arrays, columns

if isinstance(data[0], (list, tuple)):
content, columns = _list_to_arrays(data, columns)
elif isinstance(data[0], abc.Mapping):
content, columns = _list_of_dict_to_arrays(data, columns)
elif isinstance(data[0], ABCSeries):
content, columns = _list_of_series_to_arrays(data, columns)
else:
# last ditch effort
data = [tuple(x) for x in data]
return _list_to_arrays(data, columns, coerce_float=coerce_float, dtype=dtype)
content, columns = _list_to_arrays(data, columns)

content, columns = _finalize_columns_and_data(content, columns, dtype, coerce_float)
return content, columns


def _list_to_arrays(
data: List[Scalar],
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[DtypeObj] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
if len(data) > 0 and isinstance(data[0], tuple):
content = list(lib.to_object_array_tuples(data).T)
# Note: we already check len(data) > 0 before getting here
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can probably add a type for the returns here (internal and local to construction.py).

if isinstance(data[0], tuple):
content = lib.to_object_array_tuples(data)
else:
# list of lists
content = list(lib.to_object_array(data).T)
# gh-26429 do not raise user-facing AssertionError
try:
columns = _validate_or_indexify_columns(content, columns)
result = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
except AssertionError as e:
raise ValueError(e) from e
return result, columns
content = lib.to_object_array(data)
return content, columns


def _list_of_series_to_arrays(
data: List,
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[DtypeObj] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
if columns is None:
# We know pass_data is non-empty because data[0] is a Series
Expand All @@ -599,22 +589,14 @@ def _list_of_series_to_arrays(
values = extract_array(s, extract_numpy=True)
aligned_values.append(algorithms.take_1d(values, indexer))

values = np.vstack(aligned_values)
content = np.vstack(aligned_values)

if values.dtype == np.object_:
content = list(values.T)
columns = _validate_or_indexify_columns(content, columns)
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
return content, columns
else:
return values.T, columns
return content, columns


def _list_of_dict_to_arrays(
data: List[Dict],
columns: Union[Index, List],
coerce_float: bool = False,
dtype: Optional[DtypeObj] = None,
) -> Tuple[List[Scalar], Union[Index, List[Axis]]]:
"""
Convert list of dicts to numpy arrays
Expand All @@ -629,8 +611,6 @@ def _list_of_dict_to_arrays(
data : iterable
collection of records (OrderedDict, dict)
columns: iterables or None
coerce_float : bool
dtype : np.dtype

Returns
-------
Expand All @@ -646,9 +626,29 @@ def _list_of_dict_to_arrays(
# classes
data = [(type(d) is dict) and d or dict(d) for d in data]

content = list(lib.dicts_to_array(data, list(columns)).T)
columns = _validate_or_indexify_columns(content, columns)
content = _convert_object_array(content, dtype=dtype, coerce_float=coerce_float)
content = lib.dicts_to_array(data, list(columns))
return content, columns


def _finalize_columns_and_data(
    content: np.ndarray,
    columns: Optional[Union[Index, List]],
    dtype: Optional[DtypeObj],
    coerce_float: bool,
) -> Tuple[List[np.ndarray], Union[Index, List[Axis]]]:
    """
    Ensure we have valid columns, cast object dtypes if possible.

    Parameters
    ----------
    content : np.ndarray
        Array whose transpose is split into a list of arrays.
    columns : Index, list or None
        Column labels, passed to ``_validate_or_indexify_columns``.
    dtype : dtype or None
        Forwarded to ``_convert_object_array``.
    coerce_float : bool
        Forwarded to ``_convert_object_array``.

    Returns
    -------
    tuple
        ``(arrays, columns)`` where ``arrays`` is the list built from
        ``content.T`` (converted when object-dtype) and ``columns`` is
        whatever ``_validate_or_indexify_columns`` returned.

    Raises
    ------
    ValueError
        If ``_validate_or_indexify_columns`` raises an AssertionError.
    """
    arrays = list(content.T)

    try:
        columns = _validate_or_indexify_columns(arrays, columns)
    except AssertionError as err:
        # GH#26429 do not raise user-facing AssertionError
        raise ValueError(err) from err

    # Only the first array's dtype is inspected; an empty list or a
    # non-object first array is returned without conversion.
    if not len(arrays) or arrays[0].dtype != np.object_:
        return arrays, columns

    converted = _convert_object_array(
        arrays, dtype=dtype, coerce_float=coerce_float
    )
    return converted, columns


Expand Down