Backport PR #37302 on branch 1.1.x (BUG: Allow empty chunksize in stata reader when using iterator) (#37364)

* Backport PR #37302: BUG: Allow empty chunksize in stata reader when using iterator

* remove match argument to assert_produces_warning

Co-authored-by: Kevin Sheppard <bashtage@users.noreply.github.com>
Co-authored-by: Simon Hawkins <simonjayhawkins@gmail.com>
3 people authored Oct 24, 2020
1 parent 60edd86 commit c132858
Showing 3 changed files with 57 additions and 42 deletions.
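The user-visible effect of the fix: a StataReader used as an iterator no longer requires chunksize to be set up front. A minimal sketch of the fixed behavior, assuming a hypothetical file example.dta (any Stata dataset behaves the same):

    import pandas as pd

    with pd.read_stata("example.dta", iterator=True) as reader:
        for chunk in reader:    # previously raised ValueError unless chunksize was set
            print(chunk.shape)  # chunksize now defaults to 1, so 1-row chunks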
1 change: 1 addition & 0 deletions doc/source/whatsnew/v1.1.4.rst
@@ -22,6 +22,7 @@ Fixed regressions
- Fixed regression in :class:`RollingGroupby` causing a segmentation fault with Index of dtype object (:issue:`36727`)
- Fixed regression in :class:`PeriodDtype` comparing both equal and unequal to its string representation (:issue:`37265`)
- Fixed regression in certain offsets (:meth:`pd.offsets.Day() <pandas.tseries.offsets.Day>` and below) no longer being hashable (:issue:`37267`)
- Fixed regression in :class:`StataReader` which required ``chunksize`` to be manually set when using an iterator to read a dataset (:issue:`37280`)

.. ---------------------------------------------------------------------------
80 changes: 41 additions & 39 deletions pandas/io/stata.py
@@ -477,7 +477,7 @@ class PossiblePrecisionLoss(Warning):


precision_loss_doc = """
Column converted from %s to %s, and some data are outside of the lossless
Column converted from {0} to {1}, and some data are outside of the lossless
conversion range. This may result in a loss of precision in the saved data.
"""

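For context on the "lossless conversion range" in this warning: float64 has a 53-bit significand, so not every integer above 2**53 is exactly representable. A quick check in plain Python:

    # 2**53 is the largest n such that every integer in [-n, n]
    # maps exactly onto a float64; one past it rounds back down.
    assert float(2**53) == 2**53
    assert float(2**53 + 1) == 2**53  # precision lost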
@@ -551,7 +551,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
object in a DataFrame.
"""
ws = ""
# original, if small, if large
# original, if small, if large
conversion_data = (
(np.bool_, np.int8, np.int8),
(np.uint8, np.int8, np.int16),
@@ -571,7 +571,7 @@ def _cast_to_stata_types(data: DataFrame) -> DataFrame:
dtype = c_data[1]
else:
dtype = c_data[2]
if c_data[2] == np.float64: # Warn if necessary
if c_data[2] == np.int64: # Warn if necessary
if data[col].max() >= 2 ** 53:
ws = precision_loss_doc.format("uint64", "float64")

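A sketch of when this warning fires in practice, mirroring the regression test added below (the output path is hypothetical):

    import pandas as pd

    df = pd.DataFrame({"big": [2**60]})  # int64, well beyond the 2**53 limit
    # Stata has no 64-bit integer type, so the column is written as float64
    # and to_stata emits PossiblePrecisionLoss.
    df.to_stata("demo.dta", write_index=False)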
@@ -635,12 +635,12 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
self.value_labels = list(zip(np.arange(len(categories)), categories))
self.value_labels.sort(key=lambda x: x[0])
self.text_len = 0
self.off: List[int] = []
self.val: List[int] = []
self.txt: List[bytes] = []
self.n = 0

# Compute lengths and setup lists of offsets and labels
offsets: List[int] = []
values: List[int] = []
for vl in self.value_labels:
category = vl[1]
if not isinstance(category, str):
@@ -650,9 +650,9 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
ValueLabelTypeMismatch,
)
category = category.encode(encoding)
self.off.append(self.text_len)
offsets.append(self.text_len)
self.text_len += len(category) + 1 # +1 for the padding
self.val.append(vl[0])
values.append(vl[0])
self.txt.append(category)
self.n += 1

@@ -663,8 +663,8 @@ def __init__(self, catarray: Series, encoding: str = "latin-1"):
)

# Ensure int32
self.off = np.array(self.off, dtype=np.int32)
self.val = np.array(self.val, dtype=np.int32)
self.off = np.array(offsets, dtype=np.int32)
self.val = np.array(values, dtype=np.int32)

# Total length
self.len = 4 + 4 + 4 * self.n + 4 * self.n + self.text_len
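A worked instance of the length formula above, for two hypothetical labels b"lo" and b"hi" (pure arithmetic):

    n = 2                                           # number of value labels
    text_len = (len(b"lo") + 1) + (len(b"hi") + 1)  # +1 byte padding each -> 6
    total = 4 + 4 + 4 * n + 4 * n + text_len        # header + offsets + values + text = 30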
@@ -876,23 +876,23 @@ def __init__(self):
# with a label, but the underlying variable is -127 to 100
# we're going to drop the label and cast to int
self.DTYPE_MAP = dict(
list(zip(range(1, 245), ["a" + str(i) for i in range(1, 245)]))
list(zip(range(1, 245), [np.dtype("a" + str(i)) for i in range(1, 245)]))
+ [
(251, np.int8),
(252, np.int16),
(253, np.int32),
(254, np.float32),
(255, np.float64),
(251, np.dtype(np.int8)),
(252, np.dtype(np.int16)),
(253, np.dtype(np.int32)),
(254, np.dtype(np.float32)),
(255, np.dtype(np.float64)),
]
)
self.DTYPE_MAP_XML = dict(
[
(32768, np.uint8), # Keys to GSO
(65526, np.float64),
(65527, np.float32),
(65528, np.int32),
(65529, np.int16),
(65530, np.int8),
(32768, np.dtype(np.uint8)), # Keys to GSO
(65526, np.dtype(np.float64)),
(65527, np.dtype(np.float32)),
(65528, np.dtype(np.int32)),
(65529, np.dtype(np.int16)),
(65530, np.dtype(np.int8)),
]
)
self.TYPE_MAP = list(range(251)) + list("bhlfd")
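The mapping change above swaps scalar type classes for concrete np.dtype instances; equality with the old keys is preserved, which is easy to verify in plain NumPy:

    import numpy as np

    assert np.dtype(np.int8) == np.int8      # a dtype instance equals its scalar type
    assert np.dtype("a5") == np.dtype("S5")  # "a5" is a 5-byte fixed-width bytes dtype
    assert np.dtype("a5").itemsize == 5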
@@ -1050,9 +1050,10 @@ def __init__(
self._order_categoricals = order_categoricals
self._encoding = ""
self._chunksize = chunksize
if self._chunksize is not None and (
not isinstance(chunksize, int) or chunksize <= 0
):
self._using_iterator = False
if self._chunksize is None:
self._chunksize = 1
elif not isinstance(chunksize, int) or chunksize <= 0:
raise ValueError("chunksize must be a positive integer when set.")

# State variables for the file
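Under the new constructor logic, an omitted chunksize quietly defaults to 1, while invalid explicit values are still rejected up front. A sketch, with a hypothetical path:

    from pandas.io.stata import StataReader

    reader = StataReader("data.dta")  # OK: chunksize defaults to 1 internally
    # Still rejected immediately:
    #   StataReader("data.dta", chunksize=0)        -> ValueError
    #   StataReader("data.dta", chunksize="apple")  -> ValueError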
@@ -1062,7 +1063,7 @@ def __init__(
self._column_selector_set = False
self._value_labels_read = False
self._data_read = False
self._dtype = None
self._dtype: Optional[np.dtype] = None
self._lines_read = 0

self._native_byteorder = _set_endianness(sys.byteorder)
@@ -1195,7 +1196,7 @@ def _read_new_header(self) -> None:
# Get data type information, works for versions 117-119.
def _get_dtypes(
self, seek_vartypes: int
) -> Tuple[List[Union[int, str]], List[Union[int, np.dtype]]]:
) -> Tuple[List[Union[int, str]], List[Union[str, np.dtype]]]:

self.path_or_buf.seek(seek_vartypes)
raw_typlist = [
@@ -1519,11 +1520,8 @@ def _read_strls(self) -> None:
self.GSO[str(v_o)] = decoded_va

def __next__(self) -> DataFrame:
if self._chunksize is None:
raise ValueError(
"chunksize must be set to a positive integer to use as an iterator."
)
return self.read(nrows=self._chunksize or 1)
self._using_iterator = True
return self.read(nrows=self._chunksize)

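__next__ now just flags that an iterator is in use (consumed by the categorical conversion further down) and reads self._chunksize rows, instead of raising when chunksize was never set. A sketch of the observable behavior, with a hypothetical path:

    from pandas.io.stata import StataReader

    with StataReader("data.dta") as reader:
        first = next(reader)          # works now; a 1-row chunk by default
        more = reader.get_chunk(100)  # an explicit size still overrides per call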
def get_chunk(self, size: Optional[int] = None) -> DataFrame:
"""
@@ -1692,11 +1690,15 @@ def any_startswith(x: str) -> bool:
convert = False
for col in data:
dtype = data[col].dtype
if dtype in (np.float16, np.float32):
dtype = np.float64
if dtype in (np.dtype(np.float16), np.dtype(np.float32)):
dtype = np.dtype(np.float64)
convert = True
elif dtype in (np.int8, np.int16, np.int32):
dtype = np.int64
elif dtype in (
np.dtype(np.int8),
np.dtype(np.int16),
np.dtype(np.int32),
):
dtype = np.dtype(np.int64)
convert = True
retyped_data.append((col, data[col].astype(dtype)))
if convert:
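Behavior here is unchanged by the np.dtype wrapping: narrow floats widen to float64 and narrow ints to int64 when dtypes are not preserved. This path is reached through the public preserve_dtypes keyword, e.g. with a hypothetical file:

    import pandas as pd

    # int8/int16/int32 -> int64 and float16/float32 -> float64 on read
    df = pd.read_stata("demo.dta", preserve_dtypes=False)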
@@ -1807,14 +1809,14 @@ def _do_convert_categoricals(
keys = np.array(list(vl.keys()))
column = data[col]
key_matches = column.isin(keys)
if self._chunksize is not None and key_matches.all():
initial_categories = keys
if self._using_iterator and key_matches.all():
initial_categories: Optional[np.ndarray] = keys
# If all categories are in the keys and we are iterating,
# use the same keys for all chunks. If some are missing
# value labels, then we will fall back to the categories
# varying across chunks.
else:
if self._chunksize is not None:
if self._using_iterator:
# warn if using an iterator
warnings.warn(
categorical_conversion_warning, CategoricalConversionWarning
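The guard now keys off actual iterator use rather than a merely-set chunksize: when every value in a labelled column is covered by the value labels, the same categories are reused across chunks; otherwise the reader warns that categories may vary. A sketch, with a hypothetical file:

    import pandas as pd

    with pd.read_stata("labelled.dta", chunksize=100) as reader:
        for chunk in reader:
            pass  # CategoricalConversionWarning only if some values lack labels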
@@ -2010,7 +2012,7 @@ def _convert_datetime_to_stata_type(fmt: str) -> np.dtype:
"ty",
"%ty",
]:
return np.float64 # Stata expects doubles for SIFs
return np.dtype(np.float64) # Stata expects doubles for SIFs
else:
raise NotImplementedError(f"Format {fmt} not implemented")

18 changes: 15 additions & 3 deletions pandas/tests/io/test_stata.py
@@ -1966,9 +1966,6 @@ def test_iterator_errors(dirpath):
StataReader(dta_file, chunksize=0)
with pytest.raises(ValueError, match="chunksize must be a positive"):
StataReader(dta_file, chunksize="apple")
with pytest.raises(ValueError, match="chunksize must be set to a positive"):
with StataReader(dta_file) as reader:
reader.__next__()


def test_iterator_value_labels():
@@ -1983,3 +1980,18 @@ def test_iterator_value_labels():
for i in range(2):
tm.assert_index_equal(chunk.dtypes[i].categories, expected)
tm.assert_frame_equal(chunk, df.iloc[j * 100 : (j + 1) * 100])


def test_precision_loss():
df = DataFrame(
[[sum(2 ** i for i in range(60)), sum(2 ** i for i in range(52))]],
columns=["big", "little"],
)
with tm.ensure_clean() as path:
with tm.assert_produces_warning(PossiblePrecisionLoss):
df.to_stata(path, write_index=False)
reread = read_stata(path)
expected_dt = Series([np.float64, np.float64], index=["big", "little"])
tm.assert_series_equal(reread.dtypes, expected_dt)
assert reread.loc[0, "little"] == df.loc[0, "little"]
assert reread.loc[0, "big"] == float(df.loc[0, "big"])
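The last two assertions hinge on float64 rounding: "little" (2**52 - 1) fits in the 53-bit significand and round-trips exactly, while "big" (2**60 - 1) comes back as float(big). A quick check:

    big = sum(2**i for i in range(60))     # 2**60 - 1 needs 60 bits
    assert float(big) == 2**60             # rounds up to the nearest float64
    assert float(big) != big               # hence the comparison against float(...)
    little = sum(2**i for i in range(52))  # 2**52 - 1 fits exactly
    assert float(little) == little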
