-
-
Notifications
You must be signed in to change notification settings - Fork 19.2k
Description
Pandas version checks
-
I have checked that this issue has not already been reported.
-
I have confirmed this bug exists on the latest version of pandas.
-
I have confirmed this bug exists on the main branch of pandas.
Reproducible Example
import pandas as pd
import numpy as np
pd.set_option("mode.copy_on_write", True)
df = pd.DataFrame({'a': np.random.randint(1, 100, 2**31)}).astype({'a': 'UInt32'})
df.iloc[0] = pd.NA
mask = df['a']<200
df = df[mask]
>>>
---------------------------------------------------------------------------
OverflowError Traceback (most recent call last)
Cell In[5], line 6
4 df.iloc[0] = pd.NA
5 mask = df['a']<200
----> 6 df = df[mask]
File ~/miniconda3/envs/py3.10/lib/python3.10/site-packages/pandas/core/frame.py:3752, in DataFrame.__getitem__(self, key)
3750 # Do we have a (boolean) 1d indexer?
3751 if com.is_bool_indexer(key):
-> 3752 return self._getitem_bool_array(key)
3754 # We are left with two options: a single key, and a collection of keys,
3755 # We interpret tuples as collections only for non-MultiIndex
3756 is_single_key = isinstance(key, tuple) or not is_list_like(key)
File ~/miniconda3/envs/py3.10/lib/python3.10/site-packages/pandas/core/frame.py:3811, in DataFrame._getitem_bool_array(self, key)
3808 return self.copy(deep=None)
3810 indexer = key.nonzero()[0]
-> 3811 return self._take_with_is_copy(indexer, axis=0)
File ~/miniconda3/envs/py3.10/lib/python3.10/site-packages/pandas/core/generic.py:3948, in NDFrame._take_with_is_copy(self, indices, axis)
3940 def _take_with_is_copy(self: NDFrameT, indices, axis: Axis = 0) -> NDFrameT:
3941 """
3942 Internal version of the `take` method that sets the `_is_copy`
3943 attribute to keep track of the parent dataframe (using in indexing
(...)
3946 See the docstring of `take` for full explanation of the parameters.
3947 """
-> 3948 result = self._take(indices=indices, axis=axis)
3949 # Maybe set copy if we didn't actually change the index.
3950 if not result._get_axis(axis).equals(self._get_axis(axis)):
File ~/miniconda3/envs/py3.10/lib/python3.10/site-packages/pandas/core/generic.py:3928, in NDFrame._take(self, indices, axis, convert_indices)
3922 if not isinstance(indices, slice):
3923 indices = np.asarray(indices, dtype=np.intp)
3924 if (
3925 axis == 0
3926 and indices.ndim == 1
3927 and using_copy_on_write()
-> 3928 and is_range_indexer(indices, len(self))
3929 ):
3930 return self.copy(deep=None)
3932 new_data = self._mgr.take(
3933 indices,
3934 axis=self._get_block_manager_axis(axis),
3935 verify=True,
3936 convert_indices=convert_indices,
3937 )
File ~/miniconda3/envs/py3.10/lib/python3.10/site-packages/pandas/_libs/lib.pyx:654, in pandas._libs.lib.is_range_indexer()
OverflowError: value too large to convert to int
Issue Description
Here is the bug I encountered. Below, I run some tests to
hopefully facilitate the diagnosis of the issue.
As a short summary, the exception seems to be triggered by three conditions
- CoW enabled
- a nullable extended dtype (UInt32 in my case)
- a pd.NA value in the column.
Without CoW, there is no exception:
import pandas as pd
import numpy as np
pd.set_option("mode.copy_on_write", False)
df = pd.DataFrame({'a': np.random.randint(1, 100, 2**31)}).astype({'a': 'UInt32'})
df.iloc[0] = pd.NA
mask = df['a']<200
df = df[mask]
With CoW enabled, but only 2**30 rows, there is no exception:
pd.set_option("mode.copy_on_write", True)
df = pd.DataFrame({'a': np.random.randint(1, 100, 2**30)}).astype({'a': 'UInt32'})
df.iloc[0] = pd.NA
mask = df['a']<200
df = df[mask]
# no exception
With 2**31 rows, there is an OverflowError exception:
pd.set_option("mode.copy_on_write", True)
df = pd.DataFrame({'a': np.random.randint(1, 100, 2**31)}).astype({'a': 'UInt32'})
df.iloc[0] = pd.NA
mask = df['a']<200
df = df[mask] # here is the line that produces the exception.
# OverflowError exception
I also obtain an exception using df.sample(n=1_000_000)
Without pd.NA in the column, there is no exception
pd.set_option("mode.copy_on_write", True)
df = pd.DataFrame({'a': np.random.randint(1, 100, 2**31)}).astype({'a': 'UInt32'})
mask = df['a']<200
df = df[mask]
Expected Behavior
No exception.
Installed Versions
pandas : 2.0.2
numpy : 1.24.3
pytz : 2022.7
dateutil : 2.8.2
setuptools : 67.8.0
pip : 23.0.1
Cython : 0.29.33
pytest : None
hypothesis : None
sphinx : None
blosc : None
feather : None
xlsxwriter : None
lxml.etree : 4.9.2
html5lib : None
pymysql : None
psycopg2 : None
jinja2 : 3.1.2
IPython : 8.12.0
pandas_datareader: 0.10.0
bs4 : 4.12.2
bottleneck : 1.3.5
brotli :
fastparquet : None
fsspec : 2023.4.0
gcsfs : None
matplotlib : 3.7.1
numba : 0.57.0
numexpr : 2.8.4
odfpy : None
openpyxl : None
pandas_gbq : None
pyarrow : 11.0.0
pyreadstat : None
pyxlsb : None
s3fs : None
scipy : 1.10.1
snappy :
sqlalchemy : 1.4.39
tables : None
tabulate : 0.8.10
xarray : 2022.11.0
xlrd : None
zstandard : None
tzdata : 2023.3
qtpy : 2.2.0
pyqt5 : None