From a2fe30ce73ace914994c7bf98b394a7549a50268 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Fri, 26 Sep 2025 16:25:15 +0000 Subject: [PATCH 1/2] Fix copy semantics bugs --- .../cudf/cudf/core/buffer/spillable_buffer.py | 1 - python/cudf/cudf/core/column/column.py | 32 ++++++++++++++----- python/cudf/cudf/core/column/numerical.py | 2 +- python/cudf/cudf/core/dataframe.py | 2 +- python/cudf/cudf/core/index.py | 10 ++++-- python/cudf/cudf/core/indexed_frame.py | 2 +- .../cudf/pandas/scripts/conftest-patch.py | 4 --- .../indexes/rangeindex/methods/test_rename.py | 3 +- 8 files changed, 37 insertions(+), 19 deletions(-) diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py index 25fbfd318a2..dda32008238 100644 --- a/python/cudf/cudf/core/buffer/spillable_buffer.py +++ b/python/cudf/cudf/core/buffer/spillable_buffer.py @@ -247,7 +247,6 @@ def mark_exposed(self) -> None: This also unspills the buffer (unspillable buffers cannot be spilled!). """ - self._manager.spill_to_device_limit() with self.lock: if not self.exposed: diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 20415d9b6b7..a9b8daf507d 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -24,6 +24,7 @@ import rmm import cudf +from cudf.api.extensions import no_default from cudf.api.types import ( _is_categorical_dtype, infer_dtype, @@ -612,7 +613,9 @@ def from_pylibcudf( ) @classmethod - def from_cuda_array_interface(cls, arbitrary: Any) -> Self: + def from_cuda_array_interface( + cls, arbitrary: Any, data_ptr_exposed=no_default + ) -> Self: """ Create a Column from an object implementing the CUDA array interface. @@ -647,9 +650,11 @@ def from_cuda_array_interface(cls, arbitrary: Any) -> Self: else: mask = None + if data_ptr_exposed is no_default: + data_ptr_exposed = cudf.get_option("copy_on_write") column = ColumnBase.from_pylibcudf( plc.Column.from_cuda_array_interface(arbitrary), - data_ptr_exposed=cudf.get_option("copy_on_write"), + data_ptr_exposed=data_ptr_exposed, ) if mask is not None: column = column.set_mask(mask) @@ -2755,12 +2760,7 @@ def as_column( return arbitrary.astype(dtype) return arbitrary elif hasattr(arbitrary, "__cuda_array_interface__"): - column = ColumnBase.from_cuda_array_interface(arbitrary) - if nan_as_null is not False: - column = column.nans_to_nulls() - if dtype is not None: - column = column.astype(dtype) - return column + return _column_from_cuda_array_interface(arbitrary, nan_as_null, dtype) elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): column = ColumnBase.from_arrow(arbitrary) if nan_as_null is not False: @@ -2886,6 +2886,9 @@ def as_column( arbitrary = np.asarray(arbitrary) else: arbitrary = cp.asarray(arbitrary) + return _column_from_cuda_array_interface( + arbitrary, nan_as_null, dtype, data_ptr_exposed=False + ) return as_column( arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length ) @@ -3253,6 +3256,19 @@ def as_column( return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) +def _column_from_cuda_array_interface( + arbitrary, nan_as_null, dtype, data_ptr_exposed=no_default +): + column = ColumnBase.from_cuda_array_interface( + arbitrary, data_ptr_exposed=data_ptr_exposed + ) + if nan_as_null is not False: + column = column.nans_to_nulls() + if dtype is not None: + column = column.astype(dtype) + return column + + def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: """ Return the headers and frames resulting diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py index c0ebc3343a1..0ad54b185ad 100644 --- a/python/cudf/cudf/core/column/numerical.py +++ b/python/cudf/cudf/core/column/numerical.py @@ -567,7 +567,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn: # If the dtype is a pandas nullable extension type, we need to # float column doesn't have any NaNs. res = self.nans_to_nulls() - res._dtype = dtype + res._dtype = dtype # type: ignore[has-type] return res else: self._dtype = dtype diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index afc46e5ace3..29c67eec8db 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3222,7 +3222,7 @@ def set_index( # label-like if is_scalar(col) or isinstance(col, tuple): if col in self._column_names: - data_to_add.append(self[col]._column) + data_to_add.append(self[col]._column.copy(deep=True)) names.append(col) if drop: to_drop.append(col) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 017099636aa..27ef00d8376 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -1949,7 +1949,7 @@ def copy(self, name: Hashable = None, deep: bool = False) -> Self: New index instance. """ name = self.name if name is None else name - col = self._column.copy(deep=True) if deep else self._column + col = self._column.copy(deep=deep) return type(self)._from_column(col, name=name) @_performance_tracking @@ -5505,7 +5505,13 @@ def _as_index( ) return data.copy(deep=copy) elif isinstance(data, Index): - idx = data.copy(deep=copy).rename(name) + if not isinstance(data, cudf.RangeIndex): + idx = type(data)._from_column( + data._column.copy(deep=copy) if copy else data._column, + name=name, + ) + else: + idx = data.copy(deep=copy).rename(name) elif isinstance(data, ColumnBase): raise ValueError("Use cudf.Index._from_column instead.") elif isinstance(data, (pd.RangeIndex, range)): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 46d407606df..af849022054 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -620,7 +620,7 @@ def copy(self, deep: bool = True) -> Self: return self._from_data( self._data.copy(deep=deep), # Indexes are immutable so copies can always be shallow. - self.index.copy(deep=False), + self.index.copy(deep=deep), attrs=copy.deepcopy(self.attrs) if deep else self._attrs, ) diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py index 61f085759a7..356c28474ae 100644 --- a/python/cudf/cudf/pandas/scripts/conftest-patch.py +++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py @@ -6741,10 +6741,6 @@ def pytest_unconfigure(config): "tests/indexes/test_common.py::TestCommon::test_to_frame[uint8-new_name]", "tests/indexes/test_common.py::test_ndarray_compat_properties[multi]", "tests/indexes/test_common.py::test_ndarray_compat_properties[tuples]", - "tests/indexes/test_common.py::test_sort_values_invalid_na_position[nullable_int-None]", - "tests/indexes/test_common.py::test_sort_values_invalid_na_position[nullable_int-middle]", - "tests/indexes/test_common.py::test_sort_values_with_missing[nullable_int-first]", - "tests/indexes/test_common.py::test_sort_values_with_missing[nullable_int-last]", "tests/indexes/test_datetimelike.py::TestDatetimeLike::test_argsort_matches_array[simple_index1]", "tests/indexes/test_datetimelike.py::TestDatetimeLike::test_argsort_matches_array[simple_index2]", "tests/indexes/test_indexing.py::TestGetIndexer::test_get_indexer_base[multi]", diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py index 01841d11647..9c563f01b72 100644 --- a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py @@ -3,6 +3,7 @@ import pandas as pd import cudf +from cudf.testing import assert_eq def test_rename_shallow_copy(): @@ -12,4 +13,4 @@ def test_rename_shallow_copy(): idx = cudf.Index([1]) result = idx.rename("a") - assert idx._column is result._column + assert_eq(idx._column, result._column) From 13e5480e414a52664e19205105748476aa216e75 Mon Sep 17 00:00:00 2001 From: galipremsagar Date: Mon, 29 Sep 2025 18:11:11 +0000 Subject: [PATCH 2/2] Address reviews --- python/cudf/cudf/core/column/column.py | 31 +++++++++---------- python/cudf/cudf/core/dataframe.py | 5 ++- python/cudf/cudf/core/indexed_frame.py | 1 - .../indexes/rangeindex/methods/test_rename.py | 5 +-- 4 files changed, 22 insertions(+), 20 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a9b8daf507d..24ab59e8293 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2760,7 +2760,12 @@ def as_column( return arbitrary.astype(dtype) return arbitrary elif hasattr(arbitrary, "__cuda_array_interface__"): - return _column_from_cuda_array_interface(arbitrary, nan_as_null, dtype) + column = ColumnBase.from_cuda_array_interface(arbitrary) + if nan_as_null is not False: + column = column.nans_to_nulls() + if dtype is not None: + column = column.astype(dtype) + return column elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)): column = ColumnBase.from_arrow(arbitrary) if nan_as_null is not False: @@ -2886,9 +2891,16 @@ def as_column( arbitrary = np.asarray(arbitrary) else: arbitrary = cp.asarray(arbitrary) - return _column_from_cuda_array_interface( - arbitrary, nan_as_null, dtype, data_ptr_exposed=False + # Explicitly passing `data_ptr_exposed` to + # reuse existing memory created by cupy here + column = ColumnBase.from_cuda_array_interface( + arbitrary, data_ptr_exposed=False ) + if nan_as_null is not False: + column = column.nans_to_nulls() + if dtype is not None: + column = column.astype(dtype) + return column return as_column( arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length ) @@ -3256,19 +3268,6 @@ def as_column( return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype) -def _column_from_cuda_array_interface( - arbitrary, nan_as_null, dtype, data_ptr_exposed=no_default -): - column = ColumnBase.from_cuda_array_interface( - arbitrary, data_ptr_exposed=data_ptr_exposed - ) - if nan_as_null is not False: - column = column.nans_to_nulls() - if dtype is not None: - column = column.astype(dtype) - return column - - def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]: """ Return the headers and frames resulting diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 29c67eec8db..848563b8384 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -3222,7 +3222,10 @@ def set_index( # label-like if is_scalar(col) or isinstance(col, tuple): if col in self._column_names: - data_to_add.append(self[col]._column.copy(deep=True)) + if drop and inplace: + data_to_add.append(self[col]._column) + else: + data_to_add.append(self[col]._column.copy(deep=True)) names.append(col) if drop: to_drop.append(col) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index af849022054..35a2d3170bb 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -619,7 +619,6 @@ def copy(self, deep: bool = True) -> Self: """ return self._from_data( self._data.copy(deep=deep), - # Indexes are immutable so copies can always be shallow. self.index.copy(deep=deep), attrs=copy.deepcopy(self.attrs) if deep else self._attrs, ) diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py index 9c563f01b72..3a8bd36fd96 100644 --- a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py +++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py @@ -3,7 +3,6 @@ import pandas as pd import cudf -from cudf.testing import assert_eq def test_rename_shallow_copy(): @@ -13,4 +12,6 @@ def test_rename_shallow_copy(): idx = cudf.Index([1]) result = idx.rename("a") - assert_eq(idx._column, result._column) + assert idx._column.base_data.get_ptr( + mode="read" + ) == result._column.base_data.get_ptr(mode="read")