From a2fe30ce73ace914994c7bf98b394a7549a50268 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Fri, 26 Sep 2025 16:25:15 +0000
Subject: [PATCH 1/2] Fix copy semantics bugs

---
 .../cudf/cudf/core/buffer/spillable_buffer.py |  1 -
 python/cudf/cudf/core/column/column.py        | 32 ++++++++++++++-----
 python/cudf/cudf/core/column/numerical.py     |  2 +-
 python/cudf/cudf/core/dataframe.py            |  2 +-
 python/cudf/cudf/core/index.py                | 10 ++++--
 python/cudf/cudf/core/indexed_frame.py        |  2 +-
 .../cudf/pandas/scripts/conftest-patch.py     |  4 ---
 .../indexes/rangeindex/methods/test_rename.py |  3 +-
 8 files changed, 37 insertions(+), 19 deletions(-)

diff --git a/python/cudf/cudf/core/buffer/spillable_buffer.py b/python/cudf/cudf/core/buffer/spillable_buffer.py
index 25fbfd318a2..dda32008238 100644
--- a/python/cudf/cudf/core/buffer/spillable_buffer.py
+++ b/python/cudf/cudf/core/buffer/spillable_buffer.py
@@ -247,7 +247,6 @@ def mark_exposed(self) -> None:
 
         This also unspills the buffer (unspillable buffers cannot be spilled!).
         """
-
         self._manager.spill_to_device_limit()
         with self.lock:
             if not self.exposed:
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index 20415d9b6b7..a9b8daf507d 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -24,6 +24,7 @@
 import rmm
 
 import cudf
+from cudf.api.extensions import no_default
 from cudf.api.types import (
     _is_categorical_dtype,
     infer_dtype,
@@ -612,7 +613,9 @@ def from_pylibcudf(
         )
 
     @classmethod
-    def from_cuda_array_interface(cls, arbitrary: Any) -> Self:
+    def from_cuda_array_interface(
+        cls, arbitrary: Any, data_ptr_exposed=no_default
+    ) -> Self:
         """
         Create a Column from an object implementing the CUDA array interface.
 
@@ -647,9 +650,11 @@ def from_cuda_array_interface(cls, arbitrary: Any) -> Self:
         else:
             mask = None
 
+        if data_ptr_exposed is no_default:
+            data_ptr_exposed = cudf.get_option("copy_on_write")
         column = ColumnBase.from_pylibcudf(
             plc.Column.from_cuda_array_interface(arbitrary),
-            data_ptr_exposed=cudf.get_option("copy_on_write"),
+            data_ptr_exposed=data_ptr_exposed,
         )
         if mask is not None:
             column = column.set_mask(mask)
@@ -2755,12 +2760,7 @@ def as_column(
             return arbitrary.astype(dtype)
         return arbitrary
     elif hasattr(arbitrary, "__cuda_array_interface__"):
-        column = ColumnBase.from_cuda_array_interface(arbitrary)
-        if nan_as_null is not False:
-            column = column.nans_to_nulls()
-        if dtype is not None:
-            column = column.astype(dtype)
-        return column
+        return _column_from_cuda_array_interface(arbitrary, nan_as_null, dtype)
     elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
         column = ColumnBase.from_arrow(arbitrary)
         if nan_as_null is not False:
@@ -2886,6 +2886,9 @@ def as_column(
                 arbitrary = np.asarray(arbitrary)
             else:
                 arbitrary = cp.asarray(arbitrary)
+                return _column_from_cuda_array_interface(
+                    arbitrary, nan_as_null, dtype, data_ptr_exposed=False
+                )
             return as_column(
                 arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
             )
@@ -3253,6 +3256,19 @@ def as_column(
         return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
 
 
+def _column_from_cuda_array_interface(
+    arbitrary, nan_as_null, dtype, data_ptr_exposed=no_default
+):
+    column = ColumnBase.from_cuda_array_interface(
+        arbitrary, data_ptr_exposed=data_ptr_exposed
+    )
+    if nan_as_null is not False:
+        column = column.nans_to_nulls()
+    if dtype is not None:
+        column = column.astype(dtype)
+    return column
+
+
 def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]:
     """
     Return the headers and frames resulting
diff --git a/python/cudf/cudf/core/column/numerical.py b/python/cudf/cudf/core/column/numerical.py
index c0ebc3343a1..0ad54b185ad 100644
--- a/python/cudf/cudf/core/column/numerical.py
+++ b/python/cudf/cudf/core/column/numerical.py
@@ -567,7 +567,7 @@ def as_numerical_column(self, dtype: Dtype) -> NumericalColumn:
                     # If the dtype is a pandas nullable extension type, we need to
                     # float column doesn't have any NaNs.
                     res = self.nans_to_nulls()
-                    res._dtype = dtype
+                    res._dtype = dtype  # type: ignore[has-type]
                     return res
                 else:
                     self._dtype = dtype
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index afc46e5ace3..29c67eec8db 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3222,7 +3222,7 @@ def set_index(
             # label-like
             if is_scalar(col) or isinstance(col, tuple):
                 if col in self._column_names:
-                    data_to_add.append(self[col]._column)
+                    data_to_add.append(self[col]._column.copy(deep=True))
                     names.append(col)
                     if drop:
                         to_drop.append(col)
diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
index 017099636aa..27ef00d8376 100644
--- a/python/cudf/cudf/core/index.py
+++ b/python/cudf/cudf/core/index.py
@@ -1949,7 +1949,7 @@ def copy(self, name: Hashable = None, deep: bool = False) -> Self:
         New index instance.
         """
         name = self.name if name is None else name
-        col = self._column.copy(deep=True) if deep else self._column
+        col = self._column.copy(deep=deep)
         return type(self)._from_column(col, name=name)
 
     @_performance_tracking
@@ -5505,7 +5505,13 @@ def _as_index(
             )
         return data.copy(deep=copy)
     elif isinstance(data, Index):
-        idx = data.copy(deep=copy).rename(name)
+        if not isinstance(data, cudf.RangeIndex):
+            idx = type(data)._from_column(
+                data._column.copy(deep=copy) if copy else data._column,
+                name=name,
+            )
+        else:
+            idx = data.copy(deep=copy).rename(name)
     elif isinstance(data, ColumnBase):
         raise ValueError("Use cudf.Index._from_column instead.")
     elif isinstance(data, (pd.RangeIndex, range)):
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 46d407606df..af849022054 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -620,7 +620,7 @@ def copy(self, deep: bool = True) -> Self:
         return self._from_data(
             self._data.copy(deep=deep),
             # Indexes are immutable so copies can always be shallow.
-            self.index.copy(deep=False),
+            self.index.copy(deep=deep),
             attrs=copy.deepcopy(self.attrs) if deep else self._attrs,
         )
 
diff --git a/python/cudf/cudf/pandas/scripts/conftest-patch.py b/python/cudf/cudf/pandas/scripts/conftest-patch.py
index 61f085759a7..356c28474ae 100644
--- a/python/cudf/cudf/pandas/scripts/conftest-patch.py
+++ b/python/cudf/cudf/pandas/scripts/conftest-patch.py
@@ -6741,10 +6741,6 @@ def pytest_unconfigure(config):
     "tests/indexes/test_common.py::TestCommon::test_to_frame[uint8-new_name]",
     "tests/indexes/test_common.py::test_ndarray_compat_properties[multi]",
     "tests/indexes/test_common.py::test_ndarray_compat_properties[tuples]",
-    "tests/indexes/test_common.py::test_sort_values_invalid_na_position[nullable_int-None]",
-    "tests/indexes/test_common.py::test_sort_values_invalid_na_position[nullable_int-middle]",
-    "tests/indexes/test_common.py::test_sort_values_with_missing[nullable_int-first]",
-    "tests/indexes/test_common.py::test_sort_values_with_missing[nullable_int-last]",
     "tests/indexes/test_datetimelike.py::TestDatetimeLike::test_argsort_matches_array[simple_index1]",
     "tests/indexes/test_datetimelike.py::TestDatetimeLike::test_argsort_matches_array[simple_index2]",
     "tests/indexes/test_indexing.py::TestGetIndexer::test_get_indexer_base[multi]",
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
index 01841d11647..9c563f01b72 100644
--- a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
@@ -3,6 +3,7 @@
 import pandas as pd
 
 import cudf
+from cudf.testing import assert_eq
 
 
 def test_rename_shallow_copy():
@@ -12,4 +13,4 @@ def test_rename_shallow_copy():
 
     idx = cudf.Index([1])
     result = idx.rename("a")
-    assert idx._column is result._column
+    assert_eq(idx._column, result._column)

From 13e5480e414a52664e19205105748476aa216e75 Mon Sep 17 00:00:00 2001
From: galipremsagar <sagarprem75@gmail.com>
Date: Mon, 29 Sep 2025 18:11:11 +0000
Subject: [PATCH 2/2] Address reviews

---
 python/cudf/cudf/core/column/column.py        | 31 +++++++++----------
 python/cudf/cudf/core/dataframe.py            |  5 ++-
 python/cudf/cudf/core/indexed_frame.py        |  1 -
 .../indexes/rangeindex/methods/test_rename.py |  5 +--
 4 files changed, 22 insertions(+), 20 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index a9b8daf507d..24ab59e8293 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2760,7 +2760,12 @@ def as_column(
             return arbitrary.astype(dtype)
         return arbitrary
     elif hasattr(arbitrary, "__cuda_array_interface__"):
-        return _column_from_cuda_array_interface(arbitrary, nan_as_null, dtype)
+        column = ColumnBase.from_cuda_array_interface(arbitrary)
+        if nan_as_null is not False:
+            column = column.nans_to_nulls()
+        if dtype is not None:
+            column = column.astype(dtype)
+        return column
     elif isinstance(arbitrary, (pa.Array, pa.ChunkedArray)):
         column = ColumnBase.from_arrow(arbitrary)
         if nan_as_null is not False:
@@ -2886,9 +2891,16 @@ def as_column(
                 arbitrary = np.asarray(arbitrary)
             else:
                 arbitrary = cp.asarray(arbitrary)
-                return _column_from_cuda_array_interface(
-                    arbitrary, nan_as_null, dtype, data_ptr_exposed=False
+                # Explicitly passing `data_ptr_exposed` to
+                # reuse existing memory created by cupy here
+                column = ColumnBase.from_cuda_array_interface(
+                    arbitrary, data_ptr_exposed=False
                 )
+                if nan_as_null is not False:
+                    column = column.nans_to_nulls()
+                if dtype is not None:
+                    column = column.astype(dtype)
+                return column
             return as_column(
                 arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
             )
@@ -3256,19 +3268,6 @@ def as_column(
         return as_column(arbitrary, nan_as_null=nan_as_null, dtype=dtype)
 
 
-def _column_from_cuda_array_interface(
-    arbitrary, nan_as_null, dtype, data_ptr_exposed=no_default
-):
-    column = ColumnBase.from_cuda_array_interface(
-        arbitrary, data_ptr_exposed=data_ptr_exposed
-    )
-    if nan_as_null is not False:
-        column = column.nans_to_nulls()
-    if dtype is not None:
-        column = column.astype(dtype)
-    return column
-
-
 def serialize_columns(columns: list[ColumnBase]) -> tuple[list[dict], list]:
     """
     Return the headers and frames resulting
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 29c67eec8db..848563b8384 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -3222,7 +3222,10 @@ def set_index(
             # label-like
             if is_scalar(col) or isinstance(col, tuple):
                 if col in self._column_names:
-                    data_to_add.append(self[col]._column.copy(deep=True))
+                    if drop and inplace:
+                        data_to_add.append(self[col]._column)
+                    else:
+                        data_to_add.append(self[col]._column.copy(deep=True))
                     names.append(col)
                     if drop:
                         to_drop.append(col)
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index af849022054..35a2d3170bb 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -619,7 +619,6 @@ def copy(self, deep: bool = True) -> Self:
         """
         return self._from_data(
             self._data.copy(deep=deep),
-            # Indexes are immutable so copies can always be shallow.
             self.index.copy(deep=deep),
             attrs=copy.deepcopy(self.attrs) if deep else self._attrs,
         )
diff --git a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
index 9c563f01b72..3a8bd36fd96 100644
--- a/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
+++ b/python/cudf/cudf/tests/indexes/rangeindex/methods/test_rename.py
@@ -3,7 +3,6 @@
 import pandas as pd
 
 import cudf
-from cudf.testing import assert_eq
 
 
 def test_rename_shallow_copy():
@@ -13,4 +12,6 @@ def test_rename_shallow_copy():
 
     idx = cudf.Index([1])
     result = idx.rename("a")
-    assert_eq(idx._column, result._column)
+    assert idx._column.base_data.get_ptr(
+        mode="read"
+    ) == result._column.base_data.get_ptr(mode="read")