Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 2 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -343,13 +343,8 @@ def to_pandas(
raise NotImplementedError(f"{arrow_type=} is not supported.")

if self.categories.dtype.kind == "f":
col = type(self)(
data=self.data, # type: ignore[arg-type]
size=self.size,
dtype=self.dtype,
mask=self.notnull().fillna(False).as_mask(),
children=self.children,
)
new_mask = self.notnull().fillna(False).as_mask()
col = self.set_mask(new_mask)
else:
col = self

Expand Down
59 changes: 41 additions & 18 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -398,14 +398,22 @@ def set_mask(self, value) -> Self:
dbuf.copy_from_host(value)
mask = as_buffer(dbuf)

return build_column( # type: ignore[return-value]
data=self.data,
dtype=self.dtype,
mask=mask,
size=self.size,
offset=0,
children=self.children,
)
if mask is not None:
new_mask: plc.gpumemoryview | None = plc.gpumemoryview(mask)
new_null_count = plc.null_mask.null_count(
new_mask,
0,
self.size,
)
else:
new_mask = None
new_null_count = 0
new_plc_column = self.to_pylibcudf(
mode="read", use_base=False
).with_mask(new_mask, new_null_count)
return self.from_pylibcudf( # type: ignore[return-value]
new_plc_column,
)._with_type_metadata(self.dtype)

@property
def null_count(self) -> int:
Expand Down Expand Up @@ -488,7 +496,9 @@ def _mimic_inplace(
# underlying buffers as exposed before this function can itself be exposed
# publicly. User requests to convert to pylibcudf must assume that the
# data may be modified afterwards.
def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column:
def to_pylibcudf(
self, mode: Literal["read", "write"], *, use_base: bool = True
) -> plc.Column:
"""Convert this Column to a pylibcudf.Column.

This function will generate a pylibcudf Column pointing to the same
Expand All @@ -501,6 +511,9 @@ def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column:
to may be modified by the caller. If "read", the data pointed to
must not be modified by the caller. Failure to fulfill this
contract will cause incorrect behavior.
use_base : bool, default True
If True, use the column's base data, base mask, and base children
together with the column's offset. If False, use the column's data,
mask, and children with an offset of 0.

Returns
-------
Expand All @@ -522,29 +535,39 @@ def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column:

data = None
if col.base_data is not None:
if use_base:
data_buff = col.base_data
else:
data_buff = col.data # type: ignore[assignment]
cai = cuda_array_interface_wrapper(
ptr=col.base_data.get_ptr(mode=mode),
size=col.base_data.size,
owner=col.base_data,
ptr=data_buff.get_ptr(mode=mode),
size=data_buff.size,
owner=data_buff,
)
data = plc.gpumemoryview(cai)

mask = None
if self.nullable:
# TODO: Are we intentionally using self's mask instead of col's?
# Where is the mask stored for categoricals?
if use_base:
mask_buff = self.base_mask
else:
mask_buff = self.mask
cai = cuda_array_interface_wrapper(
ptr=self.base_mask.get_ptr(mode=mode), # type: ignore[union-attr]
size=self.base_mask.size, # type: ignore[union-attr]
owner=self.base_mask,
ptr=mask_buff.get_ptr(mode=mode), # type: ignore[union-attr]
size=mask_buff.size, # type: ignore[union-attr]
owner=mask_buff,
)
mask = plc.gpumemoryview(cai)

children = []
if col.base_children:
children = [
child_column.to_pylibcudf(mode=mode)
for child_column in col.base_children
child_column.to_pylibcudf(mode=mode, use_base=use_base)
for child_column in (
col.base_children if use_base else col.children
)
]

return plc.Column(
Expand All @@ -553,7 +576,7 @@ def to_pylibcudf(self, mode: Literal["read", "write"]) -> plc.Column:
data,
mask,
self.null_count,
self.offset,
self.offset if use_base else 0,
children,
)

Expand Down
7 changes: 2 additions & 5 deletions python/cudf/cudf/datasets.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
# Copyright (c) 2020-2024, NVIDIA CORPORATION.
# Copyright (c) 2020-2025, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
Expand Down Expand Up @@ -69,10 +69,7 @@ def timeseries(
size=len(index),
p=[1 - nulls_frequency, nulls_frequency],
)
mask_buf = cudf.core.column.as_column(mask).as_mask()
masked_col = gdf[col]._column.set_mask(mask_buf)
gdf[col] = cudf.Series._from_column(masked_col, index=gdf.index)

gdf.loc[mask, col] = None
return gdf


Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -198,7 +198,10 @@ def test_dataframe_reduction_error():


def test_mean_timeseries(numeric_only):
gdf = cudf.datasets.timeseries()
gdf = cudf.DataFrame(
{"a": ["a", "b", "c"], "b": range(3), "c": [-1.0, 12.2, 0.0]},
index=pd.date_range("2020-01-01", periods=3, name="timestamp"),
)
if not numeric_only:
gdf = gdf.select_dtypes(include="number")
pdf = gdf.to_pandas()
Expand Down