Skip to content
Open
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 4 additions & 3 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -2129,12 +2129,13 @@ def sanitize_objects(ndarray[object] values, set na_values) -> int:

for i in range(n):
val = values[i]
memo_key = (val, type(val))
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Perf impact? I suspect this hashing is slower

Copy link
Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Measured the performance by sanitizing 5 million values:

import time
import numpy as np
import contextlib

@contextlib.contextmanager
def log_duration(title):
    """Context manager that prints how long the wrapped block took.

    Parameters
    ----------
    title : str
        Label printed alongside the elapsed wall-clock time.
    """
    begin = time.perf_counter()
    try:
        yield
    finally:
        # perf_counter gives a monotonic clock suited to interval timing.
        elapsed = time.perf_counter() - begin
        print(f"{title}: {elapsed:.4f} seconds")

million = 10**6

def get_synthetic_data(count=10**6):
    """Build a shuffled object-dtype array mixing five kinds of values.

    Parameters
    ----------
    count : int, default 10**6
        Number of values of each kind (int, float, str, bool, NA marker);
        the result has ``5 * count`` elements.

    Returns
    -------
    numpy.ndarray
        Object-dtype array of length ``5 * count``, deterministically
        shuffled (seed 42) so repeated calls yield the same order.
    """
    int_values = np.array(list(range(count)), dtype=object)
    float_values = np.array([float(i) for i in range(count)], dtype=object)
    str_values = np.array([str(i) for i in range(count)], dtype=object)
    bool_values = np.array([i % 2 == 0 for i in range(count)], dtype=object)
    # "NA" is the sentinel string passed to the sanitizers' na_values set.
    na_marker_values = np.array(["NA"] * count, dtype=object)

    mixed_values = np.empty(count * 5, dtype=object)
    mixed_values[0::5] = int_values
    mixed_values[1::5] = float_values
    mixed_values[2::5] = str_values
    mixed_values[3::5] = bool_values
    mixed_values[4::5] = na_marker_values
    # Fixed seed keeps benchmark runs comparable across implementations.
    np.random.seed(42)
    np.random.shuffle(mixed_values)
    return mixed_values

values = get_synthetic_data()

with log_duration("sanitize_objects_old"):
    sanitize_objects_old(values,na_values={"NA"})

values = get_synthetic_data()

with log_duration("sanitize_objects_include_type_in_memo_key"):
    sanitize_objects(values,na_values={"NA"})

values = get_synthetic_data()

with log_duration("sanitize_objects_skip_bool"):
    sanitize_objects_skip_bool(values,na_values={"NA"})

Output

sanitize_objects_old: 1.5880 seconds
sanitize_objects_include_type_in_memo_key: 1.9344 seconds
sanitize_objects_skip_bool: 1.6926 seconds

Yes, you are right — including the type in the memo key is about 20% slower, so skipping those 4 values seems the better option. Python automatically reuses the same object references for small ints and bools, so skipping them will not increase the memory footprint even if the data is a mixture of only those 4 values.

if val in na_values:
values[i] = onan
na_count += 1
elif val in memo:
values[i] = memo[val]
elif memo_key in memo:
values[i] = memo[memo_key]
else:
memo[val] = val
memo[memo_key] = val

return na_count
11 changes: 11 additions & 0 deletions pandas/tests/io/parser/common/test_common_basic.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@

from pandas._config import using_string_dtype

from pandas._libs import parsers as libparsers
from pandas.compat import HAS_PYARROW
from pandas.errors import (
EmptyDataError,
Expand Down Expand Up @@ -830,3 +831,13 @@ def test_read_seek(all_parsers):
actual = parser.read_csv(file)
expected = parser.read_csv(StringIO(content))
tm.assert_frame_equal(actual, expected)


def test_dtype_conversion_in_sanitization():
    # GH60088: sanitize_objects must not collapse 1 and True (equal under
    # ``==`` but of different types) into a single memoized object.
    values = np.array([1, True], dtype=object)
    expected = np.array([1, True], dtype=object)
    libparsers.sanitize_objects(values, na_values=set())
    for v, e in zip(values, expected):
        assert v == e
        # Compare type identity with ``is`` (idiomatic; ruff E721),
        # since ``1 == True`` would pass the equality check above.
        assert type(v) is type(e)
Loading