Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
be21b2e
perf: verify for float numbers during tokenization
Alvaro-Kothe Oct 8, 2025
fc10a5f
fix: try other dtypes instead of skipping to float64
Alvaro-Kothe Oct 8, 2025
ab2fab8
fix: don't throw error when casting is expected
Alvaro-Kothe Oct 8, 2025
7e8033d
fix: fix tuple error
Alvaro-Kothe Oct 8, 2025
5219386
fix: remove decimal_separator argument
Alvaro-Kothe Oct 8, 2025
4ff07e3
fix: early return on overflow, but still check next chars
Alvaro-Kothe Oct 8, 2025
c7fc292
fix: don't flag int with trailing whitespace as invalid
Alvaro-Kothe Oct 8, 2025
4c8d770
chore: better error message
Alvaro-Kothe Oct 8, 2025
35f075a
docs: document function to check for invalid character
Alvaro-Kothe Oct 8, 2025
448f944
Merge branch 'main' into perf/read-csv
Alvaro-Kothe Oct 9, 2025
cf0a26d
fix: make `check_for_invalid_char` return status code
Alvaro-Kothe Oct 9, 2025
2e5a47c
refactor: add TokenizerError enum
Alvaro-Kothe Oct 9, 2025
ca32c01
refactor: assign error in if-else block
Alvaro-Kothe Oct 9, 2025
46c9883
docs: update documentation for TokenizerError
Alvaro-Kothe Oct 9, 2025
69c35ee
Merge remote-tracking branch 'upstream/main' into perf/read-csv
Alvaro-Kothe Oct 16, 2025
40983dd
fix: ignore overflow error if encounter invalid char
Alvaro-Kothe Oct 17, 2025
00be2c2
chore: improve flag variable name
Alvaro-Kothe Oct 17, 2025
06297b6
refactor: simplify diff
Alvaro-Kothe Oct 17, 2025
4f6c9a8
fix: remove unnecessary branch
Alvaro-Kothe Oct 17, 2025
832d99e
chore: remove confusing comment
Alvaro-Kothe Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/_libs/include/pandas/parser/pd_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ typedef struct {
int (*parser_trim_buffers)(parser_t *);
int (*tokenize_all_rows)(parser_t *, const char *);
int (*tokenize_nrows)(parser_t *, size_t, const char *);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, TokenizerError *,
char);
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
int *, char);
TokenizerError *, char);
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
int *);
Expand Down
15 changes: 9 additions & 6 deletions pandas/_libs/include/pandas/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ See LICENSE for the license
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3

#include <stdint.h>

#define STREAM_INIT_SIZE 32
Expand Down Expand Up @@ -50,6 +46,13 @@ See LICENSE for the license
* duplication of some file I/O.
*/

typedef enum {
TOKENIZER_OK,
ERROR_NO_DIGITS,
ERROR_OVERFLOW,
ERROR_INVALID_CHARS,
} TokenizerError;

typedef enum {
START_RECORD,
START_FIELD,
Expand Down Expand Up @@ -209,9 +212,9 @@ void uint_state_init(uint_state *self);
int uint64_conflict(uint_state *self);

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep);
uint64_t uint_max, TokenizerError *error, char tsep);
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
TokenizerError *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
Expand Down
138 changes: 50 additions & 88 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,10 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
# Mirror of the TokenizerError enum declared in
# pandas/parser/tokenizer.h.  Only the members referenced from this
# module are declared; ERROR_NO_DIGITS exists in the C header but is
# unused here, which Cython's extern enum declarations permit.
ctypedef enum TokenizerError:
    TOKENIZER_OK,
    ERROR_OVERFLOW,
    ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -282,9 +285,9 @@ cdef extern from "pandas/parser/pd_parser.h":
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep) nogil
int64_t int_max, TokenizerError *error, char tsep) nogil
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) nogil
uint64_t uint_max, TokenizerError *error, char tsep) nogil

double xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
Expand Down Expand Up @@ -1058,7 +1061,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1069,30 +1072,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1147,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1188,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1358,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
                            int64_t start, int64_t end,
                            bint na_filter, kh_str_starts_t *na_hashset):
    """
    Check if the column contains any float number.

    Scans every word of column `col` over rows [start, end).  A word is
    considered float-indicating if it contains the parser's decimal
    separator, or an 'e'/'E' exponent marker preceded only by numeric
    characters.  Words matching the NA set are skipped when na_filter is
    set.  Returns True on the first float-looking word; returns False as
    soon as any word is seen to be non-numeric, or when the scan ends.
    """
    cdef:
        Py_ssize_t i, j, lines = end - start
        coliter_t it
        const char *word = NULL
        # Sign/space characters allowed before the first digit.
        const char *ignored_chars = " +-"
        const char *digits = "0123456789"
        # Exponent markers: numeric prefix + e/E implies a float.
        const char *float_indicating_chars = "eE"
        char null_byte = 0

    coliter_setup(&it, self.parser, col, start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        # Skip words recognized as NA values — they say nothing about dtype.
        if na_filter and kh_get_str_starts_item(na_hashset, word):
            continue

        found_first_digit = False
        j = 0
        # Walk the NUL-terminated word one character at a time.
        while word[j] != null_byte:
            if word[j] == self.parser.decimal:
                # Decimal separator anywhere marks the column as float.
                return True
            elif not found_first_digit and word[j] in ignored_chars:
                # Leading sign/space before any digit — ignore.
                # no-op
                pass
            elif not found_first_digit and word[j] not in digits:
                # word isn't numeric
                return False
            elif not found_first_digit and word[j] in digits:
                found_first_digit = True
            elif word[j] in float_indicating_chars:
                # preceding chars indicates numeric and
                # current char indicates float
                return True
            elif word[j] not in digits:
                # previous characters indicates numeric
                # current character shows otherwise
                return False
            elif word[j] in digits:
                # Digit after digits — keep scanning.
                # no-op
                pass
            else:
                # Defensive: the branches above are intended to be exhaustive.
                raise AssertionError(
                    f"Unhandled case {word[j]=} {found_first_digit=}"
                )
            j += 1

    return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1754,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1777,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand All @@ -1839,7 +1797,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
const kh_str_starts_t *na_hashset,
uint64_t *data, uint_state *state) nogil:
cdef:
int error
TokenizerError error = TOKENIZER_OK
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
Expand Down Expand Up @@ -1872,9 +1830,10 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
TokenizerError error = TOKENIZER_OK
int na_count = 0
Py_ssize_t lines
coliter_t it
int64_t *data
Expand All @@ -1892,18 +1851,21 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None, None

return result, na_count


cdef int _try_int64_nogil(parser_t *parser, int64_t col,
int64_t line_start,
int64_t line_end, bint na_filter,
const kh_str_starts_t *na_hashset, int64_t NA,
int64_t *data, int *na_count) nogil:
cdef TokenizerError _try_int64_nogil(parser_t *parser, int64_t col,
int64_t line_start,
int64_t line_end, bint na_filter,
const kh_str_starts_t *na_hashset, int64_t NA,
int64_t *data, int *na_count) nogil:
cdef:
int error
TokenizerError error = TOKENIZER_OK
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
Expand All @@ -1922,17 +1884,17 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,

data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
if error != TOKENIZER_OK:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pattern would definitely be cleaner with a macro to return of non-zero (in a follow up PR is fine)

return error
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
if error != TOKENIZER_OK:
return error

return 0
return error

cdef _try_pylong(parser_t *parser, Py_ssize_t col,
int64_t line_start, int64_t line_end,
Expand Down
Loading
Loading