Skip to content
Merged
Show file tree
Hide file tree
Changes from 14 commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
be21b2e
perf: verify for float numbers during tokenization
Alvaro-Kothe Oct 8, 2025
fc10a5f
fix: try other dtypes instead of skipping to float64
Alvaro-Kothe Oct 8, 2025
ab2fab8
fix: don't throw error when casting is expected
Alvaro-Kothe Oct 8, 2025
7e8033d
fix: fix tuple error
Alvaro-Kothe Oct 8, 2025
5219386
fix: remove decimal_separator argument
Alvaro-Kothe Oct 8, 2025
4ff07e3
fix: early return on overflow, but still check next chars
Alvaro-Kothe Oct 8, 2025
c7fc292
fix: don't flag int with trailing whitespace as invalid
Alvaro-Kothe Oct 8, 2025
4c8d770
chore: better error message
Alvaro-Kothe Oct 8, 2025
35f075a
docs: document function to check for invalid character
Alvaro-Kothe Oct 8, 2025
448f944
Merge branch 'main' into perf/read-csv
Alvaro-Kothe Oct 9, 2025
cf0a26d
fix: make `check_for_invalid_char` return status code
Alvaro-Kothe Oct 9, 2025
2e5a47c
refactor: add TokenizerError enum
Alvaro-Kothe Oct 9, 2025
ca32c01
refactor: assign error in if-else block
Alvaro-Kothe Oct 9, 2025
46c9883
docs: update documentation for TokenizerError
Alvaro-Kothe Oct 9, 2025
69c35ee
Merge remote-tracking branch 'upstream/main' into perf/read-csv
Alvaro-Kothe Oct 16, 2025
40983dd
fix: ignore overflow error if encounter invalid char
Alvaro-Kothe Oct 17, 2025
00be2c2
chore: improve flag variable name
Alvaro-Kothe Oct 17, 2025
06297b6
refactor: simplify diff
Alvaro-Kothe Oct 17, 2025
4f6c9a8
fix: remove unnecessary branch
Alvaro-Kothe Oct 17, 2025
832d99e
chore: remove confusing comment
Alvaro-Kothe Oct 20, 2025
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 3 additions & 2 deletions pandas/_libs/include/pandas/parser/pd_parser.h
Original file line number Diff line number Diff line change
Expand Up @@ -37,9 +37,10 @@ typedef struct {
int (*parser_trim_buffers)(parser_t *);
int (*tokenize_all_rows)(parser_t *, const char *);
int (*tokenize_nrows)(parser_t *, size_t, const char *);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, int *, char);
int64_t (*str_to_int64)(const char *, int64_t, int64_t, TokenizerError *,
char);
uint64_t (*str_to_uint64)(uint_state *, const char *, int64_t, uint64_t,
int *, char);
TokenizerError *, char);
double (*xstrtod)(const char *, char **, char, char, char, int, int *, int *);
double (*precise_xstrtod)(const char *, char **, char, char, char, int, int *,
int *);
Expand Down
15 changes: 9 additions & 6 deletions pandas/_libs/include/pandas/parser/tokenizer.h
Original file line number Diff line number Diff line change
Expand Up @@ -14,10 +14,6 @@ See LICENSE for the license
#define PY_SSIZE_T_CLEAN
#include <Python.h>

#define ERROR_NO_DIGITS 1
#define ERROR_OVERFLOW 2
#define ERROR_INVALID_CHARS 3

#include <stdint.h>

#define STREAM_INIT_SIZE 32
Expand Down Expand Up @@ -50,6 +46,13 @@ See LICENSE for the license
* duplication of some file I/O.
*/

typedef enum {
TOKENIZER_OK,
ERROR_NO_DIGITS,
ERROR_OVERFLOW,
ERROR_INVALID_CHARS,
} TokenizerError;

typedef enum {
START_RECORD,
START_FIELD,
Expand Down Expand Up @@ -209,9 +212,9 @@ void uint_state_init(uint_state *self);
int uint64_conflict(uint_state *self);

uint64_t str_to_uint64(uint_state *state, const char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep);
uint64_t uint_max, TokenizerError *error, char tsep);
int64_t str_to_int64(const char *p_item, int64_t int_min, int64_t int_max,
int *error, char tsep);
TokenizerError *error, char tsep);
double xstrtod(const char *p, char **q, char decimal, char sci, char tsep,
int skip_trailing, int *error, int *maybe_int);
double precise_xstrtod(const char *p, char **q, char decimal, char sci,
Expand Down
138 changes: 50 additions & 88 deletions pandas/_libs/parsers.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -149,7 +149,10 @@ cdef extern from "pandas/parser/tokenizer.h":
SKIP_LINE
FINISHED

enum: ERROR_OVERFLOW
# Mirror of the TokenizerError enum declared in
# pandas/parser/tokenizer.h.  Only the members referenced from this
# module are declared; ERROR_NO_DIGITS exists in the C header but is
# unused here, which Cython's extern enum declarations permit.
ctypedef enum TokenizerError:
    TOKENIZER_OK,
    ERROR_OVERFLOW,
    ERROR_INVALID_CHARS

ctypedef enum BadLineHandleMethod:
ERROR,
Expand Down Expand Up @@ -282,9 +285,9 @@ cdef extern from "pandas/parser/pd_parser.h":
int tokenize_nrows(parser_t *self, size_t nrows, const char *encoding_errors) nogil

int64_t str_to_int64(char *p_item, int64_t int_min,
int64_t int_max, int *error, char tsep) nogil
int64_t int_max, TokenizerError *error, char tsep) nogil
uint64_t str_to_uint64(uint_state *state, char *p_item, int64_t int_max,
uint64_t uint_max, int *error, char tsep) nogil
uint64_t uint_max, TokenizerError *error, char tsep) nogil

double xstrtod(const char *p, char **q, char decimal,
char sci, char tsep, int skip_trailing,
Expand Down Expand Up @@ -1058,7 +1061,7 @@ cdef class TextReader:
if col_dtype is not None:
col_res, na_count = self._convert_with_dtype(
col_dtype, i, start, end, na_filter,
1, na_hashset, na_fset)
1, na_hashset, na_fset, False)

# Fallback on the parse (e.g. we requested int dtype,
# but its actually a float).
Expand All @@ -1069,30 +1072,34 @@ cdef class TextReader:
return self._string_convert(i, start, end, na_filter, na_hashset)
else:
col_res = None
maybe_int = True
for dt in self.dtype_cast_order:
if (dt.kind in "iu" and
self._column_has_float(i, start, end, na_filter, na_hashset)):
if not maybe_int and dt.kind in "iu":
continue

try:
col_res, na_count = self._convert_with_dtype(
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
except ValueError:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
dt, i, start, end, na_filter, 0, na_hashset, na_fset, True)
except ValueError as e:
if str(e) == "Number is not int":
maybe_int = False
continue
else:
# This error is raised from trying to convert to uint64,
# and we discover that we cannot convert to any numerical
# dtype successfully. As a result, we leave the data
# column AS IS with object dtype.
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset, False)
except OverflowError:
try:
col_res, na_count = _try_pylong(self.parser, i, start,
end, na_filter, na_hashset)
except ValueError:
col_res, na_count = self._convert_with_dtype(
np.dtype("object"), i, start, end, 0,
0, na_hashset, na_fset)
0, na_hashset, na_fset, False)

if col_res is not None:
break
Expand Down Expand Up @@ -1140,7 +1147,7 @@ cdef class TextReader:
bint na_filter,
bint user_dtype,
kh_str_starts_t *na_hashset,
set na_fset):
set na_fset, bint raise_on_float):
if isinstance(dtype, CategoricalDtype):
# TODO: I suspect that _categorical_convert could be
# optimized when dtype is an instance of CategoricalDtype
Expand Down Expand Up @@ -1181,14 +1188,14 @@ cdef class TextReader:

elif dtype.kind in "iu":
try:
result, na_count = _try_int64(self.parser, i, start,
end, na_filter, na_hashset)
result, na_count = _try_int64(self.parser, i, start, end,
na_filter, na_hashset, raise_on_float)
if user_dtype and na_count is not None:
if na_count > 0:
raise ValueError(f"Integer column has NA values in column {i}")
except OverflowError:
result = _try_uint64(self.parser, i, start, end,
na_filter, na_hashset)
na_filter, na_hashset, raise_on_float)
na_count = 0

if result is not None and dtype != "int64":
Expand Down Expand Up @@ -1351,59 +1358,6 @@ cdef class TextReader:
else:
return None

cdef bint _column_has_float(self, Py_ssize_t col,
                            int64_t start, int64_t end,
                            bint na_filter, kh_str_starts_t *na_hashset):
    """
    Check if the column contains any float number.

    Scans every word of column `col` over rows [start, end).  A word is
    considered float-indicating if it contains the parser's decimal
    separator, or an 'e'/'E' exponent marker preceded only by numeric
    characters.  Words matching the NA set are skipped when na_filter is
    set.  Returns True on the first float-looking word; returns False as
    soon as any word is seen to be non-numeric, or when the scan ends.
    """
    cdef:
        Py_ssize_t i, j, lines = end - start
        coliter_t it
        const char *word = NULL
        # Sign/space characters allowed before the first digit.
        const char *ignored_chars = " +-"
        const char *digits = "0123456789"
        # Exponent markers: numeric prefix + e/E implies a float.
        const char *float_indicating_chars = "eE"
        char null_byte = 0

    coliter_setup(&it, self.parser, col, start)

    for i in range(lines):
        COLITER_NEXT(it, word)

        # Skip words recognized as NA values — they say nothing about dtype.
        if na_filter and kh_get_str_starts_item(na_hashset, word):
            continue

        found_first_digit = False
        j = 0
        # Walk the NUL-terminated word one character at a time.
        while word[j] != null_byte:
            if word[j] == self.parser.decimal:
                # Decimal separator anywhere marks the column as float.
                return True
            elif not found_first_digit and word[j] in ignored_chars:
                # Leading sign/space before any digit — ignore.
                # no-op
                pass
            elif not found_first_digit and word[j] not in digits:
                # word isn't numeric
                return False
            elif not found_first_digit and word[j] in digits:
                found_first_digit = True
            elif word[j] in float_indicating_chars:
                # preceding chars indicates numeric and
                # current char indicates float
                return True
            elif word[j] not in digits:
                # previous characters indicates numeric
                # current character shows otherwise
                return False
            elif word[j] in digits:
                # Digit after digits — keep scanning.
                # no-op
                pass
            else:
                # Defensive: the branches above are intended to be exhaustive.
                raise AssertionError(
                    f"Unhandled case {word[j]=} {found_first_digit=}"
                )
            j += 1

    return False

# Factor out code common to TextReader.__dealloc__ and TextReader.close
# It cannot be a class method, since calling self.close() in __dealloc__
# which causes a class attribute lookup and violates best practices
Expand Down Expand Up @@ -1800,7 +1754,8 @@ cdef int _try_double_nogil(parser_t *parser,

cdef _try_uint64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset,
bint raise_on_float):
cdef:
int error
Py_ssize_t lines
Expand All @@ -1822,7 +1777,10 @@ cdef _try_uint64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None

if uint64_conflict(&state):
raise ValueError("Cannot convert to numerical dtype")
Expand All @@ -1839,7 +1797,7 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,
const kh_str_starts_t *na_hashset,
uint64_t *data, uint_state *state) nogil:
cdef:
int error
TokenizerError error = TOKENIZER_OK
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
Expand Down Expand Up @@ -1872,9 +1830,10 @@ cdef int _try_uint64_nogil(parser_t *parser, int64_t col,

cdef _try_int64(parser_t *parser, int64_t col,
int64_t line_start, int64_t line_end,
bint na_filter, kh_str_starts_t *na_hashset):
bint na_filter, kh_str_starts_t *na_hashset, bint raise_on_float):
cdef:
int error, na_count = 0
TokenizerError error = TOKENIZER_OK
int na_count = 0
Py_ssize_t lines
coliter_t it
int64_t *data
Expand All @@ -1892,18 +1851,21 @@ cdef _try_int64(parser_t *parser, int64_t col,
if error == ERROR_OVERFLOW:
# Can't get the word variable
raise OverflowError("Overflow")
return None, None
elif raise_on_float and error == ERROR_INVALID_CHARS:
raise ValueError("Number is not int")
elif not raise_on_float or error != ERROR_INVALID_CHARS:
return None, None

return result, na_count


cdef int _try_int64_nogil(parser_t *parser, int64_t col,
int64_t line_start,
int64_t line_end, bint na_filter,
const kh_str_starts_t *na_hashset, int64_t NA,
int64_t *data, int *na_count) nogil:
cdef TokenizerError _try_int64_nogil(parser_t *parser, int64_t col,
int64_t line_start,
int64_t line_end, bint na_filter,
const kh_str_starts_t *na_hashset, int64_t NA,
int64_t *data, int *na_count) nogil:
cdef:
int error
TokenizerError error = TOKENIZER_OK
Py_ssize_t i, lines = line_end - line_start
coliter_t it
const char *word = NULL
Expand All @@ -1922,17 +1884,17 @@ cdef int _try_int64_nogil(parser_t *parser, int64_t col,

data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
if error != TOKENIZER_OK:
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This pattern would definitely be cleaner with a macro to return of non-zero (in a follow up PR is fine)

return error
else:
for i in range(lines):
COLITER_NEXT(it, word)
data[i] = str_to_int64(word, INT64_MIN, INT64_MAX,
&error, parser.thousands)
if error != 0:
if error != TOKENIZER_OK:
return error

return 0
return error

cdef _try_pylong(parser_t *parser, Py_ssize_t col,
int64_t line_start, int64_t line_end,
Expand Down
Loading
Loading