Skip to content

Commit ef14cb7

Browse files
committed
REF: remove unreachable, stronger typing in parsers.pyx
1 parent ebca3c5 commit ef14cb7

File tree

3 files changed

+88
-61
lines changed

3 files changed

+88
-61
lines changed

pandas/_libs/parsers.pyx

Lines changed: 38 additions & 41 deletions
Original file line numberDiff line numberDiff line change
@@ -358,7 +358,7 @@ cdef class TextReader:
358358
int64_t leading_cols, table_width
359359
object delimiter # bytes or str
360360
object converters
361-
object na_values
361+
object na_values # dict[hashable, set[str]] | list[str]
362362
list header # list[list[non-negative integers]]
363363
object index_col
364364
object skiprows
@@ -390,8 +390,8 @@ cdef class TextReader:
390390
usecols=None,
391391
on_bad_lines=ERROR,
392392
bint na_filter=True,
393-
na_values=None,
394-
na_fvalues=None,
393+
na_values=None, # dict[hashable, set[str]] | set[str]
394+
na_fvalues=None, # dict[hashable, set[float]] | set[float]
395395
bint keep_default_na=True,
396396
true_values=None,
397397
false_values=None,
@@ -486,9 +486,17 @@ cdef class TextReader:
486486

487487
self.delimiter = delimiter
488488

489+
# na_fvalues is created from user-provided na_values in _clean_na_values
490+
# which ensures that either
491+
# a) na_values is set[str] and na_fvalues is set[float]
492+
# b) na_values is dict[Hashable, set[str]] and
493+
# na_fvalues is dict[Hashable, set[float]]
494+
# (tests for this case are in test_na_values.py)
495+
if not isinstance(na_values, dict):
496+
# i.e. it must be a set
497+
na_values = list(na_values)
498+
489499
self.na_values = na_values
490-
if na_fvalues is None:
491-
na_fvalues = set()
492500
self.na_fvalues = na_fvalues
493501

494502
self.true_values = _maybe_encode(true_values) + _true_values
@@ -929,7 +937,8 @@ cdef class TextReader:
929937
int nused
930938
kh_str_starts_t *na_hashset = NULL
931939
int64_t start, end
932-
object name, na_flist, col_dtype = None
940+
object name, col_dtype = None
941+
set na_fset
933942
bint na_filter = 0
934943
int64_t num_cols
935944
dict results
@@ -1021,18 +1030,15 @@ cdef class TextReader:
10211030
results[i] = _apply_converter(conv, self.parser, i, start, end)
10221031
continue
10231032

1024-
# Collect the list of NaN values associated with the column.
1033+
# Collect the set of NaN values associated with the column.
10251034
# If we aren't supposed to do that, or none are collected,
10261035
# we set `na_filter` to `0` (`1` otherwise).
1027-
na_flist = set()
1036+
na_fset = set()
10281037

10291038
if self.na_filter:
1030-
na_list, na_flist = self._get_na_list(i, name)
1031-
if na_list is None:
1032-
na_filter = 0
1033-
else:
1034-
na_filter = 1
1035-
na_hashset = kset_from_list(na_list)
1039+
na_list, na_fset = self._get_na_list(i, name)
1040+
na_filter = 1
1041+
na_hashset = kset_from_list(na_list)
10361042
else:
10371043
na_filter = 0
10381044

@@ -1041,7 +1047,7 @@ cdef class TextReader:
10411047
try:
10421048
col_res, na_count = self._convert_tokens(
10431049
i, start, end, name, na_filter, na_hashset,
1044-
na_flist, col_dtype)
1050+
na_fset, col_dtype)
10451051
finally:
10461052
# gh-21353
10471053
#
@@ -1075,12 +1081,12 @@ cdef class TextReader:
10751081
cdef _convert_tokens(self, Py_ssize_t i, int64_t start,
10761082
int64_t end, object name, bint na_filter,
10771083
kh_str_starts_t *na_hashset,
1078-
object na_flist, object col_dtype):
1084+
set na_fset, object col_dtype):
10791085

10801086
if col_dtype is not None:
10811087
col_res, na_count = self._convert_with_dtype(
10821088
col_dtype, i, start, end, na_filter,
1083-
1, na_hashset, na_flist)
1089+
1, na_hashset, na_fset)
10841090

10851091
# Fallback on the parse (e.g. we requested int dtype,
10861092
# but its actually a float).
@@ -1094,19 +1100,19 @@ cdef class TextReader:
10941100
for dt in self.dtype_cast_order:
10951101
try:
10961102
col_res, na_count = self._convert_with_dtype(
1097-
dt, i, start, end, na_filter, 0, na_hashset, na_flist)
1103+
dt, i, start, end, na_filter, 0, na_hashset, na_fset)
10981104
except ValueError:
10991105
# This error is raised from trying to convert to uint64,
11001106
# and we discover that we cannot convert to any numerical
11011107
# dtype successfully. As a result, we leave the data
11021108
# column AS IS with object dtype.
11031109
col_res, na_count = self._convert_with_dtype(
11041110
np.dtype("object"), i, start, end, 0,
1105-
0, na_hashset, na_flist)
1111+
0, na_hashset, na_fset)
11061112
except OverflowError:
11071113
col_res, na_count = self._convert_with_dtype(
11081114
np.dtype("object"), i, start, end, na_filter,
1109-
0, na_hashset, na_flist)
1115+
0, na_hashset, na_fset)
11101116

11111117
if col_res is not None:
11121118
break
@@ -1154,7 +1160,7 @@ cdef class TextReader:
11541160
bint na_filter,
11551161
bint user_dtype,
11561162
kh_str_starts_t *na_hashset,
1157-
object na_flist):
1163+
set na_fset):
11581164
if isinstance(dtype, CategoricalDtype):
11591165
# TODO: I suspect that _categorical_convert could be
11601166
# optimized when dtype is an instance of CategoricalDtype
@@ -1212,7 +1218,7 @@ cdef class TextReader:
12121218

12131219
elif dtype.kind == "f":
12141220
result, na_count = _try_double(self.parser, i, start, end,
1215-
na_filter, na_hashset, na_flist)
1221+
na_filter, na_hashset, na_fset)
12161222

12171223
if result is not None and dtype != "float64":
12181224
result = result.astype(dtype)
@@ -1272,10 +1278,6 @@ cdef class TextReader:
12721278
return self.converters.get(i)
12731279

12741280
cdef _get_na_list(self, Py_ssize_t i, name):
1275-
# Note: updates self.na_values, self.na_fvalues
1276-
if self.na_values is None:
1277-
return None, set()
1278-
12791281
if isinstance(self.na_values, dict):
12801282
key = None
12811283
values = None
@@ -1300,11 +1302,6 @@ cdef class TextReader:
13001302

13011303
return _ensure_encoded(values), fvalues
13021304
else:
1303-
if not isinstance(self.na_values, list):
1304-
self.na_values = list(self.na_values)
1305-
if not isinstance(self.na_fvalues, set):
1306-
self.na_fvalues = set(self.na_fvalues)
1307-
13081305
return _ensure_encoded(self.na_values), self.na_fvalues
13091306

13101307
cdef _free_na_set(self, kh_str_starts_t *table):
@@ -1622,27 +1619,27 @@ cdef:
16221619
# -> tuple[ndarray[float64_t], int] | tuple[None, None]
16231620
cdef _try_double(parser_t *parser, int64_t col,
16241621
int64_t line_start, int64_t line_end,
1625-
bint na_filter, kh_str_starts_t *na_hashset, object na_flist):
1622+
bint na_filter, kh_str_starts_t *na_hashset, set na_fset):
16261623
cdef:
16271624
int error, na_count = 0
16281625
Py_ssize_t lines
16291626
float64_t *data
16301627
float64_t NA = na_values[np.float64]
1631-
kh_float64_t *na_fset
1628+
kh_float64_t *na_fhashset
16321629
ndarray[float64_t] result
1633-
bint use_na_flist = len(na_flist) > 0
1630+
bint use_na_flist = len(na_fset) > 0
16341631

16351632
lines = line_end - line_start
16361633
result = np.empty(lines, dtype=np.float64)
16371634
data = <float64_t *>result.data
1638-
na_fset = kset_float64_from_list(na_flist)
1635+
na_fhashset = kset_float64_from_set(na_fset)
16391636
with nogil:
16401637
error = _try_double_nogil(parser, parser.double_converter,
16411638
col, line_start, line_end,
16421639
na_filter, na_hashset, use_na_flist,
1643-
na_fset, NA, data, &na_count)
1640+
na_fhashset, NA, data, &na_count)
16441641

1645-
kh_destroy_float64(na_fset)
1642+
kh_destroy_float64(na_fhashset)
16461643
if error != 0:
16471644
return None, None
16481645
return result, na_count
@@ -1655,7 +1652,7 @@ cdef int _try_double_nogil(parser_t *parser,
16551652
int64_t col, int64_t line_start, int64_t line_end,
16561653
bint na_filter, kh_str_starts_t *na_hashset,
16571654
bint use_na_flist,
1658-
const kh_float64_t *na_flist,
1655+
const kh_float64_t *na_fhashset,
16591656
float64_t NA, float64_t *data,
16601657
int *na_count) nogil:
16611658
cdef:
@@ -1694,8 +1691,8 @@ cdef int _try_double_nogil(parser_t *parser,
16941691
else:
16951692
return 1
16961693
if use_na_flist:
1697-
k64 = kh_get_float64(na_flist, data[0])
1698-
if k64 != na_flist.n_buckets:
1694+
k64 = kh_get_float64(na_fhashset, data[0])
1695+
if k64 != na_fhashset.n_buckets:
16991696
na_count[0] += 1
17001697
data[0] = NA
17011698
data += 1
@@ -1977,7 +1974,7 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL:
19771974
return table
19781975

19791976

1980-
cdef kh_float64_t* kset_float64_from_list(values) except NULL:
1977+
cdef kh_float64_t* kset_float64_from_set(set values) except NULL:
19811978
# caller takes responsibility for freeing the hash table
19821979
cdef:
19831980
kh_float64_t *table

pandas/io/parsers/readers.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1666,7 +1666,7 @@ def _clean_na_values(na_values, keep_default_na: bool = True, floatify: bool = T
16661666
return na_values, na_fvalues
16671667

16681668

1669-
def _floatify_na_values(na_values):
1669+
def _floatify_na_values(na_values) -> set[float]:
16701670
# create float versions of the na_values
16711671
result = set()
16721672
for v in na_values:

0 commit comments

Comments
 (0)