@@ -358,7 +358,7 @@ cdef class TextReader:
358
358
int64_t leading_cols, table_width
359
359
object delimiter # bytes or str
360
360
object converters
361
- object na_values
361
+ object na_values # dict[hashable, set[str]] | list[str]
362
362
list header # list[list[non-negative integers]]
363
363
object index_col
364
364
object skiprows
@@ -390,8 +390,8 @@ cdef class TextReader:
390
390
usecols = None ,
391
391
on_bad_lines = ERROR,
392
392
bint na_filter = True ,
393
- na_values = None ,
394
- na_fvalues = None ,
393
+ na_values = None , # dict[ hashable , set[str]] | set[str]
394
+ na_fvalues = None , # dict[ hashable , set[float]] | set[float]
395
395
bint keep_default_na = True ,
396
396
true_values = None ,
397
397
false_values = None ,
@@ -486,9 +486,17 @@ cdef class TextReader:
486
486
487
487
self .delimiter = delimiter
488
488
489
+ # na_fvalues is created from user-provided na_values in _clean_na_values
490
+ # which ensures that either
491
+ # a) na_values is set[str] and na_fvalues is set[float]
492
+ # b) na_values is dict[Hashable, set[str]] and
493
+ # na_fvalues is dict[Hashable, set[float]]
494
+ # (tests for this case are in test_na_values.py)
495
+ if not isinstance (na_values, dict ):
496
+ # i.e. it must be a set
497
+ na_values = list (na_values)
498
+
489
499
self .na_values = na_values
490
- if na_fvalues is None :
491
- na_fvalues = set ()
492
500
self .na_fvalues = na_fvalues
493
501
494
502
self .true_values = _maybe_encode(true_values) + _true_values
@@ -929,7 +937,8 @@ cdef class TextReader:
929
937
int nused
930
938
kh_str_starts_t *na_hashset = NULL
931
939
int64_t start , end
932
- object name , na_flist , col_dtype = None
940
+ object name , col_dtype = None
941
+ set na_fset
933
942
bint na_filter = 0
934
943
int64_t num_cols
935
944
dict results
@@ -1021,18 +1030,15 @@ cdef class TextReader:
1021
1030
results[i] = _apply_converter(conv, self .parser, i, start, end)
1022
1031
continue
1023
1032
1024
- # Collect the list of NaN values associated with the column.
1033
+ # Collect the set of NaN values associated with the column.
1025
1034
# If we aren't supposed to do that, or none are collected,
1026
1035
# we set `na_filter` to `0` (`1` otherwise).
1027
- na_flist = set ()
1036
+ na_fset = set ()
1028
1037
1029
1038
if self .na_filter:
1030
- na_list, na_flist = self ._get_na_list(i, name)
1031
- if na_list is None :
1032
- na_filter = 0
1033
- else :
1034
- na_filter = 1
1035
- na_hashset = kset_from_list(na_list)
1039
+ na_list, na_fset = self ._get_na_list(i, name)
1040
+ na_filter = 1
1041
+ na_hashset = kset_from_list(na_list)
1036
1042
else :
1037
1043
na_filter = 0
1038
1044
@@ -1041,7 +1047,7 @@ cdef class TextReader:
1041
1047
try :
1042
1048
col_res, na_count = self ._convert_tokens(
1043
1049
i, start, end, name, na_filter, na_hashset,
1044
- na_flist , col_dtype)
1050
+ na_fset , col_dtype)
1045
1051
finally :
1046
1052
# gh-21353
1047
1053
#
@@ -1075,12 +1081,12 @@ cdef class TextReader:
1075
1081
cdef _convert_tokens(self , Py_ssize_t i, int64_t start,
1076
1082
int64_t end, object name, bint na_filter,
1077
1083
kh_str_starts_t * na_hashset,
1078
- object na_flist , object col_dtype):
1084
+ set na_fset , object col_dtype):
1079
1085
1080
1086
if col_dtype is not None :
1081
1087
col_res, na_count = self ._convert_with_dtype(
1082
1088
col_dtype, i, start, end, na_filter,
1083
- 1 , na_hashset, na_flist )
1089
+ 1 , na_hashset, na_fset )
1084
1090
1085
1091
# Fallback on the parse (e.g. we requested int dtype,
1086
1092
# but it's actually a float).
@@ -1094,19 +1100,19 @@ cdef class TextReader:
1094
1100
for dt in self .dtype_cast_order:
1095
1101
try :
1096
1102
col_res, na_count = self ._convert_with_dtype(
1097
- dt, i, start, end, na_filter, 0 , na_hashset, na_flist )
1103
+ dt, i, start, end, na_filter, 0 , na_hashset, na_fset )
1098
1104
except ValueError :
1099
1105
# This error is raised from trying to convert to uint64,
1100
1106
# and we discover that we cannot convert to any numerical
1101
1107
# dtype successfully. As a result, we leave the data
1102
1108
# column AS IS with object dtype.
1103
1109
col_res, na_count = self ._convert_with_dtype(
1104
1110
np.dtype(" object" ), i, start, end, 0 ,
1105
- 0 , na_hashset, na_flist )
1111
+ 0 , na_hashset, na_fset )
1106
1112
except OverflowError :
1107
1113
col_res, na_count = self ._convert_with_dtype(
1108
1114
np.dtype(" object" ), i, start, end, na_filter,
1109
- 0 , na_hashset, na_flist )
1115
+ 0 , na_hashset, na_fset )
1110
1116
1111
1117
if col_res is not None :
1112
1118
break
@@ -1154,7 +1160,7 @@ cdef class TextReader:
1154
1160
bint na_filter,
1155
1161
bint user_dtype,
1156
1162
kh_str_starts_t * na_hashset,
1157
- object na_flist ):
1163
+ set na_fset ):
1158
1164
if isinstance (dtype, CategoricalDtype):
1159
1165
# TODO: I suspect that _categorical_convert could be
1160
1166
# optimized when dtype is an instance of CategoricalDtype
@@ -1212,7 +1218,7 @@ cdef class TextReader:
1212
1218
1213
1219
elif dtype.kind == " f" :
1214
1220
result, na_count = _try_double(self .parser, i, start, end,
1215
- na_filter, na_hashset, na_flist )
1221
+ na_filter, na_hashset, na_fset )
1216
1222
1217
1223
if result is not None and dtype != " float64" :
1218
1224
result = result.astype(dtype)
@@ -1272,10 +1278,6 @@ cdef class TextReader:
1272
1278
return self .converters.get(i)
1273
1279
1274
1280
cdef _get_na_list(self , Py_ssize_t i, name):
1275
- # Note: updates self.na_values, self.na_fvalues
1276
- if self .na_values is None :
1277
- return None , set ()
1278
-
1279
1281
if isinstance (self .na_values, dict ):
1280
1282
key = None
1281
1283
values = None
@@ -1300,11 +1302,6 @@ cdef class TextReader:
1300
1302
1301
1303
return _ensure_encoded(values), fvalues
1302
1304
else :
1303
- if not isinstance (self .na_values, list ):
1304
- self .na_values = list (self .na_values)
1305
- if not isinstance (self .na_fvalues, set ):
1306
- self .na_fvalues = set (self .na_fvalues)
1307
-
1308
1305
return _ensure_encoded(self .na_values), self .na_fvalues
1309
1306
1310
1307
cdef _free_na_set(self , kh_str_starts_t * table):
@@ -1622,27 +1619,27 @@ cdef:
1622
1619
# -> tuple[ndarray[float64_t], int] | tuple[None, None]
1623
1620
cdef _try_double(parser_t * parser, int64_t col,
1624
1621
int64_t line_start, int64_t line_end,
1625
- bint na_filter, kh_str_starts_t * na_hashset, object na_flist ):
1622
+ bint na_filter, kh_str_starts_t * na_hashset, set na_fset ):
1626
1623
cdef:
1627
1624
int error, na_count = 0
1628
1625
Py_ssize_t lines
1629
1626
float64_t * data
1630
1627
float64_t NA = na_values[np.float64]
1631
- kh_float64_t * na_fset
1628
+ kh_float64_t * na_fhashset
1632
1629
ndarray[float64_t] result
1633
- bint use_na_flist = len (na_flist ) > 0
1630
+ bint use_na_flist = len (na_fset ) > 0
1634
1631
1635
1632
lines = line_end - line_start
1636
1633
result = np.empty(lines, dtype = np.float64)
1637
1634
data = < float64_t * > result.data
1638
- na_fset = kset_float64_from_list(na_flist )
1635
+ na_fhashset = kset_float64_from_set(na_fset )
1639
1636
with nogil:
1640
1637
error = _try_double_nogil(parser, parser.double_converter,
1641
1638
col, line_start, line_end,
1642
1639
na_filter, na_hashset, use_na_flist,
1643
- na_fset , NA, data, & na_count)
1640
+ na_fhashset , NA, data, & na_count)
1644
1641
1645
- kh_destroy_float64(na_fset )
1642
+ kh_destroy_float64(na_fhashset )
1646
1643
if error != 0 :
1647
1644
return None , None
1648
1645
return result, na_count
@@ -1655,7 +1652,7 @@ cdef int _try_double_nogil(parser_t *parser,
1655
1652
int64_t col, int64_t line_start, int64_t line_end,
1656
1653
bint na_filter, kh_str_starts_t * na_hashset,
1657
1654
bint use_na_flist,
1658
- const kh_float64_t * na_flist ,
1655
+ const kh_float64_t * na_fhashset ,
1659
1656
float64_t NA, float64_t * data,
1660
1657
int * na_count) nogil:
1661
1658
cdef:
@@ -1694,8 +1691,8 @@ cdef int _try_double_nogil(parser_t *parser,
1694
1691
else :
1695
1692
return 1
1696
1693
if use_na_flist:
1697
- k64 = kh_get_float64(na_flist , data[0 ])
1698
- if k64 != na_flist .n_buckets:
1694
+ k64 = kh_get_float64(na_fhashset , data[0 ])
1695
+ if k64 != na_fhashset .n_buckets:
1699
1696
na_count[0 ] += 1
1700
1697
data[0 ] = NA
1701
1698
data += 1
@@ -1977,7 +1974,7 @@ cdef kh_str_starts_t* kset_from_list(list values) except NULL:
1977
1974
return table
1978
1975
1979
1976
1980
- cdef kh_float64_t* kset_float64_from_list( values) except NULL :
1977
+ cdef kh_float64_t* kset_float64_from_set( set values) except NULL :
1981
1978
# caller takes responsibility for freeing the hash table
1982
1979
cdef:
1983
1980
kh_float64_t * table
0 commit comments