@@ -34,7 +34,7 @@ is the limit of what we think we should speculatively build.
3434"""
3535
from libc.stdlib cimport calloc, free
from libc.stdint cimport uint8_t, uint32_t, uint64_t

from opteryx.compiled.table_ops.hash_ops cimport compute_row_hashes
from opteryx.compiled.table_ops.null_avoidant_ops cimport non_null_row_indices
@@ -45,43 +45,39 @@ cimport numpy
4545cdef extern from " <stdint.h>" :
4646 ctypedef unsigned long uintptr_t
4747
# Bloom filter capacities, counted in 64-bit words.
# Each one is a power of two, so the total bit count is a power of two as
# well and a bit position can be reduced with a mask instead of a modulo.
cdef uint32_t BIT64_ARRAY_SIZE_TINY = 1 << 7     # 128 words  ->       8,192 bits
cdef uint32_t BIT64_ARRAY_SIZE_SMALL = 1 << 13   # 8K words   ->     524,288 bits
cdef uint32_t BIT64_ARRAY_SIZE_LARGE = 1 << 17   # 128K words ->   8,388,608 bits
cdef uint32_t BIT64_ARRAY_SIZE_HUGE = 1 << 21    # 2M words   -> 134,217,728 bits

# Golden-ratio multiplier used to derive the second bit position of an item.
cdef uint64_t GOLDEN_RATIO = 0x9E3779B97F4A7C15ULL
5956
6057cdef class BloomFilter:
61- # defined in the .pxd file only - here so they aren't magic
62- # cdef unsigned char* bit_array
63- # cdef uint32_t bit_array_size
64- # cdef uint32_t byte_array_size
6558
def __cinit__(self, uint32_t expected_records=50000):
    """
    Pick a capacity for the expected number of records and allocate it.

    Parameters:
        expected_records: anticipated number of items; selects one of the
            four fixed filter sizes.

    Raises:
        ValueError: more than 16,000,000 records are expected.
        MemoryError: the bit array could not be allocated.
    """
    cdef uint32_t words

    # Reject oversized requests first; nothing has been allocated yet.
    if expected_records > 16_000_000:
        raise ValueError("Too many records for this Bloom filter implementation")

    if expected_records > 1_000_000:
        words = BIT64_ARRAY_SIZE_HUGE
    elif expected_records > 62_000:
        words = BIT64_ARRAY_SIZE_LARGE
    elif expected_records > 1_000:
        words = BIT64_ARRAY_SIZE_SMALL
    else:
        words = BIT64_ARRAY_SIZE_TINY

    self.bit64_array_size = words
    self.bit_array_size_bits = words << 6  # 64 bits per word

    # The bit count is a power of two, so (count - 1) is an all-ones mask
    # that stands in for `% count` when reducing a hash to a bit position.
    self.bit_mask = self.bit_array_size_bits - 1

    # calloc returns zeroed memory, i.e. an empty filter.
    self.bit_array = <uint64_t*>calloc(self.bit64_array_size, sizeof(uint64_t))
    if self.bit_array == NULL:
        raise MemoryError("Failed to allocate memory for the Bloom filter.")
8783
@@ -90,61 +86,80 @@ cdef class BloomFilter:
9086 free(self .bit_array)
9187
cdef inline void _add(self, const uint64_t item):
    """Mark `item` as present by setting its two bit positions."""
    cdef uint64_t mask = self.bit_mask
    cdef uint64_t first = item & mask
    cdef uint64_t second = (item * GOLDEN_RATIO) & mask

    # NOTE(review): both positions keep only the low bits, and the low bits
    # of a product depend only on the low bits of its factors, so `second`
    # is fully determined by `first`. If independent probes are wanted,
    # derive `second` from the high bits of the product - and change every
    # inlined copy of this scheme in the file at the same time.

    # position >> 6 selects the 64-bit word, position & 63 the bit inside it.
    self.bit_array[first >> 6] |= (<uint64_t>1) << (first & 63)
    self.bit_array[second >> 6] |= (<uint64_t>1) << (second & 63)
10199
cpdef void add(self, const uint64_t item):
    """Python-callable wrapper that adds one 64-bit item to the filter."""
    self._add(item)
104102
cdef inline bint _possibly_contains(self, const uint64_t item):
    """
    Report whether `item` may have been added.

    Returns False as soon as one of the item's two bits is found clear,
    which means the item was never added; True means both bits are set.
    """
    cdef uint64_t mask = self.bit_mask
    cdef uint64_t first = item & mask
    cdef uint64_t second

    # First probe: a clear bit settles the answer without a second load.
    if (self.bit_array[first >> 6] & ((<uint64_t>1) << (first & 63))) == 0:
        return False

    second = (item * GOLDEN_RATIO) & mask
    return (self.bit_array[second >> 6] & ((<uint64_t>1) << (second & 63))) != 0
113115
cpdef bint possibly_contains(self, const uint64_t item):
    """Python-callable membership probe; False means the item was never added."""
    return self._possibly_contains(item)
116118
cpdef numpy.ndarray[numpy.npy_bool, ndim=1] possibly_contains_many(self, object relation, list columns):
    """
    Test every row of `relation` against the filter in one pass.

    Parameters:
        relation: table-like object exposing `num_rows`; handed unchanged to
            `non_null_row_indices` and `compute_row_hashes`.
        columns: columns used both to pick eligible rows and to hash them.

    Returns:
        Boolean array with one entry per row of `relation`. Entries for rows
        not reported by `non_null_row_indices` are left False.
    """
    cdef Py_ssize_t num_rows = relation.num_rows
    cdef numpy.ndarray[numpy.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=numpy.bool_)
    cdef uint8_t[::1] result_view = result
    cdef int64_t[::1] valid_row_ids = non_null_row_indices(relation, columns)
    cdef Py_ssize_t num_valid_rows = valid_row_ids.shape[0]
    cdef numpy.ndarray[numpy.uint64_t, ndim=1] row_hashes_np
    cdef uint64_t[::1] row_hashes
    cdef Py_ssize_t i
    cdef int64_t row_id
    cdef uint64_t hash_val, h1, h2, mask1, mask2

    # Copy the filter state into locals so the loop works on plain C values.
    cdef uint64_t bit_mask = self.bit_mask
    cdef uint64_t golden_ratio = GOLDEN_RATIO
    cdef uint64_t* bit_array = self.bit_array

    # Nothing to probe: return the all-False result before paying for the
    # num_rows-sized hash buffer.
    if num_valid_rows == 0:
        return result

    # The buffer spans all rows because it is indexed by original row id.
    # NOTE(review): whether compute_row_hashes skips null rows cannot be
    # seen from here; only the ids in valid_row_ids are read back.
    row_hashes_np = numpy.zeros(num_rows, dtype=numpy.uint64)
    row_hashes = row_hashes_np
    compute_row_hashes(relation, columns, row_hashes)

    for i in range(num_valid_rows):
        row_id = valid_row_ids[i]
        hash_val = row_hashes[row_id]

        # Same two-position test as _possibly_contains, written out inline.
        h1 = hash_val & bit_mask
        h2 = (hash_val * golden_ratio) & bit_mask

        mask1 = (<uint64_t>1) << (h1 & 0x3F)
        mask2 = (<uint64_t>1) << (h2 & 0x3F)

        result_view[row_id] = (bit_array[h1 >> 6] & mask1) != 0 and \
                              (bit_array[h2 >> 6] & mask2) != 0

    return result
143159
144160cpdef BloomFilter create_bloom_filter(object relation, list columns):
145161 """
146- Create a BloomFilter from the specified `columns` in `relation`,
147- ignoring rows with nulls in any of the columns.
162+ Optimized Bloom filter creation with better cache behavior.
148163 """
# Builds a BloomFilter sized from `num_valid_rows`, hashes the rows of
# `relation` over `columns`, and sets two bits for every row id listed in
# `valid_row_ids`. Returns the filter, which is empty when there are no
# valid rows.
# NOTE(review): `valid_row_ids` and `num_valid_rows` are declared on lines
# hidden by the hunk gap below - presumably via
# non_null_row_indices(relation, columns), as in possibly_contains_many;
# confirm against the full file.
149164 cdef:
150165 Py_ssize_t num_rows = relation.num_rows
@@ -153,16 +168,30 @@ cpdef BloomFilter create_bloom_filter(object relation, list columns):
# NOTE(review): this buffer comes from numpy.empty while
# possibly_contains_many uses numpy.zeros for the same purpose - confirm
# compute_row_hashes overwrites every element it later reads.
153168 numpy.ndarray[numpy.uint64_t, ndim= 1 ] row_hashes_np = numpy.empty(num_rows, dtype = numpy.uint64)
154169 uint64_t[::1 ] row_hashes = row_hashes_np
155170 Py_ssize_t i
171+ int64_t row_id
# BloomFilter(...) raises ValueError when num_valid_rows exceeds 16,000,000.
156172 BloomFilter bf = BloomFilter(num_valid_rows)
173+ uint64_t hash_val, h1, h2
157174 
158175 if num_valid_rows == 0 :
159176 return bf
160177 
161178 # Populate row hashes using the selected columns
162179 compute_row_hashes(relation, columns, row_hashes)
163180 
181+ # Precompute constants for faster access
182+ cdef uint64_t bit_mask = bf.bit_mask
183+ cdef uint64_t golden_ratio = GOLDEN_RATIO
184+ cdef uint64_t* bit_array = bf.bit_array
185+
164186 # Add to bloom filter
# The loop body repeats the two-position scheme of BloomFilter._add inline;
# the two must stay identical or lookups will miss rows added here.
165187 for i in range (num_valid_rows):
166- bf._add(row_hashes[valid_row_ids[i]])
188+ row_id = valid_row_ids[i]
189+ hash_val = row_hashes[row_id]
190+
191+ h1 = hash_val & bit_mask
192+ h2 = (hash_val * golden_ratio) & bit_mask
193+
# h >> 6 selects the 64-bit word, h & 0x3F the bit inside that word.
194+ bit_array[h1 >> 6 ] |= (< uint64_t> 1 ) << (h1 & 0x3F )
195+ bit_array[h2 >> 6 ] |= (< uint64_t> 1 ) << (h2 & 0x3F )
167196 
168197 return bf
0 commit comments