mabel-dev
diff --git a/opteryx/__version__.py b/opteryx/__version__.py
Lines changed: 2 additions & 2 deletions
diff --git a/opteryx/compiled/structures/bloom_filter.pxd b/opteryx/compiled/structures/bloom_filter.pxd
Lines changed: 4 additions & 3 deletions
diff --git a/opteryx/compiled/structures/bloom_filter.pyx b/opteryx/compiled/structures/bloom_filter.pyx
Lines changed: 74 additions & 45 deletions
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1664
+__build__ = 1666
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1664"
+__version__ = "0.26.0-beta.1666"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 
@@ -11,9 +11,10 @@ cimport numpy
 
 # Declaration of the BloomFilter class
 cdef class BloomFilter:
-    cdef unsigned char* bit_array
-    cdef uint32_t bit_array_size
-    cdef uint32_t byte_array_size
+    cdef uint64_t* bit_array  # bit storage, held as an array of 64-bit words
+    cdef uint32_t bit64_array_size  # number of 64-bit words in bit_array
+    cdef uint32_t bit_array_size_bits  # total bits in the filter (bit64_array_size * 64)
+    cdef uint64_t bit_mask  # bit_array_size_bits - 1; masks a hash down to a bit position
 
     cdef inline void _add(self, const uint64_t item)
     cdef inline bint _possibly_contains(self, const uint64_t item)
 
@@ -34,7 +34,7 @@ is the limit of what we think we should speculatively build.
 """
 
 from libc.stdlib cimport calloc, free
-from libc.stdint cimport uint8_t
+from libc.stdint cimport uint64_t, uint32_t
 
 from opteryx.compiled.table_ops.hash_ops cimport compute_row_hashes
 from opteryx.compiled.table_ops.null_avoidant_ops cimport non_null_row_indices
@@ -45,43 +45,39 @@ cimport numpy
 cdef extern from "<stdint.h>":
     ctypedef unsigned long uintptr_t
 
-# Define sizes for the Bloom filters
-cdef uint32_t BYTE_ARRAY_SIZE_TINY = 1 * 1024          # 1 KB for <= 1K records
-cdef uint32_t BYTE_ARRAY_SIZE_SMALL = 64 * 1024        # 64 KB for <= 60K records
-cdef uint32_t BYTE_ARRAY_SIZE_LARGE = 1024 * 1024      # 1 MB for <=  1M records
-cdef uint32_t BYTE_ARRAY_SIZE_HUGE = 16 * 1024 * 1024  # 16 MB for <= 16M records
-
-cdef uint32_t BIT_ARRAY_SIZE_TINY = BYTE_ARRAY_SIZE_TINY << 3    # 8 Kbits
-cdef uint32_t BIT_ARRAY_SIZE_SMALL = BYTE_ARRAY_SIZE_SMALL << 3  # 512 Kbits
-cdef uint32_t BIT_ARRAY_SIZE_LARGE = BYTE_ARRAY_SIZE_LARGE << 3  # 8 Mbits
-cdef uint32_t BIT_ARRAY_SIZE_HUGE = BYTE_ARRAY_SIZE_HUGE << 3    # 128 Mbits
+# Filter sizes, expressed as a count of 64-bit words; every size is a power of two so positions can be masked
+cdef uint32_t BIT64_ARRAY_SIZE_TINY = 128  # 128 * 64 = 8,192 bits (1 KB), used for <= 1,000 records
+cdef uint32_t BIT64_ARRAY_SIZE_SMALL = 8 * 1024  # 8K * 64 = 524,288 bits (64 KB), used for <= 62,000 records
+cdef uint32_t BIT64_ARRAY_SIZE_LARGE = 128 * 1024  # 128K * 64 = 8,388,608 bits (1 MB), used for <= 1,000,000 records
+cdef uint32_t BIT64_ARRAY_SIZE_HUGE = 2 * 1024 * 1024  # 2M * 64 = 134,217,728 bits (16 MB), used for <= 16,000,000 records
 
+# 64-bit golden-ratio multiplier (2^64 / phi), used to derive the second bit position from an item
+cdef uint64_t GOLDEN_RATIO = 0x9E3779B97F4A7C15ULL
 
 cdef class BloomFilter:
-    # defined in the .pxd file only - here so they aren't magic
-    # cdef unsigned char* bit_array
-    # cdef uint32_t bit_array_size
-    # cdef uint32_t byte_array_size
 
     def __cinit__(self, uint32_t expected_records=50000):
         """Initialize Bloom Filter based on expected number of records."""
         if expected_records <= 1_000:
-            self.byte_array_size = BYTE_ARRAY_SIZE_TINY
-            self.bit_array_size = BIT_ARRAY_SIZE_TINY
+            self.bit64_array_size = BIT64_ARRAY_SIZE_TINY
+            self.bit_array_size_bits = BIT64_ARRAY_SIZE_TINY * 64  # 64 bits per word
         elif expected_records <= 62_000:
-            self.byte_array_size = BYTE_ARRAY_SIZE_SMALL
-            self.bit_array_size = BIT_ARRAY_SIZE_SMALL
+            self.bit64_array_size = BIT64_ARRAY_SIZE_SMALL
+            self.bit_array_size_bits = BIT64_ARRAY_SIZE_SMALL * 64  # 64 bits per word
         elif expected_records <= 1_000_000:
-            self.byte_array_size = BYTE_ARRAY_SIZE_LARGE
-            self.bit_array_size = BIT_ARRAY_SIZE_LARGE
+            self.bit64_array_size = BIT64_ARRAY_SIZE_LARGE
+            self.bit_array_size_bits = BIT64_ARRAY_SIZE_LARGE * 64  # 64 bits per word
         elif expected_records <= 16_000_000:
-            self.byte_array_size = BYTE_ARRAY_SIZE_HUGE
-            self.bit_array_size = BIT_ARRAY_SIZE_HUGE
+            self.bit64_array_size = BIT64_ARRAY_SIZE_HUGE
+            self.bit_array_size_bits = BIT64_ARRAY_SIZE_HUGE * 64  # 64 bits per word
         else:
             raise ValueError("Too many records for this Bloom filter implementation")
 
-        # Allocate memory
-        self.bit_array = <unsigned char*>calloc(self.byte_array_size, sizeof(uint8_t))
+        # Every tier is a power of two bits, so (bits - 1) is an all-ones mask and `x & bit_mask` equals `x % bits`
+        self.bit_mask = self.bit_array_size_bits - 1
+
+        # Allocate the bit array as 64-bit words; calloc zero-fills, so the filter starts empty
+        self.bit_array = <uint64_t*>calloc(self.bit64_array_size, sizeof(uint64_t))
         if not self.bit_array:
             raise MemoryError("Failed to allocate memory for the Bloom filter.")
 
@@ -90,61 +86,80 @@ cdef class BloomFilter:
             free(self.bit_array)
 
     cdef inline void _add(self, const uint64_t item):
-        cdef uint32_t h1, h2
+        cdef uint64_t h1, h2  # the two bit positions this item maps to
+
+        # First position: the low bits of item (mask equals modulo because sizes are powers of 2)
+        h1 = item & self.bit_mask
+        # Second position: golden-ratio multiply. NOTE(review): masked low bits of a product depend only on item's low bits, so h2 is a function of h1
+        h2 = (item * GOLDEN_RATIO) & self.bit_mask
 
-        h1 = item & (self.bit_array_size - 1)
-        # Apply the golden ratio to the item and use a mask to keep within the
-        # size of the bit array.
-        h2 = (item * 2654435769U) & (self.bit_array_size - 1)
-        self.bit_array[h1 >> 3] |= 1 << (h1 & 7)
-        self.bit_array[h2 >> 3] |= 1 << (h2 & 7)
+        # Set both bits: h >> 6 selects the 64-bit word, h & 0x3F the bit inside that word
+        self.bit_array[h1 >> 6] |= (<uint64_t>1) << (h1 & 0x3F)
+        self.bit_array[h2 >> 6] |= (<uint64_t>1) << (h2 & 0x3F)
 
     cpdef void add(self, const uint64_t item):
+        """Set the filter bits for a 64-bit item; Python-callable wrapper around _add."""
         self._add(item)
 
     cdef inline bint _possibly_contains(self, const uint64_t item):
-        """Check if the item might be in the set"""
-        cdef uint32_t h1, h2
+        cdef uint64_t h1, h2, mask1, mask2  # two bit positions and their single-bit masks
 
-        h1 = item & (self.bit_array_size - 1)
-        h2 = (item * 2654435769U) & (self.bit_array_size - 1)
-        return (((self.bit_array[h1 >> 3] >> (h1 & 7)) & 1) != 0) and \
-               (((self.bit_array[h2 >> 3] >> (h2 & 7)) & 1) != 0)
+        h1 = item & self.bit_mask
+        h2 = (item * GOLDEN_RATIO) & self.bit_mask  # same position formula as _add
+
+        # The item might be in the set only if both of its bits are set; build a one-bit mask per position
+        mask1 = (<uint64_t>1) << (h1 & 0x3F)
+        mask2 = (<uint64_t>1) << (h2 & 0x3F)
+
+        return (self.bit_array[h1 >> 6] & mask1) != 0 and \
+               (self.bit_array[h2 >> 6] & mask2) != 0
 
     cpdef bint possibly_contains(self, const uint64_t item):
+        """Return True if the item may be present; Python-callable wrapper around _possibly_contains."""
         return self._possibly_contains(item)
 
     cpdef numpy.ndarray[numpy.npy_bool, ndim=1] possibly_contains_many(self, object relation, list columns):
         """
-        Return a boolean array indicating whether each row in `relation` might be in the Bloom filter.
-        Null-containing rows are considered not present (False).
+        Return a boolean array marking which rows of `relation` might be in the filter; rows with nulls in `columns` stay False.
         """
         cdef Py_ssize_t num_rows = relation.num_rows
-        cdef numpy.ndarray[numpy.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=numpy.bool)
+        cdef numpy.ndarray[numpy.npy_bool, ndim=1] result = numpy.zeros(num_rows, dtype=numpy.bool_)  # NOTE(review): this patch drops the uint8_t cimport used on the next line - confirm the .pxd provides it
         cdef uint8_t[::1] result_view = result
         cdef int64_t[::1] valid_row_ids = non_null_row_indices(relation, columns)
         cdef Py_ssize_t num_valid_rows = valid_row_ids.shape[0]
         cdef numpy.ndarray[numpy.uint64_t, ndim=1] row_hashes_np = numpy.zeros(num_rows, dtype=numpy.uint64)
         cdef uint64_t[::1] row_hashes = row_hashes_np
         cdef Py_ssize_t i
         cdef int64_t row_id
+        cdef uint64_t hash_val, h1, h2, mask1, mask2  # per-row hash, its two bit positions and their one-bit masks
 
         if num_valid_rows == 0:
             return result
 
         # Compute hashes only for non-null rows
         compute_row_hashes(relation, columns, row_hashes)
 
+        # Hoist attributes into locals; the loop below inlines _possibly_contains and must stay in sync with it
+        cdef uint64_t bit_mask = self.bit_mask
+        cdef uint64_t golden_ratio = GOLDEN_RATIO
+        cdef uint64_t* bit_array = self.bit_array
+
         for i in range(num_valid_rows):
             row_id = valid_row_ids[i]
-            result_view[row_id] = self._possibly_contains(row_hashes[row_id])
+            hash_val = row_hashes[row_id]
+
+            h1 = hash_val & bit_mask
+            h2 = (hash_val * golden_ratio) & bit_mask
+
+            mask1 = (<uint64_t>1) << (h1 & 0x3F)
+            mask2 = (<uint64_t>1) << (h2 & 0x3F)
+
+            result_view[row_id] = (bit_array[h1 >> 6] & mask1) != 0 and \
+                (bit_array[h2 >> 6] & mask2) != 0
 
         return result
 
 cpdef BloomFilter create_bloom_filter(object relation, list columns):
     """
-    Create a BloomFilter from the specified `columns` in `relation`,
-    ignoring rows with nulls in any of the columns.
+    Create a BloomFilter from the specified `columns` in `relation`, ignoring rows with nulls in any of the columns.
     """
     cdef:
         Py_ssize_t num_rows = relation.num_rows
@@ -153,16 +168,30 @@ cpdef BloomFilter create_bloom_filter(object relation, list columns):
         numpy.ndarray[numpy.uint64_t, ndim=1] row_hashes_np = numpy.empty(num_rows, dtype=numpy.uint64)
         uint64_t[::1] row_hashes = row_hashes_np
         Py_ssize_t i
+        int64_t row_id  # index into row_hashes, taken from valid_row_ids (declared in lines this hunk omits)
         BloomFilter bf = BloomFilter(num_valid_rows)
+        uint64_t hash_val, h1, h2  # per-row hash and its two bit positions
 
     if num_valid_rows == 0:
         return bf
 
     # Populate row hashes using the selected columns
     compute_row_hashes(relation, columns, row_hashes)
 
+    # Hoist attributes into locals; the loop below inlines BloomFilter._add and must stay in sync with it
+    cdef uint64_t bit_mask = bf.bit_mask
+    cdef uint64_t golden_ratio = GOLDEN_RATIO
+    cdef uint64_t* bit_array = bf.bit_array
+
     # Add to bloom filter
     for i in range(num_valid_rows):
-        bf._add(row_hashes[valid_row_ids[i]])
+        row_id = valid_row_ids[i]
+        hash_val = row_hashes[row_id]
+
+        h1 = hash_val & bit_mask
+        h2 = (hash_val * golden_ratio) & bit_mask
+
+        bit_array[h1 >> 6] |= (<uint64_t>1) << (h1 & 0x3F)
+        bit_array[h2 >> 6] |= (<uint64_t>1) << (h2 & 0x3F)
 
     return bf