mabel-dev
diff --git a/‎.gitignore‎
Lines changed: 1 addition & 0 deletions b/‎.gitignore‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎opteryx/__version__.py‎
Lines changed: 2 additions & 2 deletions b/‎opteryx/__version__.py‎
Lines changed: 2 additions & 2 deletions
diff --git a/‎opteryx/compiled/joins/cross_join.pyx‎
Lines changed: 9 additions & 3 deletions b/‎opteryx/compiled/joins/cross_join.pyx‎
Lines changed: 9 additions & 3 deletions
diff --git a/‎opteryx/compiled/joins/filter_join.pyx‎
Lines changed: 3 additions & 3 deletions b/‎opteryx/compiled/joins/filter_join.pyx‎
Lines changed: 3 additions & 3 deletions
diff --git a/‎opteryx/compiled/joins/inner_join.pyx‎
Lines changed: 77 additions & 58 deletions b/‎opteryx/compiled/joins/inner_join.pyx‎
Lines changed: 77 additions & 58 deletions
diff --git a/‎opteryx/compiled/joins/nested_loop_join.pyx‎
Lines changed: 65 additions & 0 deletions b/‎opteryx/compiled/joins/nested_loop_join.pyx‎
Lines changed: 65 additions & 0 deletions
diff --git a/‎opteryx/compiled/structures/buffers.pxd‎
Lines changed: 1 addition & 0 deletions b/‎opteryx/compiled/structures/buffers.pxd‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎opteryx/compiled/structures/buffers.pyx‎
Lines changed: 1 addition & 0 deletions b/‎opteryx/compiled/structures/buffers.pyx‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎opteryx/operators/inner_join_node.py‎
Lines changed: 31 additions & 8 deletions b/‎opteryx/operators/inner_join_node.py‎
Lines changed: 31 additions & 8 deletions
@@ -195,3 +195,4 @@ opteryx/third_party/tktech/csimdjson.cpp
 opteryx/third_party/ulfjack/ryu.c
 opteryx/compiled/joins/joins.pyx
 pyiceberg_catalog.db
+opteryx/compiled/joins/joins.h
@@ -1,9 +1,9 @@
 # THIS FILE IS AUTOMATICALLY UPDATED DURING THE BUILD PROCESS
 # DO NOT EDIT THIS FILE DIRECTLY
 
-__build__ = 1722
+__build__ = 1724
 __author__ = "@joocer"
-__version__ = "0.26.0-beta.1722"
+__version__ = "0.26.0-beta.1724"
 
 # Store the version here so:
 # 1) we don't load dependencies by storing it in __init__.py
 
@@ -213,7 +213,7 @@ cpdef tuple build_filtered_rows_indices_and_column(object column, set valid_valu
         Py_ssize_t arr_offset = column.offset
         const int32_t* offsets32 = <const int32_t*><uintptr_t>(buffers[1].address)
         Py_ssize_t i, j, k = 0, start, end, str_len
-        Py_ssize_t allocated_size = row_count * 4
+        Py_ssize_t allocated_size = row_count * 4 if row_count > 0 else 4
 
         numpy.ndarray indices = numpy.empty(allocated_size, dtype=numpy.int64)
         int64_t[::1] indices_mv = indices
@@ -258,9 +258,15 @@ cpdef tuple build_filtered_rows_indices_and_column(object column, set valid_valu
             if value_bytes in valid_bytes:
                 if k >= allocated_size:
                     allocated_size *= 2
-                    indices = numpy.resize(indices, allocated_size)
+                    new_indices = numpy.empty(allocated_size, dtype=numpy.int64)
+                    new_indices[:k] = indices_mv[:k]
+                    indices = new_indices
                     indices_mv = indices
-                    flat_data = numpy.resize(flat_mv.base, allocated_size)
+
+                    new_flat = numpy.empty(allocated_size, dtype=object)
+                    new_flat[:k] = flat_data[:k]
+                    flat_data = new_flat
+                    flat_mv = flat_data
 
                 flat_mv[k] = value_bytes
                 indices_mv[k] = i
 
@@ -18,7 +18,7 @@ from opteryx.third_party.abseil.containers cimport FlatHashSet
 cpdef FlatHashSet filter_join_set(table, list columns=None, FlatHashSet seen_hashes=None):
     cdef:
         Py_ssize_t num_rows = table.num_rows
-        uint64_t[::1] row_hashes = numpy.zeros(num_rows, dtype=numpy.uint64)
+        uint64_t[::1] row_hashes = numpy.empty(num_rows, dtype=numpy.uint64)
         list columns_of_interest = columns if columns else table.column_names
         Py_ssize_t row_idx
 
@@ -37,7 +37,7 @@ cpdef semi_join(object relation, list join_columns, FlatHashSet seen_hashes):
     cdef:
         Py_ssize_t num_rows = relation.num_rows
         Py_ssize_t row_idx, count = 0
-        uint64_t[::1] row_hashes = numpy.zeros(num_rows, dtype=numpy.uint64)
+        uint64_t[::1] row_hashes = numpy.empty(num_rows, dtype=numpy.uint64)
         numpy.ndarray[int64_t, ndim=1] index_buffer = numpy.empty(num_rows, dtype=numpy.int64)
 
     compute_row_hashes(relation, join_columns, row_hashes)
@@ -53,7 +53,7 @@ cpdef anti_join(object relation, list join_columns, FlatHashSet seen_hashes):
     cdef:
         Py_ssize_t num_rows = relation.num_rows
         Py_ssize_t row_idx, count = 0
-        uint64_t[::1] row_hashes = numpy.zeros(num_rows, dtype=numpy.uint64)
+        uint64_t[::1] row_hashes = numpy.empty(num_rows, dtype=numpy.uint64)
         numpy.ndarray[int64_t, ndim=1] index_buffer = numpy.empty(num_rows, dtype=numpy.int64)
 
     compute_row_hashes(relation, join_columns, row_hashes)
 
@@ -11,47 +11,105 @@ cimport numpy
 numpy.import_array()
 
 from libc.stdint cimport int64_t, uint64_t
+from libc.stddef cimport size_t
+from libcpp.vector cimport vector
 
-from opteryx.third_party.abseil.containers cimport FlatHashMap
-from opteryx.compiled.structures.buffers cimport IntBuffer
+from time import perf_counter_ns
+
+from opteryx.third_party.abseil.containers cimport (
+    FlatHashMap,
+    IdentityHash,
+    flat_hash_map,
+)
+from opteryx.compiled.structures.buffers cimport CIntBuffer, IntBuffer
 from opteryx.compiled.table_ops.hash_ops cimport compute_row_hashes
 from opteryx.compiled.table_ops.null_avoidant_ops cimport non_null_row_indices
 
+cdef extern from "join_kernels.h":
+    void inner_join_probe(
+        flat_hash_map[uint64_t, vector[int64_t], IdentityHash]* left_map,
+        const int64_t* non_null_indices,
+        size_t non_null_count,
+        const uint64_t* row_hashes,
+        size_t row_hash_count,
+        CIntBuffer* left_out,
+        CIntBuffer* right_out
+    ) nogil
+
+cdef public long long last_hash_time_ns = 0
+cdef public long long last_probe_time_ns = 0
+cdef public long long last_materialize_time_ns = 0
+cdef public Py_ssize_t last_rows_hashed = 0
+cdef public Py_ssize_t last_candidate_rows = 0
+cdef public Py_ssize_t last_result_rows = 0
+
 
 cpdef tuple inner_join(object right_relation, list join_columns, FlatHashMap left_hash_table):
     """
     Perform an inner join between a right-hand relation and a pre-built left-side hash table.
     This function uses precomputed hashes and avoids null rows for optimal speed.
     """
+    global last_hash_time_ns, last_probe_time_ns, last_materialize_time_ns  # module-level metrics, overwritten per call
+    global last_rows_hashed, last_candidate_rows, last_result_rows
     cdef IntBuffer left_indexes = IntBuffer()
     cdef IntBuffer right_indexes = IntBuffer()
     cdef int64_t num_rows = right_relation.num_rows
     cdef int64_t[::1] non_null_indices = non_null_row_indices(right_relation, join_columns)
+    cdef Py_ssize_t candidate_count = non_null_indices.shape[0]
+
+    if candidate_count == 0 or num_rows == 0:  # nothing to probe; also avoids taking &view[0] on an empty view below
+        last_hash_time_ns = 0
+        last_probe_time_ns = 0
+        last_rows_hashed = 0  # compute_row_hashes is not called on this path, so no rows were hashed
+        last_candidate_rows = candidate_count
+        last_result_rows = 0
+        last_materialize_time_ns = 0
+        return numpy.empty(0, dtype=numpy.int64), numpy.empty(0, dtype=numpy.int64)
+
     cdef uint64_t[::1] row_hashes = numpy.empty(num_rows, dtype=numpy.uint64)
-    cdef int64_t i, row_idx
-    cdef uint64_t hash_val
-    cdef size_t match_count
-    cdef int j
+    cdef long long t_start = perf_counter_ns()
 
     # Precompute hashes for right relation
     compute_row_hashes(right_relation, join_columns, row_hashes)
+    cdef long long t_after_hash = perf_counter_ns()
+    last_hash_time_ns = t_after_hash - t_start
+
+    with nogil:  # the extern probe kernel is declared nogil and only receives raw pointers and sizes
+        inner_join_probe(
+            &left_hash_table._map,
+            &non_null_indices[0],
+            <size_t>candidate_count,
+            &row_hashes[0],
+            <size_t>num_rows,
+            left_indexes.c_buffer,
+            right_indexes.c_buffer,
+        )
+    cdef long long t_after_probe = perf_counter_ns()
+    last_probe_time_ns = t_after_probe - t_after_hash
+    last_rows_hashed = num_rows
+    last_candidate_rows = candidate_count
 
-    for i in range(non_null_indices.shape[0]):
-        row_idx = non_null_indices[i]
-        hash_val = row_hashes[row_idx]
+    # Return matched row indices from both sides
+    cdef long long t_before_numpy = perf_counter_ns()
+    cdef numpy.ndarray[int64_t, ndim=1] left_np = left_indexes.to_numpy()
+    cdef numpy.ndarray[int64_t, ndim=1] right_np = right_indexes.to_numpy()
+    cdef long long t_after_numpy = perf_counter_ns()
+    last_result_rows = left_np.shape[0]
+    last_materialize_time_ns = t_after_numpy - t_before_numpy  # covers only the two to_numpy() conversions
 
-        # Probe the left-side hash table
-        left_matches = left_hash_table.get(hash_val)
-        match_count = left_matches.size()
-        if match_count == 0:
-            continue
+    return left_np, right_np
 
-        for j in range(match_count):
-            left_indexes.append(left_matches[j])
-            right_indexes.append(row_idx)
 
-    # Return matched row indices from both sides
-    return left_indexes.to_numpy(), right_indexes.to_numpy()
+cpdef tuple get_last_inner_join_metrics():
+    """Return instrumentation captured during the most recent inner join call."""
+    return (  # module-level values; every inner_join call overwrites them
+        last_hash_time_ns,  # ns measured around compute_row_hashes
+        last_probe_time_ns,  # ns measured around inner_join_probe
+        last_rows_hashed,  # right-side row count recorded for hashing
+        last_candidate_rows,  # number of rows returned by non_null_row_indices
+        last_result_rows,  # length of the left index array returned by inner_join
+        last_materialize_time_ns,  # ns measured around the two to_numpy() calls
+    )
 
 
 cpdef FlatHashMap build_side_hash_map(object relation, list join_columns):
@@ -72,42 +130,3 @@ cpdef FlatHashMap build_side_hash_map(object relation, list join_columns):
         ht.insert(row_hashes[row_idx], row_idx)
 
     return ht
-
-
-cpdef tuple nested_loop_join(left_relation, right_relation, list left_columns, list right_columns):
-    """
-    A buffer-aware nested loop join using direct Arrow buffer access and hash computation.
-    Only intended for small relations (<1000 rows), primarily used for correctness testing or fallbacks.
-    """
-    # determine the rows we're going to try to join on
-    cdef int64_t[::1] left_non_null_indices = non_null_row_indices(left_relation, left_columns)
-    cdef int64_t[::1] right_non_null_indices = non_null_row_indices(right_relation, right_columns)
-
-    cdef int64_t nl = left_non_null_indices.shape[0]
-    cdef int64_t nr = right_non_null_indices.shape[0]
-    cdef IntBuffer left_indexes = IntBuffer()
-    cdef IntBuffer right_indexes = IntBuffer()
-    cdef int64_t left_non_null_idx, right_non_null_idx, left_record_idx, right_record_idx
-
-    cdef uint64_t[::1] left_hashes = numpy.empty(nl, dtype=numpy.uint64)
-    cdef uint64_t[::1] right_hashes = numpy.empty(nr, dtype=numpy.uint64)
-
-    # remove the rows from the relations
-    left_relation = left_relation.select(sorted(set(left_columns))).drop_null()
-    right_relation = right_relation.select(sorted(set(right_columns))).drop_null()
-
-    # build hashes for the columns we're joining on
-    compute_row_hashes(left_relation, left_columns, left_hashes)
-    compute_row_hashes(right_relation, right_columns, right_hashes)
-
-    # Compare each pair of rows (naive quadratic approach)
-    for left_non_null_idx in range(nl):
-        for right_non_null_idx in range(nr):
-            # if we have a match, look up the offset in the original table
-            if left_hashes[left_non_null_idx] == right_hashes[right_non_null_idx]:
-                left_record_idx = left_non_null_indices[left_non_null_idx]
-                right_record_idx = right_non_null_indices[right_non_null_idx]
-                left_indexes.append(left_record_idx)
-                right_indexes.append(right_record_idx)
-
-    return (left_indexes.to_numpy(), right_indexes.to_numpy())
 
@@ -0,0 +1,65 @@
+# cython: language_level=3
+# cython: nonecheck=False
+# cython: cdivision=True
+# cython: initializedcheck=False
+# cython: infer_types=True
+# cython: wraparound=False
+# cython: boundscheck=False
+
+import numpy
+cimport numpy
+numpy.import_array()
+
+from libc.stdint cimport int64_t, uint64_t
+
+from opteryx.compiled.structures.buffers cimport IntBuffer
+from opteryx.compiled.table_ops.hash_ops cimport compute_row_hashes
+from opteryx.compiled.table_ops.null_avoidant_ops cimport non_null_row_indices
+
+
+cpdef tuple nested_loop_join(left_relation, right_relation, list left_columns, list right_columns):
+    """
+    Perform a buffer-aware nested loop join using Arrow buffer hashing.
+
+    This implementation is optimized for small relations where building a hash map would be
+    more expensive than a quadratic scan.
+    """
+    cdef int64_t[::1] left_non_null_indices = non_null_row_indices(left_relation, left_columns)
+    cdef int64_t[::1] right_non_null_indices = non_null_row_indices(right_relation, right_columns)
+
+    cdef int64_t nl = left_non_null_indices.shape[0]
+    cdef int64_t nr = right_non_null_indices.shape[0]
+
+    if nl == 0 or nr == 0:  # one side has no candidate rows, so no pair can match
+        return numpy.empty(0, dtype=numpy.int64), numpy.empty(0, dtype=numpy.int64)
+
+    cdef IntBuffer left_indexes = IntBuffer()
+    cdef IntBuffer right_indexes = IntBuffer()
+    cdef uint64_t[::1] left_hashes = numpy.empty(left_relation.num_rows, dtype=numpy.uint64)  # full-relation size: indexed by original row position below
+    cdef uint64_t[::1] right_hashes = numpy.empty(right_relation.num_rows, dtype=numpy.uint64)
+    cdef int64_t i, j, left_row, right_row
+    cdef uint64_t left_hash, right_hash
+
+    compute_row_hashes(left_relation, left_columns, left_hashes)
+    compute_row_hashes(right_relation, right_columns, right_hashes)
+
+    if nl <= nr:  # the side with fewer candidate rows drives the outer loop
+        for i in range(nl):
+            left_row = left_non_null_indices[i]
+            left_hash = left_hashes[left_row]  # hoisted: read once per outer row
+            for j in range(nr):
+                right_row = right_non_null_indices[j]
+                if left_hash == right_hashes[right_row]:  # hash equality only; column values are not re-compared here
+                    left_indexes.append(left_row)
+                    right_indexes.append(right_row)
+    else:  # same comparison as above with the loops swapped, so pairs are emitted in right-major order
+        for j in range(nr):
+            right_row = right_non_null_indices[j]
+            right_hash = right_hashes[right_row]
+            for i in range(nl):
+                left_row = left_non_null_indices[i]
+                if right_hash == left_hashes[left_row]:
+                    left_indexes.append(left_row)
+                    right_indexes.append(right_row)
+
+    return left_indexes.to_numpy(), right_indexes.to_numpy()
@@ -20,6 +20,7 @@ cdef extern from "intbuffer.h" namespace "":
         void extend(const int64_t* values, size_t count)
         const int64_t* data() const
         size_t size() const
+        void append_repeated(int64_t value, size_t count)
 
 
 cdef class IntBuffer:
 
@@ -23,6 +23,7 @@ cdef extern from "intbuffer.h":
         void extend(const int64_t* data, size_t count) nogil
         const int64_t* data() nogil
         size_t size() nogil
+        void append_repeated(int64_t value, size_t count) nogil
 
 cdef class IntBuffer:
 
 
@@ -26,6 +26,7 @@
 
 from opteryx import EOS
 from opteryx.compiled.joins import build_side_hash_map
+from opteryx.compiled.joins import get_last_inner_join_metrics
 from opteryx.compiled.joins import inner_join
 from opteryx.compiled.structures.bloom_filter import create_bloom_filter
 from opteryx.models import QueryProperties
@@ -71,8 +72,9 @@ def execute(self, morsel: Table, join_leg: str) -> Table:
 
                     start = time.monotonic_ns()
                     self.left_hash = build_side_hash_map(self.left_relation, self.left_columns)
-                    self.statistics.time_inner_join_build_side_hash_map += (
-                        time.monotonic_ns() - start
+                    self.statistics.increase(
+                        "time_inner_join_build_side_hash_map",
+                        time.monotonic_ns() - start,
                     )
 
                     # If the left side is small enough to quickly build a bloom filter, do that.
@@ -83,8 +85,10 @@ def execute(self, morsel: Table, join_leg: str) -> Table:
                         self.left_filter = create_bloom_filter(
                             self.left_relation, self.left_columns
                         )
-                        self.statistics.time_build_bloom_filter += time.monotonic_ns() - start
-                        self.statistics.feature_bloom_filter += 1
+                        self.statistics.increase(
+                            "time_build_bloom_filter", time.monotonic_ns() - start
+                        )
+                        self.statistics.increase("feature_bloom_filter", 1)
                 else:
                     if self.left_buffer_columns is None:
                         self.left_buffer_columns = morsel.schema.names
@@ -106,7 +110,7 @@ def execute(self, morsel: Table, join_leg: str) -> Table:
                     maybe_in_left = self.left_filter.possibly_contains_many(
                         morsel, self.right_columns
                     )
-                    self.statistics.time_bloom_filtering += time.monotonic_ns() - start
+                    self.statistics.increase("time_bloom_filtering", time.monotonic_ns() - start)
                     morsel = morsel.filter(maybe_in_left)
 
                     # If the bloom filter is not effective, disable it.
@@ -115,13 +119,32 @@ def execute(self, morsel: Table, join_leg: str) -> Table:
                     eliminated_rows = len(maybe_in_left) - morsel.num_rows
                     if eliminated_rows < 0.05 * len(maybe_in_left):
                         self.left_filter = None
-                        self.statistics.feature_dynamically_disabled_bloom_filter += 1
+                        self.statistics.increase("feature_dynamically_disabled_bloom_filter", 1)
 
-                    self.statistics.rows_eliminated_by_bloom_filter += eliminated_rows
+                    self.statistics.increase("rows_eliminated_by_bloom_filter", eliminated_rows)
 
                 # do the join
                 left_indicies, right_indicies = inner_join(
                     morsel, self.right_columns, self.left_hash
                 )
 
-                yield align_tables(morsel, self.left_relation, right_indicies, left_indicies)
+                # record detailed timing and row counts for diagnostics
+                (
+                    hash_time,
+                    probe_time,
+                    rows_hashed,
+                    candidate_rows,
+                    matched_rows,
+                    materialize_time,
+                ) = get_last_inner_join_metrics()
+                self.statistics.increase("time_inner_join_hash", hash_time)
+                self.statistics.increase("time_inner_join_probe", probe_time)
+                self.statistics.increase("rows_inner_join_hashed", rows_hashed)
+                self.statistics.increase("rows_inner_join_candidates", candidate_rows)
+                self.statistics.increase("time_inner_join_indices", materialize_time)
+                self.statistics.increase("rows_inner_join_matched", matched_rows)
+                start = time.monotonic_ns()
+                aligned = align_tables(morsel, self.left_relation, right_indicies, left_indicies)
+                self.statistics.increase("time_inner_join_align", time.monotonic_ns() - start)
+
+                yield aligned