Skip to content

Commit ba66c2f

Browse files
Optimize generic InList static filtering
Introduces NestedTypeFilter for non-primitive constant IN lists. Replaces the legacy ArrayStaticFilter fallback with a HashTable-backed lookup and shared bitmap result construction.
1 parent 3aefba7 commit ba66c2f

3 files changed

Lines changed: 211 additions & 87 deletions

File tree

datafusion/physical-expr/src/expressions/in_list.rs

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -38,6 +38,7 @@ use datafusion_expr::{ColumnarValue, expr_vec_fmt};
3838

3939
mod array_static_filter;
4040
mod primitive_filter;
41+
mod result;
4142
mod static_filter;
4243
mod strategy;
4344

datafusion/physical-expr/src/expressions/in_list/array_static_filter.rs

Lines changed: 94 additions & 87 deletions
Original file line numberDiff line numberDiff line change
@@ -23,23 +23,109 @@ use arrow::buffer::{BooleanBuffer, NullBuffer};
2323
use arrow::compute::{SortOptions, take};
2424
use arrow::datatypes::DataType;
2525
use arrow::util::bit_iterator::BitIndexIterator;
26-
use datafusion_common::HashMap;
2726
use datafusion_common::Result;
28-
use datafusion_common::hash_utils::{RandomState, with_hashes};
29-
use hashbrown::hash_map::RawEntryMut;
27+
use datafusion_common::hash_utils::with_hashes;
3028

29+
use datafusion_common::hash_utils::RandomState;
30+
use hashbrown::HashTable;
31+
32+
use super::result::build_in_list_result;
3133
use super::static_filter::StaticFilter;
3234

3335
/// Static filter for InList that stores the array and hash set for O(1) lookups
3436
#[derive(Debug, Clone)]
3537
pub(super) struct ArrayStaticFilter {
3638
in_array: ArrayRef,
3739
state: RandomState,
38-
/// Used to provide a lookup from value to in list index
40+
/// Stores indices into `in_array` for O(1) lookups.
41+
table: HashTable<usize>,
42+
}
43+
44+
impl ArrayStaticFilter {
45+
/// Creates a filter using dynamic comparison for array values.
46+
pub(super) fn try_new(in_array: ArrayRef) -> Result<Self> {
47+
// Null type has no natural order - return empty hash set
48+
if in_array.data_type() == &DataType::Null {
49+
return Ok(Self {
50+
in_array,
51+
state: RandomState::default(),
52+
table: HashTable::new(),
53+
});
54+
}
55+
56+
let state = RandomState::default();
57+
let table = Self::build_haystack_table(&in_array, &state)?;
58+
59+
Ok(Self {
60+
in_array,
61+
state,
62+
table,
63+
})
64+
}
65+
66+
/// Build a hash table from haystack values for O(1) lookups.
3967
///
40-
/// Note: usize::hash is not used, instead the raw entry
41-
/// API is used to store entries w.r.t their value
42-
map: HashMap<usize, (), ()>,
68+
/// Each unique non-null value's index is stored, keyed by its hash.
69+
/// Uses dynamic comparison via `make_comparator` for complex types.
70+
fn build_haystack_table(
71+
haystack: &ArrayRef,
72+
state: &RandomState,
73+
) -> Result<HashTable<usize>> {
74+
let mut table = HashTable::new();
75+
76+
with_hashes([haystack.as_ref()], state, |hashes| -> Result<()> {
77+
let cmp = make_comparator(haystack, haystack, SortOptions::default())?;
78+
79+
let insert_value = |idx| {
80+
let hash = hashes[idx];
81+
// Only insert if not already present (deduplication)
82+
if table.find(hash, |&x| cmp(x, idx).is_eq()).is_none() {
83+
table.insert_unique(hash, idx, |&x| hashes[x]);
84+
}
85+
};
86+
87+
match haystack.nulls() {
88+
Some(nulls) => {
89+
BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
90+
.for_each(insert_value)
91+
}
92+
None => (0..haystack.len()).for_each(insert_value),
93+
}
94+
95+
Ok(())
96+
})?;
97+
98+
Ok(table)
99+
}
100+
101+
/// Check which needle values exist in the haystack.
102+
///
103+
/// Hashes each needle value and looks it up in the pre-built haystack table.
104+
/// Uses dynamic comparison via `make_comparator` for complex types.
105+
fn find_needles_in_haystack(
106+
&self,
107+
needles: &dyn Array,
108+
negated: bool,
109+
) -> Result<BooleanArray> {
110+
let needle_nulls = needles.logical_nulls();
111+
let haystack_has_nulls = self.in_array.null_count() != 0;
112+
113+
with_hashes([needles], &self.state, |needle_hashes| {
114+
let cmp = make_comparator(needles, &self.in_array, SortOptions::default())?;
115+
116+
Ok(build_in_list_result(
117+
needles.len(),
118+
needle_nulls.as_ref(),
119+
haystack_has_nulls,
120+
negated,
121+
#[inline(always)]
122+
|i| {
123+
let hash = needle_hashes[i];
124+
self.table.find(hash, |&idx| cmp(i, idx).is_eq()).is_some()
125+
},
126+
))
127+
})
128+
}
43129
}
44130

45131
impl StaticFilter for ArrayStaticFilter {
@@ -76,85 +162,6 @@ impl StaticFilter for ArrayStaticFilter {
76162
_ => {}
77163
}
78164

79-
let needle_nulls = v.logical_nulls();
80-
let needle_nulls = needle_nulls.as_ref();
81-
let haystack_has_nulls = self.in_array.null_count() != 0;
82-
83-
with_hashes([v], &self.state, |hashes| {
84-
let cmp = make_comparator(v, &self.in_array, SortOptions::default())?;
85-
Ok((0..v.len())
86-
.map(|i| {
87-
// SQL three-valued logic: null IN (...) is always null
88-
if needle_nulls.is_some_and(|nulls| nulls.is_null(i)) {
89-
return None;
90-
}
91-
92-
let hash = hashes[i];
93-
let contains = self
94-
.map
95-
.raw_entry()
96-
.from_hash(hash, |idx| cmp(i, *idx).is_eq())
97-
.is_some();
98-
99-
match contains {
100-
true => Some(!negated),
101-
false if haystack_has_nulls => None,
102-
false => Some(negated),
103-
}
104-
})
105-
.collect())
106-
})
107-
}
108-
}
109-
110-
impl ArrayStaticFilter {
111-
/// Computes a [`StaticFilter`] for the provided [`Array`] if there
112-
/// are nulls present or there are more than the configured number of
113-
/// elements.
114-
///
115-
/// Note: This is split into a separate function as higher-rank trait bounds currently
116-
/// cause type inference to misbehave
117-
pub(super) fn try_new(in_array: ArrayRef) -> Result<ArrayStaticFilter> {
118-
// Null type has no natural order - return empty hash set
119-
if in_array.data_type() == &DataType::Null {
120-
return Ok(ArrayStaticFilter {
121-
in_array,
122-
state: RandomState::default(),
123-
map: HashMap::with_hasher(()),
124-
});
125-
}
126-
127-
let state = RandomState::default();
128-
let mut map: HashMap<usize, (), ()> = HashMap::with_hasher(());
129-
130-
with_hashes([&in_array], &state, |hashes| -> Result<()> {
131-
let cmp = make_comparator(&in_array, &in_array, SortOptions::default())?;
132-
133-
let insert_value = |idx| {
134-
let hash = hashes[idx];
135-
if let RawEntryMut::Vacant(v) = map
136-
.raw_entry_mut()
137-
.from_hash(hash, |x| cmp(*x, idx).is_eq())
138-
{
139-
v.insert_with_hasher(hash, idx, (), |x| hashes[*x]);
140-
}
141-
};
142-
143-
match in_array.nulls() {
144-
Some(nulls) => {
145-
BitIndexIterator::new(nulls.validity(), nulls.offset(), nulls.len())
146-
.for_each(insert_value)
147-
}
148-
None => (0..in_array.len()).for_each(insert_value),
149-
}
150-
151-
Ok(())
152-
})?;
153-
154-
Ok(Self {
155-
in_array,
156-
state,
157-
map,
158-
})
165+
self.find_needles_in_haystack(v, negated)
159166
}
160167
}
Lines changed: 116 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,116 @@
1+
// Licensed to the Apache Software Foundation (ASF) under one
2+
// or more contributor license agreements. See the NOTICE file
3+
// distributed with this work for additional information
4+
// regarding copyright ownership. The ASF licenses this file
5+
// to you under the Apache License, Version 2.0 (the
6+
// "License"); you may not use this file except in compliance
7+
// with the License. You may obtain a copy of the License at
8+
//
9+
// http://www.apache.org/licenses/LICENSE-2.0
10+
//
11+
// Unless required by applicable law or agreed to in writing,
12+
// software distributed under the License is distributed on an
13+
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14+
// KIND, either express or implied. See the License for the
15+
// specific language governing permissions and limitations
16+
// under the License.
17+
18+
//! Result building helpers for InList operations
19+
//!
20+
//! This module provides unified logic for building BooleanArray results
21+
//! from IN list membership tests, handling null propagation correctly
22+
//! according to SQL three-valued logic.
23+
24+
use arrow::array::BooleanArray;
25+
use arrow::buffer::{BooleanBuffer, NullBuffer};
26+
27+
// =============================================================================
28+
// RESULT BUILDER FOR IN LIST OPERATIONS
29+
// =============================================================================
30+
//
31+
// Truth table for (needle_nulls, haystack_has_nulls, negated):
32+
// (Some, true, false) → values: valid & contains, nulls: valid & contains
33+
// (None, true, false) → values: contains, nulls: contains
34+
// (Some, true, true) → values: valid ^ (valid & contains), nulls: valid & contains
35+
// (None, true, true) → values: !contains, nulls: contains
36+
// (Some, false, false) → values: valid & contains, nulls: valid
37+
// (Some, false, true) → values: valid & !contains, nulls: valid
38+
// (None, false, false) → values: contains, nulls: none
39+
// (None, false, true) → values: !contains, nulls: none
40+
41+
/// Builds a BooleanArray result for IN list operations (optimized for cheap contains).
42+
///
43+
/// This function handles the complex null propagation logic for SQL IN lists:
44+
/// - If the needle value is null, the result is null
45+
/// - If the needle is not in the set AND the haystack has nulls, the result is null
46+
/// - Otherwise, the result is true/false based on membership and negation
47+
///
48+
/// This version computes contains for ALL positions (including nulls), then applies
49+
/// null masking via bitmap operations. This is optimal for cheap contains checks
50+
/// (like DirectProbeFilter) where the branch overhead exceeds the check cost.
51+
#[inline]
52+
pub(crate) fn build_in_list_result<C>(
53+
len: usize,
54+
needle_nulls: Option<&NullBuffer>,
55+
haystack_has_nulls: bool,
56+
negated: bool,
57+
contains: C,
58+
) -> BooleanArray
59+
where
60+
C: FnMut(usize) -> bool,
61+
{
62+
// Always compute the contains buffer without checking nulls in the loop.
63+
// The null check inside the loop hurts vectorization and branch prediction.
64+
// Nulls are handled by build_result_from_contains using bitmap operations.
65+
let contains_buf = BooleanBuffer::collect_bool(len, contains);
66+
build_result_from_contains(needle_nulls, haystack_has_nulls, negated, contains_buf)
67+
}
68+
69+
/// Builds a BooleanArray result from a pre-computed contains buffer.
70+
///
71+
/// This version does NOT assume contains_buf is pre-masked at null positions.
72+
/// It handles nulls using bitmap operations which are more vectorization-friendly.
73+
#[inline]
74+
pub(crate) fn build_result_from_contains(
75+
needle_nulls: Option<&NullBuffer>,
76+
haystack_has_nulls: bool,
77+
negated: bool,
78+
contains_buf: BooleanBuffer,
79+
) -> BooleanArray {
80+
match (needle_nulls, haystack_has_nulls, negated) {
81+
// Haystack has nulls: result is null unless value is found
82+
(Some(v), true, false) => {
83+
// values: valid & contains, nulls: valid & contains
84+
// Result is valid (not null) only when needle is valid AND found in haystack
85+
let values = v.inner() & &contains_buf;
86+
BooleanArray::new(values.clone(), Some(NullBuffer::new(values)))
87+
}
88+
(None, true, false) => {
89+
BooleanArray::new(contains_buf.clone(), Some(NullBuffer::new(contains_buf)))
90+
}
91+
(Some(v), true, true) => {
92+
// NOT IN with nulls: true if valid and not found, null if found or needle null
93+
// values: valid & !contains, nulls: valid & contains
94+
// Result is valid only when needle is valid AND found (because NOT IN with
95+
// haystack nulls returns NULL when value isn't definitively excluded)
96+
let valid = v.inner();
97+
let values = valid & &(!&contains_buf);
98+
let nulls = valid & &contains_buf;
99+
BooleanArray::new(values, Some(NullBuffer::new(nulls)))
100+
}
101+
(None, true, true) => {
102+
BooleanArray::new(!&contains_buf, Some(NullBuffer::new(contains_buf)))
103+
}
104+
// Haystack has no nulls: result validity follows needle validity
105+
(Some(v), false, false) => {
106+
// values: valid & contains (mask out nulls), nulls: valid
107+
BooleanArray::new(v.inner() & &contains_buf, Some(v.clone()))
108+
}
109+
(Some(v), false, true) => {
110+
// values: valid & !contains, nulls: valid
111+
BooleanArray::new(v.inner() & &(!&contains_buf), Some(v.clone()))
112+
}
113+
(None, false, false) => BooleanArray::new(contains_buf, None),
114+
(None, false, true) => BooleanArray::new(!&contains_buf, None),
115+
}
116+
}

0 commit comments

Comments
 (0)