@@ -23,23 +23,109 @@ use arrow::buffer::{BooleanBuffer, NullBuffer};
2323use arrow:: compute:: { SortOptions , take} ;
2424use arrow:: datatypes:: DataType ;
2525use arrow:: util:: bit_iterator:: BitIndexIterator ;
26- use datafusion_common:: HashMap ;
2726use datafusion_common:: Result ;
28- use datafusion_common:: hash_utils:: { RandomState , with_hashes} ;
29- use hashbrown:: hash_map:: RawEntryMut ;
27+ use datafusion_common:: hash_utils:: with_hashes;
3028
29+ use datafusion_common:: hash_utils:: RandomState ;
30+ use hashbrown:: HashTable ;
31+
32+ use super :: result:: build_in_list_result;
3133use super :: static_filter:: StaticFilter ;
3234
3335/// Static filter for InList that stores the haystack array and a hash table of indices into it for O(1) lookups
3436#[ derive( Debug , Clone ) ]
3537pub ( super ) struct ArrayStaticFilter {
3638 in_array : ArrayRef ,
3739 state : RandomState ,
38- /// Used to provide a lookup from value to in list index
40+ /// Stores indices into `in_array`, each keyed by the hash of the value it points at (hashed with `state`), for O(1) lookups.
41+ table : HashTable < usize > ,
42+ }
43+
44+ impl ArrayStaticFilter {
45+ /// Creates a filter over `in_array`: hashes its values into a lookup table of indices, or keeps the table empty when the array's type is `DataType::Null`. Errors from building the table are propagated.
46+ pub ( super ) fn try_new ( in_array : ArrayRef ) -> Result < Self > {
47+ // Null type has no natural order - skip hashing and return an empty hash table
48+ if in_array. data_type ( ) == & DataType :: Null {
49+ return Ok ( Self {
50+ in_array,
51+ state : RandomState :: default ( ) ,
52+ table : HashTable :: new ( ) ,
53+ } ) ;
54+ }
55+
56+ let state = RandomState :: default ( ) ;
57+ let table = Self :: build_haystack_table ( & in_array, & state) ?;
58+
59+ Ok ( Self {
60+ in_array,
61+ state,
62+ table,
63+ } )
64+ }
65+
66+ /// Build a hash table from haystack values for O(1) lookups.
3967 ///
40- /// Note: usize::hash is not used, instead the raw entry
41- /// API is used to store entries w.r.t their value
42- map : HashMap < usize , ( ) , ( ) > ,
68+ /// Each unique non-null value's index is stored, keyed by its hash; when a value repeats, only the first index encountered is kept.
69+ /// Uses dynamic comparison via `make_comparator` (haystack against itself) for complex types; a comparator construction error is propagated to the caller.
70+ fn build_haystack_table (
71+ haystack : & ArrayRef ,
72+ state : & RandomState ,
73+ ) -> Result < HashTable < usize > > {
74+ let mut table = HashTable :: new ( ) ;
75+
76+ with_hashes ( [ haystack. as_ref ( ) ] , state, |hashes| -> Result < ( ) > {
77+ let cmp = make_comparator ( haystack, haystack, SortOptions :: default ( ) ) ?;
78+
79+ let insert_value = |idx| {
80+ let hash = hashes[ idx] ;
81+ // Only insert if no equal value is already stored under this hash (deduplication)
82+ if table. find ( hash, |& x| cmp ( x, idx) . is_eq ( ) ) . is_none ( ) {
83+ table. insert_unique ( hash, idx, |& x| hashes[ x] ) ;
84+ }
85+ } ;
86+
87+ match haystack. nulls ( ) {
88+ Some ( nulls) => {
89+ BitIndexIterator :: new ( nulls. validity ( ) , nulls. offset ( ) , nulls. len ( ) )
90+ . for_each ( insert_value)
91+ }
92+ None => ( 0 ..haystack. len ( ) ) . for_each ( insert_value) ,
93+ }
94+
95+ Ok ( ( ) )
96+ } ) ?;
97+
98+ Ok ( table)
99+ }
100+
101+ /// Check which needle values exist in the haystack.
102+ ///
103+ /// Hashes each needle value with the filter's own `state` and looks it up in the pre-built haystack table.
104+ /// Uses dynamic comparison via `make_comparator` (needles against `in_array`) for complex types; needle nulls, haystack null presence and `negated` are forwarded to `build_in_list_result`, which assembles the returned array.
105+ fn find_needles_in_haystack (
106+ & self ,
107+ needles : & dyn Array ,
108+ negated : bool ,
109+ ) -> Result < BooleanArray > {
110+ let needle_nulls = needles. logical_nulls ( ) ;
111+ let haystack_has_nulls = self . in_array . null_count ( ) != 0 ;
112+
113+ with_hashes ( [ needles] , & self . state , |needle_hashes| {
114+ let cmp = make_comparator ( needles, & self . in_array , SortOptions :: default ( ) ) ?;
115+
116+ Ok ( build_in_list_result (
117+ needles. len ( ) ,
118+ needle_nulls. as_ref ( ) ,
119+ haystack_has_nulls,
120+ negated,
121+ #[ inline( always) ]
122+ |i| {
123+ let hash = needle_hashes[ i] ;
124+ self . table . find ( hash, |& idx| cmp ( i, idx) . is_eq ( ) ) . is_some ( )
125+ } ,
126+ ) )
127+ } )
128+ }
43129}
44130
45131impl StaticFilter for ArrayStaticFilter {
@@ -76,85 +162,6 @@ impl StaticFilter for ArrayStaticFilter {
76162 _ => { }
77163 }
78164
79- let needle_nulls = v. logical_nulls ( ) ;
80- let needle_nulls = needle_nulls. as_ref ( ) ;
81- let haystack_has_nulls = self . in_array . null_count ( ) != 0 ;
82-
83- with_hashes ( [ v] , & self . state , |hashes| {
84- let cmp = make_comparator ( v, & self . in_array , SortOptions :: default ( ) ) ?;
85- Ok ( ( 0 ..v. len ( ) )
86- . map ( |i| {
87- // SQL three-valued logic: null IN (...) is always null
88- if needle_nulls. is_some_and ( |nulls| nulls. is_null ( i) ) {
89- return None ;
90- }
91-
92- let hash = hashes[ i] ;
93- let contains = self
94- . map
95- . raw_entry ( )
96- . from_hash ( hash, |idx| cmp ( i, * idx) . is_eq ( ) )
97- . is_some ( ) ;
98-
99- match contains {
100- true => Some ( !negated) ,
101- false if haystack_has_nulls => None ,
102- false => Some ( negated) ,
103- }
104- } )
105- . collect ( ) )
106- } )
107- }
108- }
109-
110- impl ArrayStaticFilter {
111- /// Computes a [`StaticFilter`] for the provided [`Array`] if there
112- /// are nulls present or there are more than the configured number of
113- /// elements.
114- ///
115- /// Note: This is split into a separate function as higher-rank trait bounds currently
116- /// cause type inference to misbehave
117- pub ( super ) fn try_new ( in_array : ArrayRef ) -> Result < ArrayStaticFilter > {
118- // Null type has no natural order - return empty hash set
119- if in_array. data_type ( ) == & DataType :: Null {
120- return Ok ( ArrayStaticFilter {
121- in_array,
122- state : RandomState :: default ( ) ,
123- map : HashMap :: with_hasher ( ( ) ) ,
124- } ) ;
125- }
126-
127- let state = RandomState :: default ( ) ;
128- let mut map: HashMap < usize , ( ) , ( ) > = HashMap :: with_hasher ( ( ) ) ;
129-
130- with_hashes ( [ & in_array] , & state, |hashes| -> Result < ( ) > {
131- let cmp = make_comparator ( & in_array, & in_array, SortOptions :: default ( ) ) ?;
132-
133- let insert_value = |idx| {
134- let hash = hashes[ idx] ;
135- if let RawEntryMut :: Vacant ( v) = map
136- . raw_entry_mut ( )
137- . from_hash ( hash, |x| cmp ( * x, idx) . is_eq ( ) )
138- {
139- v. insert_with_hasher ( hash, idx, ( ) , |x| hashes[ * x] ) ;
140- }
141- } ;
142-
143- match in_array. nulls ( ) {
144- Some ( nulls) => {
145- BitIndexIterator :: new ( nulls. validity ( ) , nulls. offset ( ) , nulls. len ( ) )
146- . for_each ( insert_value)
147- }
148- None => ( 0 ..in_array. len ( ) ) . for_each ( insert_value) ,
149- }
150-
151- Ok ( ( ) )
152- } ) ?;
153-
154- Ok ( Self {
155- in_array,
156- state,
157- map,
158- } )
165+ self . find_needles_in_haystack ( v, negated)
159166 }
160167}
0 commit comments