Skip to content

Commit ef75ab8

Browse files
SOLR-18195: CombinedQueryComponent supports Collapse now (#4277)
Support for using {!collapse} with CombinedQueryComponent (RRF)
1 parent a1df852 commit ef75ab8

8 files changed

Lines changed: 542 additions & 17 deletions

File tree

Lines changed: 9 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,9 @@
1+
# See https://github.com/apache/solr/blob/main/dev-docs/changelog.adoc
2+
title: Support for using {!collapse} with CombinedQueryComponent (RRF)
3+
type: added
4+
authors:
5+
- name: Sonu Sharma
6+
nick: ercsonusharma
7+
links:
8+
- name: SOLR-18195
9+
url: https://issues.apache.org/jira/browse/SOLR-18195

solr/core/src/java/org/apache/solr/handler/component/CombinedQueryComponent.java

Lines changed: 48 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -31,6 +31,7 @@
3131
import java.util.Set;
3232
import java.util.stream.Collectors;
3333
import org.apache.lucene.search.Explanation;
34+
import org.apache.lucene.search.Query;
3435
import org.apache.lucene.search.Sort;
3536
import org.apache.lucene.search.SortField;
3637
import org.apache.solr.client.solrj.SolrServerException;
@@ -42,6 +43,7 @@
4243
import org.apache.solr.common.params.GroupParams;
4344
import org.apache.solr.common.params.ShardParams;
4445
import org.apache.solr.common.params.SolrParams;
46+
import org.apache.solr.common.util.CollectionUtil;
4547
import org.apache.solr.common.util.NamedList;
4648
import org.apache.solr.common.util.SimpleOrderedMap;
4749
import org.apache.solr.common.util.StrUtils;
@@ -53,8 +55,10 @@
5355
import org.apache.solr.response.SolrQueryResponse;
5456
import org.apache.solr.schema.IndexSchema;
5557
import org.apache.solr.schema.SchemaField;
58+
import org.apache.solr.search.CollapsingQParserPlugin;
5659
import org.apache.solr.search.DocListAndSet;
5760
import org.apache.solr.search.QueryResult;
61+
import org.apache.solr.search.SolrIndexSearcher;
5862
import org.apache.solr.search.SolrReturnFields;
5963
import org.apache.solr.search.SortSpec;
6064
import org.apache.solr.util.SolrResponseUtil;
@@ -236,7 +240,10 @@ private void prepareCombinedResponseBuilder(
236240
boolean partialResults,
237241
boolean segmentTerminatedEarly,
238242
Boolean setMaxHitsTerminatedEarly) {
239-
QueryResult combinedQueryResult = QueryAndResponseCombiner.simpleCombine(queryResults);
243+
SolrIndexSearcher searcher = crb.req.getSearcher();
244+
List<Query> collapseFilters = getCollapseFilters(crb.getFilters());
245+
QueryResult combinedQueryResult =
246+
QueryAndResponseCombiner.simpleCombine(queryResults, collapseFilters, searcher);
240247
combinedQueryResult.setPartialResults(partialResults);
241248
combinedQueryResult.setSegmentTerminatedEarly(segmentTerminatedEarly);
242249
combinedQueryResult.setMaxHitsTerminatedEarly(setMaxHitsTerminatedEarly);
@@ -258,6 +265,21 @@ private void prepareCombinedResponseBuilder(
258265
}
259266
}
260267

268+
/** Extracts the list of CollapsingPostFilter query from the filter list, if present. */
269+
private static List<Query> getCollapseFilters(List<Query> filters) {
270+
if (CollectionUtil.isNotEmpty(filters)) {
271+
return filters.stream()
272+
.filter(q -> q instanceof CollapsingQParserPlugin.CollapsingPostFilter)
273+
.toList();
274+
}
275+
return List.of();
276+
}
277+
278+
/**
279+
* Each shard response contains both "response" and "response_per_query". Only "response" is
280+
* deduplicated across sub-queries, so any processing of "response_per_query" must exclude docs
281+
* not present in "response" to avoid reintroducing docs that were eliminated while deduplication.
282+
*/
261283
@Override
262284
protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
263285
SortSpec ss = rb.getSortSpec();
@@ -296,6 +318,10 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
296318
long approximateTotalHits = 0;
297319
Map<String, List<ShardDoc>> shardDocMap = new HashMap<>();
298320
String[] queriesToCombineKeys = rb.req.getParams().getParams(CombinerParams.COMBINER_QUERY);
321+
// Build per-shard set of doc IDs from the shard's combined (deduplicated) response.
322+
// Used to filter per-query docs so that RRF doesn't reintroduce docs
323+
// excluded by collapse at the shard level.
324+
Map<String, Set<Object>> combinedDocIdsPerShard = HashMap.newHashMap(sreq.responses.size());
299325
// TODO: to be parallelized outer loop
300326
for (int queryIndex = 0; queryIndex < queriesToCombineKeys.length; queryIndex++) {
301327
int failedShardCount = 0;
@@ -377,9 +403,15 @@ protected void mergeIds(ResponseBuilder rb, ShardRequest sreq) {
377403
: new NamedList<>();
378404
// go through every doc in this response, construct a ShardDoc, and
379405
// put it in the uniqueDoc to dedup
406+
Set<Object> thisShardCombinedIds =
407+
combinedDocIdsPerShard.computeIfAbsent(
408+
srsp.getShard(), shard -> extractIdsFromCombinedResponse(rb, srsp, uniqueKeyField));
380409
for (int i = 0; i < docs.size(); i++) {
381410
SolrDocument doc = docs.get(i);
382411
Object id = doc.getFieldValue(uniqueKeyField.getName());
412+
if (!thisShardCombinedIds.contains(id)) {
413+
continue;
414+
}
383415
ShardDoc shardDoc = new ShardDoc();
384416
shardDoc.id = id;
385417
shardDoc.orderInShard = i;
@@ -610,4 +642,19 @@ protected boolean lessThan(ShardDoc docA, ShardDoc docB) {
610642
for (int i = 0; i < resultSize; i++) responseDocs.add(null);
611643
return resultIds;
612644
}
645+
646+
/**
647+
* Extracts the set of doc IDs from the shard's combined response (produced by simpleCombine).
648+
* Returns an empty set if the combined response is not available.
649+
*/
650+
private static Set<Object> extractIdsFromCombinedResponse(
651+
ResponseBuilder rb, ShardResponse srsp, SchemaField uniqueKeyField) {
652+
Object response = SolrResponseUtil.getSubsectionFromShardResponse(rb, srsp, "response", false);
653+
if (response instanceof SolrDocumentList docList) {
654+
return docList.stream()
655+
.map(doc -> doc.getFieldValue(uniqueKeyField.getName()))
656+
.collect(Collectors.toSet());
657+
}
658+
return Set.of();
659+
}
613660
}

solr/core/src/java/org/apache/solr/handler/component/combine/QueryAndResponseCombiner.java

Lines changed: 150 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -16,19 +16,33 @@
1616
*/
1717
package org.apache.solr.handler.component.combine;
1818

19+
import java.io.IOException;
1920
import java.util.HashMap;
21+
import java.util.HashSet;
2022
import java.util.List;
2123
import java.util.Map;
24+
import java.util.Set;
25+
import org.apache.lucene.index.LeafReaderContext;
26+
import org.apache.lucene.internal.hppc.IntDoubleHashMap;
27+
import org.apache.lucene.queries.function.FunctionScoreQuery;
28+
import org.apache.lucene.search.DoubleValues;
29+
import org.apache.lucene.search.DoubleValuesSource;
2230
import org.apache.lucene.search.Explanation;
31+
import org.apache.lucene.search.IndexSearcher;
32+
import org.apache.lucene.search.Query;
2333
import org.apache.lucene.search.TotalHits;
2434
import org.apache.solr.common.SolrException;
2535
import org.apache.solr.common.params.SolrParams;
36+
import org.apache.solr.common.util.CollectionUtil;
2637
import org.apache.solr.common.util.SimpleOrderedMap;
2738
import org.apache.solr.handler.component.ShardDoc;
2839
import org.apache.solr.search.DocIterator;
2940
import org.apache.solr.search.DocSet;
3041
import org.apache.solr.search.DocSlice;
42+
import org.apache.solr.search.QueryCommand;
3143
import org.apache.solr.search.QueryResult;
44+
import org.apache.solr.search.SolrIndexSearcher;
45+
import org.apache.solr.search.SortedIntDocSet;
3246
import org.apache.solr.util.plugin.NamedListInitializedPlugin;
3347

3448
/**
@@ -49,12 +63,19 @@ public abstract List<ShardDoc> combine(
4963
Map<String, List<ShardDoc>> queriesDocMap, SolrParams solrParams);
5064

5165
/**
52-
* Simple combine query result list as a union.
66+
* Combine query result list as a union, optionally deduplicating by a collapse field. When a
67+
* collapse filter is provided, only one document per unique field value is kept (based on the
68+
* collapse sort/score selection). This ensures that collapse semantics are preserved across
69+
* combined queries.
5370
*
5471
* @param queryResults the query results to be combined
72+
* @param collapseFilters the collapse post filters, or empty if no collapse dedup is needed
73+
* @param searcher the searcher to read field values from, required when collapseFilters is
74+
* non-empty
5575
* @return the combined query result
5676
*/
57-
public static QueryResult simpleCombine(List<QueryResult> queryResults) {
77+
public static QueryResult simpleCombine(
78+
List<QueryResult> queryResults, List<Query> collapseFilters, SolrIndexSearcher searcher) {
5879
QueryResult combinedQueryResults = new QueryResult();
5980
DocSet combinedDocSet = null;
6081
Map<Integer, Float> uniqueDocIds = new HashMap<>();
@@ -71,6 +92,19 @@ public static QueryResult simpleCombine(List<QueryResult> queryResults) {
7192
combinedDocSet = combinedDocSet.union(queryResult.getDocSet());
7293
}
7394
}
95+
96+
// If collapse fields are specified, deduplicate by field value across combined queries.
97+
// Each sub-query already collapsed individually, but different sub-queries may have
98+
// selected different group heads for the same field value.
99+
int removedByCollapse = 0;
100+
if (CollectionUtil.isNotEmpty(collapseFilters) && searcher != null && queryResults.size() > 1) {
101+
int preCollapseSize = uniqueDocIds.size();
102+
combinedDocSet =
103+
removeCollapsedDuplicatesViaSearcher(
104+
collapseFilters, searcher, uniqueDocIds, combinedDocSet);
105+
removedByCollapse = preCollapseSize - uniqueDocIds.size();
106+
}
107+
74108
int combinedResultsLength = uniqueDocIds.size();
75109
int[] combinedResultsDocIds = new int[combinedResultsLength];
76110
float[] combinedResultScores = new float[combinedResultsLength];
@@ -87,14 +121,64 @@ public static QueryResult simpleCombine(List<QueryResult> queryResults) {
87121
combinedResultsLength,
88122
combinedResultsDocIds,
89123
combinedResultScores,
90-
Math.max(combinedResultsLength, totalMatches),
124+
Math.max(combinedResultsLength, totalMatches - removedByCollapse),
91125
combinedResultScores.length > 0 ? combinedResultScores[0] : 0,
92126
TotalHits.Relation.GREATER_THAN_OR_EQUAL_TO);
93127
combinedQueryResults.setDocList(combinedResultSlice);
94128
combinedQueryResults.setDocSet(combinedDocSet);
95129
return combinedQueryResults;
96130
}
97131

132+
/**
133+
* Removes collapsed duplicates across combined sub-queries. Ensures that only one document per
134+
* collapse field value retained across the merged results. Entries removed by collapsing are also
135+
* removed from {@code uniqueDocIds} (mutated in place).
136+
*
137+
* @return the collapsed combined DocSet, or null if combinedDocSet was null
138+
*/
139+
private static DocSet removeCollapsedDuplicatesViaSearcher(
140+
List<Query> collapseFilters,
141+
SolrIndexSearcher searcher,
142+
Map<Integer, Float> uniqueDocIds,
143+
DocSet combinedDocSet) {
144+
IntDoubleHashMap scoreMap = new IntDoubleHashMap(uniqueDocIds.size());
145+
uniqueDocIds.forEach((doc, score) -> scoreMap.put(doc, score.doubleValue()));
146+
Query baseQuery;
147+
boolean needDocSet;
148+
if (combinedDocSet != null) {
149+
baseQuery = combinedDocSet.makeQuery();
150+
needDocSet = true;
151+
} else {
152+
int[] queryDocIds =
153+
uniqueDocIds.keySet().stream().mapToInt(Integer::intValue).sorted().toArray();
154+
baseQuery = new SortedIntDocSet(queryDocIds).makeQuery();
155+
needDocSet = false;
156+
}
157+
Query scoredQuery =
158+
FunctionScoreQuery.boostByValue(baseQuery, new PrecomputedScoreValuesSource(scoreMap));
159+
160+
try {
161+
QueryCommand cmd =
162+
new QueryCommand()
163+
.setQuery(scoredQuery)
164+
.setFilterList(collapseFilters)
165+
.setLen(uniqueDocIds.size())
166+
.setNeedDocSet(needDocSet);
167+
QueryResult result = searcher.search(cmd);
168+
169+
Set<Integer> retainedDocIds = HashSet.newHashSet(result.getDocList().size());
170+
DocIterator iter = result.getDocList().iterator();
171+
while (iter.hasNext()) {
172+
retainedDocIds.add(iter.nextDoc());
173+
}
174+
175+
uniqueDocIds.keySet().retainAll(retainedDocIds);
176+
return needDocSet ? result.getDocSet() : null;
177+
} catch (IOException e) {
178+
throw new SolrException(SolrException.ErrorCode.SERVER_ERROR, e);
179+
}
180+
}
181+
98182
/**
99183
* Retrieves a list of explanations for the given queries and results.
100184
*
@@ -127,4 +211,67 @@ public static QueryAndResponseCombiner getImplementation(
127211
throw new SolrException(
128212
SolrException.ErrorCode.BAD_REQUEST, "Unknown Combining algorithm: " + algorithm);
129213
}
214+
215+
/**
216+
* A {@link DoubleValuesSource} backed by a global doc ID to score map. Returns pre-computed
217+
* scores for specific document IDs.
218+
*/
219+
private static class PrecomputedScoreValuesSource extends DoubleValuesSource {
220+
221+
private final IntDoubleHashMap scoreByDoc;
222+
223+
PrecomputedScoreValuesSource(IntDoubleHashMap scoreByDoc) {
224+
this.scoreByDoc = scoreByDoc;
225+
}
226+
227+
@Override
228+
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues existing) {
229+
int base = ctx.docBase;
230+
return new DoubleValues() {
231+
private double currentScore;
232+
233+
@Override
234+
public double doubleValue() {
235+
return currentScore;
236+
}
237+
238+
@Override
239+
public boolean advanceExact(int doc) {
240+
int globalDoc = base + doc;
241+
currentScore = scoreByDoc.get(globalDoc);
242+
return true;
243+
}
244+
};
245+
}
246+
247+
@Override
248+
public boolean needsScores() {
249+
return false;
250+
}
251+
252+
@Override
253+
public DoubleValuesSource rewrite(IndexSearcher searcher) {
254+
return this;
255+
}
256+
257+
@Override
258+
public boolean isCacheable(LeafReaderContext ctx) {
259+
return false;
260+
}
261+
262+
@Override
263+
public boolean equals(Object o) {
264+
return o instanceof PrecomputedScoreValuesSource other && scoreByDoc.equals(other.scoreByDoc);
265+
}
266+
267+
@Override
268+
public int hashCode() {
269+
return scoreByDoc.hashCode();
270+
}
271+
272+
@Override
273+
public String toString() {
274+
return "PrecomputedScoreValuesSource(docs=" + scoreByDoc.size() + ")";
275+
}
276+
}
130277
}

solr/core/src/test-files/solr/collection1/conf/schema-vector-catchall.xml

Lines changed: 2 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -504,7 +504,7 @@
504504
</analyzer>
505505
</fieldType>
506506
<copyField source="*_commongrams" dest="*_commongrams_stop" />
507-
507+
508508
<fieldType name="binary" class="solr.BinaryField" />
509509
<fieldType name="collation" class="solr.CollationField" language="en" />
510510
<fieldType name="icuCollation" class="solr.ICUCollationField" locale="en" />
@@ -691,6 +691,7 @@
691691

692692

693693
<dynamicField name="*_s" type="string" indexed="true" stored="true"/>
694+
<dynamicField name="*_sdv" type="string" indexed="true" stored="true" docValues="true" multiValued="false"/>
694695
<dynamicField name="*_s1" type="string" indexed="true" stored="true" multiValued="false"/>
695696
<dynamicField name="*_s1_ns" type="string" indexed="true" stored="false" multiValued="false"/>
696697
<dynamicField name="*_l" type="long" indexed="true" stored="true"/>
@@ -857,14 +858,9 @@
857858
<copyField source="lowerfilt1" dest="lowerfilt1and2"/>
858859
<copyField source="lowerfilt" dest="lowerfilt1and2"/>
859860

860-
<copyField source="*" dest="text"/>
861-
<copyField source="id" dest="range_facet_l"/>
862-
<copyField source="id" dest="id_i1"/>
863861
<copyField source="range_facet_f" dest="range_facet_d"/>
864862
<copyField source="range_facet_f1" dest="range_facet_f1_dv"/>
865863

866-
<copyField source="id" dest="range_facet_l_dv"/>
867-
<copyField source="id" dest="range_facet_i_dv"/>
868864
<copyField source="range_facet_f" dest="range_facet_f_dv"/>
869865
<copyField source="range_facet_f" dest="range_facet_d_dv"/>
870866
<copyField source="bday" dest="range_facet_dt_dv"/>

0 commit comments

Comments
 (0)