/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.solr.search.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;
import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.DocValuesTermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;

/**
 * A graph hit collector. This accumulates the edges for a given graph traversal.
 * On each call to {@code collect}, the collector skips edge extraction for nodes
 * that it has already traversed.
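 *
 * <p>A minimal usage sketch (names are illustrative; the actual driver is the graph
 * query implementation that owns this collector):
 *
 * <pre>{@code
 * GraphEdgeCollector collector = new GraphTermsCollector(collectField, visitedDocs, leafNodes);
 * searcher.search(frontierQuery, collector);          // gather edge ids from the current frontier
 * Query nextFrontier = collector.getResultQuery(matchField, useAutomaton);
 * // a null result means no new edges were found and the traversal is complete
 * }</pre>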
* @lucene.internal
*/
abstract class GraphEdgeCollector extends SimpleCollector {
  // For graph traversal, the result set that has already been visited and thus can be skipped during value collection.
DocSet skipSet;
// known leaf nodes
DocSet leafNodes;
int numHits = 0; // number of documents visited
BitSet bits; // if not null, used to collect documents visited
int base;
SchemaField collectField;
// skipSet and leafNodes may be null
GraphEdgeCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
this.collectField = collectField;
this.skipSet = skipSet;
this.leafNodes = leafNodes;
}
  // Supplies the bitset used to collect the docs being visited
  // TODO: this should be replaced with a more general delegating collector
public void setCollectDocs(FixedBitSet target) {
this.bits = target;
}
// the number of docs visited
public int getNumHits() {
return numHits;
}
  @Override
  public void collect(int segDoc) throws IOException {
    // convert the segment-local doc id into a global (index-wide) doc id
    int doc = segDoc + base;
    if (skipSet != null && skipSet.exists(doc)) {
      // When skipSet holds all nodes visited so far, hitting one here indicates a cycle;
      // we could keep track of that here in the future if we need to.
      return;
    }
if (bits != null) bits.set(doc);
// increment the hit count so we know how many docs we traversed this time.
numHits++;
// Optimization to not look up edges for a document that is a leaf node (i.e. has no outgoing edges)
if (leafNodes == null || !leafNodes.exists(doc)) {
addEdgeIdsToResult(segDoc);
}
    // Note: tracking incoming links for each result would be a huge memory hog... so not implementing at this time.
}
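  /** Records the edge ids (outgoing links) of {@code doc}, a segment-local doc id. */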
abstract void addEdgeIdsToResult(int doc) throws IOException;
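  // NOTE: not currently called within this class; assumes a non-null bits set via setCollectDocs.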
private void addDocToResult(int docWithBase) {
// this document is part of the traversal. mark it in our bitmap.
bits.set(docWithBase);
// increment the hit count so we know how many docs we traversed this time.
numHits++;
}
@Override
public void doSetNextReader(LeafReaderContext context) throws IOException {
base = context.docBase;
}
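  /**
   * Builds the query that matches the next frontier: every document whose {@code matchField}
   * contains one of the edge ids collected so far. Returns {@code null} when no edges were
   * collected.
   */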
public abstract Query getResultQuery(SchemaField matchField, boolean useAutomaton);
@Override
public ScoreMode scoreMode() {
return ScoreMode.COMPLETE_NO_SCORES;
}
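  /** Accumulates edge ids by reading the {@code collectField} SortedSet doc values of each visited document. */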
static class GraphTermsCollector extends GraphEdgeCollector {
// all the collected terms
private BytesRefHash collectorTerms;
private SortedSetDocValues docTermOrds;
GraphTermsCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
super(collectField, skipSet, leafNodes);
this.collectorTerms = new BytesRefHash();
}
@Override
public void doSetNextReader(LeafReaderContext context) throws IOException {
super.doSetNextReader(context);
      // Grab the doc values for the newly-set segment.
docTermOrds = DocValues.getSortedSet(context.reader(), collectField.getName());
}
@Override
    void addEdgeIdsToResult(int doc) throws IOException {
      // position the doc values iterator on the doc to pull the edge ids for.
      if (doc > docTermOrds.docID()) {
        docTermOrds.advance(doc);
      }
      if (doc == docTermOrds.docID()) {
        long ord;
        while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
          // add the edge id to the collector terms (the BytesRefHash copies the bytes).
          collectorTerms.add(docTermOrds.lookupOrd(ord));
        }
      }
    }
@Override
public Query getResultQuery(SchemaField matchField, boolean useAutomaton) {
if (collectorTerms == null || collectorTerms.size() == 0) {
// return null if there are no terms (edges) to traverse.
return null;
} else {
        // Create a query that matches the collected edge ids in matchField.
        Query q = null;
        // TODO: see if we should dynamically select this based on the frontier size.
        if (useAutomaton) {
          // build an automaton-based query for the frontier.
          Automaton autn = buildAutomaton(collectorTerms);
          // the Term only supplies the field name; the automaton matches the collected edge ids.
          q = new AutomatonQuery(new Term(matchField.getName()), autn);
} else {
List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
for (int i = 0; i < collectorTerms.size(); i++) {
BytesRef ref = new BytesRef();
collectorTerms.get(i, ref);
termList.add(ref);
}
          // if the match field is doc-values-only (not indexed), match on doc values;
          // otherwise use the postings-based TermInSetQuery.
          q = (matchField.hasDocValues() && !matchField.indexed())
              ? new DocValuesTermsQuery(matchField.getName(), termList)
              : new TermInSetQuery(matchField.getName(), termList);
}
return q;
}
}
    /**
     * Build an automaton to represent the frontier query.
     */
    private Automaton buildAutomaton(BytesRefHash termBytesHash) {
      // the automaton builder requires its input terms in sorted order, so copy them into a
      // TreeSet first (maybe a better way to avoid this?)
final TreeSet<BytesRef> terms = new TreeSet<BytesRef>();
for (int i = 0; i < termBytesHash.size(); i++) {
BytesRef ref = new BytesRef();
termBytesHash.get(i, ref);
terms.add(ref);
}
      return DaciukMihovAutomatonBuilder.build(terms);
}
}
}