/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.DocValuesTermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;
/**
 * A graph hit collector. It accumulates the edges for a single hop of a graph traversal.
 * On each call to {@link #collect(int)}, the collector skips edge extraction for nodes
 * that it has already traversed.
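 * <p>
 * An illustrative sketch of how a traversal loop might drive this collector (the actual
 * driving code lives elsewhere in this package and differs in detail):
 * <pre>
 *   GraphEdgeCollector collector = new GraphTermsCollector(collectField, visited, leafNodes);
 *   searcher.search(frontierQuery, collector);
 *   Query nextHop = collector.getResultQuery(matchField, false);
 *   // a null nextHop means no new edges were found and the traversal is complete
 * </pre>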
 * @lucene.internal
 */
abstract class GraphEdgeCollector extends SimpleCollector {
  // For graph traversal, the set of documents that has already been visited and can
  // therefore be skipped during value collection.
  DocSet skipSet;
  // known leaf nodes
  DocSet leafNodes;

  int numHits = 0;  // number of documents visited
  BitSet bits;      // if not null, used to collect documents visited

  int base;

  SchemaField collectField;

  // skipSet and leafNodes may be null
  GraphEdgeCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
    this.collectField = collectField;
    this.skipSet = skipSet;
    this.leafNodes = leafNodes;
  }

  // Set to use to collect docs being visited
  // TODO: this should be replaced with a more general delegating collector
  public void setCollectDocs(FixedBitSet target) {
    this.bits = target;
  }

  // the number of docs visited
  public int getNumHits() {
    return numHits;
  }

  @Override
  public void collect(int segDoc) throws IOException {
    int doc = segDoc + base;
    if (skipSet != null && skipSet.exists(doc)) {
      // If skipSet contains all nodes visited so far, then hitting one of them again
      // indicates a cycle; we could keep track of that here in the future if needed.
      return;
    }

    if (bits != null) bits.set(doc);
    // increment the hit count so we know how many docs we traversed this time.
    numHits++;

    // Optimization: don't look up edges for a document that is a known leaf node
    // (i.e. has no outgoing edges).
    if (leafNodes == null || !leafNodes.exists(doc)) {
      addEdgeIdsToResult(segDoc);
    }
    // Note: tracking incoming links for each result would be a huge memory hog, so it is
    // not implemented at this time.
  }

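  /**
   * Collect the edge ids (the values of {@code collectField}) for a single document so
   * they can later be turned into the query for the next traversal hop.
   *
   * @param doc the segment-local document id (without the docBase offset)
   */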
  abstract void addEdgeIdsToResult(int doc) throws IOException;

  private void addDocToResult(int docWithBase) {
    // this document is part of the traversal. mark it in our bitmap.
    if (bits != null) bits.set(docWithBase);
    // increment the hit count so we know how many docs we traversed this time.
    numHits++;
  }

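  // Remember the segment's docBase so segment-local doc ids can be mapped to global ids.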
  @Override
  public void doSetNextReader(LeafReaderContext context) throws IOException {
    base = context.docBase;
  }

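  /**
   * Build the query that matches the next frontier of the traversal, i.e. all documents
   * whose {@code matchField} contains one of the edge ids collected so far.
   *
   * @return the frontier query, or null if no edges were collected
   */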
  public abstract Query getResultQuery(SchemaField matchField, boolean useAutomaton);

  @Override
  public ScoreMode scoreMode() {
    return ScoreMode.COMPLETE_NO_SCORES;
  }

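  /**
   * Accumulates edge terms for each visited document by walking the sorted-set doc
   * values of the collect field.
   */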
  static class GraphTermsCollector extends GraphEdgeCollector {
    // all the collected terms
    private BytesRefHash collectorTerms;
    // doc values for the collect field in the current segment
    private SortedSetDocValues docTermOrds;

    GraphTermsCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
      super(collectField, skipSet, leafNodes);
      this.collectorTerms = new BytesRefHash();
    }

    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
      super.doSetNextReader(context);
      // Grab the doc values for the new segment (an empty instance is returned if the
      // field has no doc values here, so this never yields null).
      docTermOrds = DocValues.getSortedSet(context.reader(), collectField.getName());
    }

    @Override
    void addEdgeIdsToResult(int doc) throws IOException {
      // docTermOrds is a forward-only iterator, so only advance if we are still behind
      // the target document.
      if (doc > docTermOrds.docID()) {
        docTermOrds.advance(doc);
      }
      // advance() may overshoot if the document has no values, so check for an exact match.
      if (doc == docTermOrds.docID()) {
        long ord;
        while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
          // add the edge id to the collected terms.
          collectorTerms.add(docTermOrds.lookupOrd(ord));
        }
      }
    }

    @Override
    public Query getResultQuery(SchemaField matchField, boolean useAutomaton) {
      if (collectorTerms == null || collectorTerms.size() == 0) {
        // return null if there are no terms (edges) to traverse.
        return null;
      }

      // TODO: see if we should dynamically select this based on the frontier size.
      if (useAutomaton) {
        // build an automaton-based query for the frontier.
        Automaton autn = buildAutomaton(collectorTerms);
        return new AutomatonQuery(new Term(matchField.getName()), autn);
      }

      List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
      for (int i = 0; i < collectorTerms.size(); i++) {
        BytesRef ref = new BytesRef();
        collectorTerms.get(i, ref);
        termList.add(ref);
      }
      // A docValues-only field can't be queried through the inverted index, so fall
      // back to a doc-values query in that case.
      return (matchField.hasDocValues() && !matchField.indexed())
          ? new DocValuesTermsQuery(matchField.getName(), termList)
          : new TermInSetQuery(matchField.getName(), termList);
    }

    /**
     * Build an automaton to represent the frontier query.
     */
    private Automaton buildAutomaton(BytesRefHash termBytesHash) {
      // need to pass a sorted set of terms to the automaton builder, which requires its
      // input in sorted order (maybe a better way to avoid this copy?)
      final TreeSet<BytesRef> terms = new TreeSet<>();
      for (int i = 0; i < termBytesHash.size(); i++) {
        BytesRef ref = new BytesRef();
        termBytesHash.get(i, ref);
        terms.add(ref);
      }
      return DaciukMihovAutomatonBuilder.build(terms);
    }
  }
}