/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.search.join;

import java.io.IOException;
import java.util.ArrayList;
import java.util.List;
import java.util.TreeSet;

import org.apache.lucene.index.DocValues;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.index.Term;
import org.apache.lucene.search.AutomatonQuery;
import org.apache.lucene.search.DocValuesTermsQuery;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.ScoreMode;
import org.apache.lucene.search.SimpleCollector;
import org.apache.lucene.search.TermInSetQuery;
import org.apache.lucene.util.BitSet;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefHash;
import org.apache.lucene.util.FixedBitSet;
import org.apache.lucene.util.automaton.Automaton;
import org.apache.lucene.util.automaton.DaciukMihovAutomatonBuilder;
import org.apache.solr.schema.SchemaField;
import org.apache.solr.search.DocSet;
/**
 * A graph hit collector. It accumulates the edges for a single hop of a graph traversal.
 * On each call to {@link #collect(int)}, the collector skips edge extraction for nodes
 * that it has already traversed.
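 * <p>
 * An illustrative sketch of how a traversal loop might drive this collector (the actual
 * driving code lives elsewhere in this package and differs in detail):
 * <pre>
 *   GraphEdgeCollector collector = new GraphTermsCollector(collectField, visited, leafNodes);
 *   searcher.search(frontierQuery, collector);
 *   Query nextHop = collector.getResultQuery(matchField, false);
 *   // a null nextHop means no new edges were found and the traversal is complete
 * </pre>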
 * @lucene.internal
 */
abstract class GraphEdgeCollector extends SimpleCollector {
  // For graph traversal, the set of documents that has already been visited and can
  // therefore be skipped during value collection.
  DocSet skipSet;
  // known leaf nodes
  DocSet leafNodes;

  int numHits = 0;  // number of documents visited
  BitSet bits;      // if not null, used to collect documents visited

  int base;

  SchemaField collectField;

  // skipSet and leafNodes may be null
  GraphEdgeCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
    this.collectField = collectField;
    this.skipSet = skipSet;
    this.leafNodes = leafNodes;
  }

  // Set to use to collect docs being visited
  // TODO: this should be replaced with a more general delegating collector
  public void setCollectDocs(FixedBitSet target) {
    this.bits = target;
  }

  // the number of docs visited
  public int getNumHits() {
    return numHits;
  }

  @Override
  public void collect(int segDoc) throws IOException {
    int doc = segDoc + base;
    if (skipSet != null && skipSet.exists(doc)) {
      // If skipSet contains all nodes visited so far, then hitting one of them again
      // indicates a cycle; we could keep track of that here in the future if needed.
      return;
    }

    if (bits != null) bits.set(doc);
    // increment the hit count so we know how many docs we traversed this time.
    numHits++;

    // Optimization: don't look up edges for a document that is a known leaf node
    // (i.e. has no outgoing edges).
    if (leafNodes == null || !leafNodes.exists(doc)) {
      addEdgeIdsToResult(segDoc);
    }
    // Note: tracking incoming links for each result would be a huge memory hog, so it is
    // not implemented at this time.
  }

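  /**
   * Collect the edge ids (the values of {@code collectField}) for a single document so
   * they can later be turned into the query for the next traversal hop.
   *
   * @param doc the segment-local document id (without the docBase offset)
   */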
  abstract void addEdgeIdsToResult(int doc) throws IOException;

  private void addDocToResult(int docWithBase) {
    // this document is part of the traversal. mark it in our bitmap.
    if (bits != null) bits.set(docWithBase);
    // increment the hit count so we know how many docs we traversed this time.
    numHits++;
  }

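  // Remember the segment's docBase so segment-local doc ids can be mapped to global ids.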
  @Override
  public void doSetNextReader(LeafReaderContext context) throws IOException {
    base = context.docBase;
  }

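  /**
   * Build the query that matches the next frontier of the traversal, i.e. all documents
   * whose {@code matchField} contains one of the edge ids collected so far.
   *
   * @return the frontier query, or null if no edges were collected
   */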
  public abstract Query getResultQuery(SchemaField matchField, boolean useAutomaton);

  @Override
  public ScoreMode scoreMode() {
    return ScoreMode.COMPLETE_NO_SCORES;
  }

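  /**
   * Accumulates edge terms for each visited document by walking the sorted-set doc
   * values of the collect field.
   */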
  static class GraphTermsCollector extends GraphEdgeCollector {
    // all the collected terms
    private BytesRefHash collectorTerms;
    // doc values for the collect field in the current segment
    private SortedSetDocValues docTermOrds;

    GraphTermsCollector(SchemaField collectField, DocSet skipSet, DocSet leafNodes) {
      super(collectField, skipSet, leafNodes);
      this.collectorTerms = new BytesRefHash();
    }

    @Override
    public void doSetNextReader(LeafReaderContext context) throws IOException {
      super.doSetNextReader(context);
      // Grab the doc values for the new segment (an empty instance is returned if the
      // field has no doc values here, so this never yields null).
      docTermOrds = DocValues.getSortedSet(context.reader(), collectField.getName());
    }

    @Override
    void addEdgeIdsToResult(int doc) throws IOException {
      // docTermOrds is a forward-only iterator, so only advance if we are still behind
      // the target document.
      if (doc > docTermOrds.docID()) {
        docTermOrds.advance(doc);
      }
      // advance() may overshoot if the document has no values, so check for an exact match.
      if (doc == docTermOrds.docID()) {
        long ord;
        while ((ord = docTermOrds.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
          // add the edge id to the collected terms.
          collectorTerms.add(docTermOrds.lookupOrd(ord));
        }
      }
    }

    @Override
    public Query getResultQuery(SchemaField matchField, boolean useAutomaton) {
      if (collectorTerms == null || collectorTerms.size() == 0) {
        // return null if there are no terms (edges) to traverse.
        return null;
      }

      // TODO: see if we should dynamically select this based on the frontier size.
      if (useAutomaton) {
        // build an automaton-based query for the frontier.
        Automaton autn = buildAutomaton(collectorTerms);
        return new AutomatonQuery(new Term(matchField.getName()), autn);
      }

      List<BytesRef> termList = new ArrayList<>(collectorTerms.size());
      for (int i = 0; i < collectorTerms.size(); i++) {
        BytesRef ref = new BytesRef();
        collectorTerms.get(i, ref);
        termList.add(ref);
      }
      // A docValues-only field can't be queried through the inverted index, so fall
      // back to a doc-values query in that case.
      return (matchField.hasDocValues() && !matchField.indexed())
          ? new DocValuesTermsQuery(matchField.getName(), termList)
          : new TermInSetQuery(matchField.getName(), termList);
    }

    /**
     * Build an automaton to represent the frontier query.
     */
    private Automaton buildAutomaton(BytesRefHash termBytesHash) {
      // need to pass a sorted set of terms to the automaton builder, which requires its
      // input in sorted order (maybe a better way to avoid this copy?)
      final TreeSet<BytesRef> terms = new TreeSet<>();
      for (int i = 0; i < termBytesHash.size(); i++) {
        BytesRef ref = new BytesRef();
        termBytesHash.get(i, ref);
        terms.add(ref);
      }
      return DaciukMihovAutomatonBuilder.build(terms);
    }
  }
}