package org.apache.lucene.facet.sortedset;
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;
import java.util.Map;
import org.apache.lucene.facet.FacetResult;
import org.apache.lucene.facet.Facets;
import org.apache.lucene.facet.FacetsCollector;
import org.apache.lucene.facet.FacetsCollector.MatchingDocs;
import org.apache.lucene.facet.FacetsConfig;
import org.apache.lucene.facet.LabelAndValue;
import org.apache.lucene.facet.TopOrdAndIntQueue;
import org.apache.lucene.facet.sortedset.SortedSetDocValuesReaderState.OrdRange;
import org.apache.lucene.index.AtomicReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiDocValues;
import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
import org.apache.lucene.index.ReaderUtil;
import org.apache.lucene.index.SortedSetDocValues;
import org.apache.lucene.util.BytesRef;
/** Compute facets counts from previously
 * indexed {@link SortedSetDocValuesFacetField},
 * without requiring a separate taxonomy index. Faceting is
 * a bit slower (~25%), and there is added cost on every
 * {@link IndexReader} open to create a new {@link
 * SortedSetDocValuesReaderState}. Furthermore, this does
 * not support hierarchical facets; only flat (dimension +
 * label) facets are supported, but they use quite a bit
 * less RAM.
*
* <p><b>NOTE</b>: this class should be instantiated and
* then used from a single thread, because it holds a
* thread-private instance of {@link SortedSetDocValues}.
*
 * <p><b>NOTE</b>: ties are broken by unicode sort order.
*
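 * <p>A minimal usage sketch, assuming an {@code
 * IndexSearcher} named {@code searcher} over the same
 * {@code reader}, and a hypothetical "Author" dimension
 * ({@code DefaultSortedSetDocValuesReaderState} is one
 * concrete state implementation):
 *
 * <pre class="prettyprint">
 * SortedSetDocValuesReaderState state = new DefaultSortedSetDocValuesReaderState(reader);
 * FacetsCollector fc = new FacetsCollector();
 * searcher.search(query, fc);
 * Facets facets = new SortedSetDocValuesFacetCounts(state, fc);
 * FacetResult result = facets.getTopChildren(10, "Author");
 * </pre>
 *
 * <p>The state computes dim ord ranges up front, so reuse
 * it for all searches against the same reader.
 *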
* @lucene.experimental */
public class SortedSetDocValuesFacetCounts extends Facets {
final SortedSetDocValuesReaderState state;
final SortedSetDocValues dv;
final String field;
final int[] counts;
  /** Counts all facet dimensions across the provided hits. */
public SortedSetDocValuesFacetCounts(SortedSetDocValuesReaderState state, FacetsCollector hits)
throws IOException {
this.state = state;
this.field = state.getField();
counts = new int[state.getSize()];
dv = state.getDocValues();
//System.out.println("field=" + field);
count(hits.getMatchingDocs());
}
@Override
public FacetResult getTopChildren(int topN, String dim, String... path) throws IOException {
if (topN <= 0) {
throw new IllegalArgumentException("topN must be > 0 (got: " + topN + ")");
}
if (path.length > 0) {
throw new IllegalArgumentException("path should be 0 length");
}
OrdRange ordRange = state.getOrdRange(dim);
if (ordRange == null) {
throw new IllegalArgumentException("dimension \"" + dim + "\" was not indexed");
}
return getDim(dim, ordRange, topN);
}
private final FacetResult getDim(String dim, OrdRange ordRange, int topN) {
TopOrdAndIntQueue q = null;
int bottomCount = 0;
int dimCount = 0;
int childCount = 0;
TopOrdAndIntQueue.OrdAndValue reuse = null;
//System.out.println("getDim : " + ordRange.start + " - " + ordRange.end);
for(int ord=ordRange.start; ord<=ordRange.end; ord++) {
//System.out.println(" ord=" + ord + " count=" + counts[ord]);
if (counts[ord] > 0) {
dimCount += counts[ord];
childCount++;
if (counts[ord] > bottomCount) {
if (reuse == null) {
reuse = new TopOrdAndIntQueue.OrdAndValue();
}
reuse.ord = ord;
reuse.value = counts[ord];
if (q == null) {
// Lazy init, so we don't create this for the
// sparse case unnecessarily
q = new TopOrdAndIntQueue(topN);
}
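          // insertWithOverflow returns the entry knocked
          // out of the queue (or null while the queue is
          // not yet full); reuse it for the next candidate
          // to avoid per-ord allocations: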
reuse = q.insertWithOverflow(reuse);
if (q.size() == topN) {
bottomCount = q.top().value;
}
}
}
}
if (q == null) {
return null;
}
BytesRef scratch = new BytesRef();
LabelAndValue[] labelValues = new LabelAndValue[q.size()];
for(int i=labelValues.length-1;i>=0;i--) {
TopOrdAndIntQueue.OrdAndValue ordAndValue = q.pop();
dv.lookupOrd(ordAndValue.ord, scratch);
String[] parts = FacetsConfig.stringToPath(scratch.utf8ToString());
labelValues[i] = new LabelAndValue(parts[1], ordAndValue.value);
}
return new FacetResult(dim, new String[0], dimCount, labelValues, childCount);
}
/** Does all the "real work" of tallying up the counts. */
private final void count(List<MatchingDocs> matchingDocs) throws IOException {
//System.out.println("ssdv count");
MultiDocValues.OrdinalMap ordinalMap;
// TODO: is this right? really, we need a way to
// verify that this ordinalMap "matches" the leaves in
// matchingDocs...
if (dv instanceof MultiSortedSetDocValues && matchingDocs.size() > 1) {
ordinalMap = ((MultiSortedSetDocValues) dv).mapping;
} else {
ordinalMap = null;
}
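    // When present, the OrdinalMap translates each
    // segment's local ords into the global ord space that
    // the counts array is indexed by; with a single
    // segment, local ords are already global.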
for(MatchingDocs hits : matchingDocs) {
AtomicReader reader = hits.context.reader();
//System.out.println(" reader=" + reader);
// LUCENE-5090: make sure the provided reader context "matches"
// the top-level reader passed to the
// SortedSetDocValuesReaderState, else cryptic
// AIOOBE can happen:
if (ReaderUtil.getTopLevelContext(hits.context).reader() != state.origReader) {
throw new IllegalStateException("the SortedSetDocValuesReaderState provided to this class does not match the reader being searched; you must create a new SortedSetDocValuesReaderState every time you open a new IndexReader");
}
SortedSetDocValues segValues = reader.getSortedSetDocValues(field);
if (segValues == null) {
continue;
}
final int maxDoc = reader.maxDoc();
assert maxDoc == hits.bits.length();
//System.out.println(" dv=" + dv);
// TODO: yet another option is to count all segs
// first, only in seg-ord space, and then do a
// merge-sort-PQ in the end to only "resolve to
// global" those seg ords that can compete, if we know
// we just want top K? ie, this is the same algo
// that'd be used for merging facets across shards
// (distributed faceting). but this has much higher
// temp ram req'ts (sum of number of ords across all
// segs)
if (ordinalMap != null) {
int segOrd = hits.context.ord;
int numSegOrds = (int) segValues.getValueCount();
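        // Heuristic: if the hits are sparse relative to
        // this segment's ord count, remap each ord to its
        // global ord as we collect; otherwise count in
        // seg-ord space first and migrate only the
        // non-zero counts to global ords at the end: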
if (hits.totalHits < numSegOrds/10) {
//System.out.println(" remap as-we-go");
// Remap every ord to global ord as we iterate:
int doc = 0;
while (doc < maxDoc && (doc = hits.bits.nextSetBit(doc)) != -1) {
//System.out.println(" doc=" + doc);
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
//System.out.println(" segOrd=" + segOrd + " ord=" + term + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, term));
counts[(int) ordinalMap.getGlobalOrd(segOrd, term)]++;
term = (int) segValues.nextOrd();
}
++doc;
}
} else {
//System.out.println(" count in seg ord first");
// First count in seg-ord space:
final int[] segCounts = new int[numSegOrds];
int doc = 0;
while (doc < maxDoc && (doc = hits.bits.nextSetBit(doc)) != -1) {
//System.out.println(" doc=" + doc);
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
//System.out.println(" ord=" + term);
segCounts[term]++;
term = (int) segValues.nextOrd();
}
++doc;
}
// Then, migrate to global ords:
for(int ord=0;ord<numSegOrds;ord++) {
int count = segCounts[ord];
if (count != 0) {
//System.out.println(" migrate segOrd=" + segOrd + " ord=" + ord + " globalOrd=" + ordinalMap.getGlobalOrd(segOrd, ord));
counts[(int) ordinalMap.getGlobalOrd(segOrd, ord)] += count;
}
}
}
} else {
// No ord mapping (e.g., single segment index):
// just aggregate directly into counts:
int doc = 0;
while (doc < maxDoc && (doc = hits.bits.nextSetBit(doc)) != -1) {
segValues.setDocument(doc);
int term = (int) segValues.nextOrd();
while (term != SortedSetDocValues.NO_MORE_ORDS) {
counts[term]++;
term = (int) segValues.nextOrd();
}
++doc;
}
}
}
}
@Override
public Number getSpecificValue(String dim, String... path) {
if (path.length != 1) {
throw new IllegalArgumentException("path must be length=1");
}
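    // lookupTerm returns a negative value when the
    // dim+label was never indexed as an exact term: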
int ord = (int) dv.lookupTerm(new BytesRef(FacetsConfig.pathToString(dim, path)));
if (ord < 0) {
return -1;
}
return counts[ord];
}
@Override
public List<FacetResult> getAllDims(int topN) throws IOException {
List<FacetResult> results = new ArrayList<FacetResult>();
for(Map.Entry<String,OrdRange> ent : state.getPrefixToOrdRange().entrySet()) {
FacetResult fr = getDim(ent.getKey(), ent.getValue(), topN);
if (fr != null) {
results.add(fr);
}
}
    // Sort by highest count, descending; break ties by
    // dimension label:
Collections.sort(results,
new Comparator<FacetResult>() {
@Override
public int compare(FacetResult a, FacetResult b) {
if (a.value.intValue() > b.value.intValue()) {
return -1;
} else if (b.value.intValue() > a.value.intValue()) {
return 1;
} else {
return a.dim.compareTo(b.dim);
}
}
});
return results;
}
}