solr/core/src/java/org/apache/solr/request/DocValuesStats.java - lucene-solr - Git at Google

 package org.apache.solr.request;

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;
 import java.util.List;
 import java.util.Map;

 import org.apache.lucene.index.AtomicReaderContext;
 import org.apache.lucene.index.DocValues;
 import org.apache.lucene.index.Term;
 import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
 import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
 import org.apache.lucene.index.MultiDocValues.OrdinalMap;
 import org.apache.lucene.index.SortedDocValues;
 import org.apache.lucene.index.SortedSetDocValues;
 import org.apache.lucene.search.DocIdSet;
 import org.apache.lucene.search.DocIdSetIterator;
 import org.apache.lucene.search.Filter;
 import org.apache.lucene.search.TermQuery;
 import org.apache.lucene.search.TermRangeQuery;
 import org.apache.lucene.util.BytesRef;
 import org.apache.solr.handler.component.FieldFacetStats;
 import org.apache.solr.handler.component.StatsValues;
 import org.apache.solr.handler.component.StatsValuesFactory;
 import org.apache.solr.schema.FieldType;
 import org.apache.solr.schema.SchemaField;
 import org.apache.solr.search.DocSet;
 import org.apache.solr.search.SolrIndexSearcher;

 /**
  * Computes term stats for a docvalues field (single or multivalued).
  * <p>
  * Instead of working on a top-level reader view (binary-search per docid),
  * it collects per-segment, but maps ordinals to global ordinal space using
  * MultiDocValues' OrdinalMap.
  */
 public class DocValuesStats {
   /** Static utility holder; not instantiable. */
   private DocValuesStats() {}

   /**
    * Counts, per term of {@code fieldName}, how many documents of {@code docs} carry
    * that term, and feeds the non-zero counts into a {@link StatsValues} in ordinal order.
    * <p>
    * Documents of {@code docs} that are not matched by an open-ended
    * {@link TermRangeQuery} on the field are reported through
    * {@link StatsValues#addMissing}. For every requested facet field a
    * {@link FieldFacetStats} is populated alongside and attached to the result.
    *
    * @param searcher     searcher providing the schema, the top-level reader and its leaves
    * @param fieldName    name of the field whose terms are counted
    * @param docs         the documents to collect over
    * @param calcDistinct forwarded to {@link StatsValuesFactory#createStatsValues} and
    *                     to each {@link FieldFacetStats}
    * @param facet        names of the facet fields; {@code null} is treated as "no facets"
    * @return the accumulated stats, including one facet entry per requested facet field
    * @throws UnsupportedOperationException if the field has {@code Integer.MAX_VALUE}
    *         or more unique terms
    * @throws IOException if reading from the index fails
    */
   public static StatsValues getCounts(SolrIndexSearcher searcher, String fieldName, DocSet docs, boolean calcDistinct, String[] facet) throws IOException {
     SchemaField schemaField = searcher.getSchema().getField(fieldName);
     FieldType ft = schemaField.getType();
     StatsValues res = StatsValuesFactory.createStatsValues(schemaField, calcDistinct);

     // Initialize facetstats, if facets have been passed in. A null array is
     // handled like an empty one rather than failing with a NullPointerException.
     final String[] facetFields = (facet == null) ? new String[0] : facet;
     final FieldFacetStats[] facetStats = new FieldFacetStats[facetFields.length];
     for (int i = 0; i < facetFields.length; i++) {
       String facetField = facetFields[i];
       SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
       facetStats[i] = new FieldFacetStats(searcher, facetField, schemaField, facetSchemaField, calcDistinct);
     }

     // TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
     final boolean multiValued = schemaField.multiValued() || ft.multiValuedFieldCache();

     SortedSetDocValues si; // for term lookups only
     OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
     if (multiValued) {
       si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
       if (si instanceof MultiSortedSetDocValues) {
         ordinalMap = ((MultiSortedSetDocValues)si).mapping;
       }
     } else {
       SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName);
       // expose the single-valued view through the sorted-set API used for lookups below
       si = single == null ? null : DocValues.singleton(single);
       if (single instanceof MultiSortedDocValues) {
         ordinalMap = ((MultiSortedDocValues)single).mapping;
       }
     }
     if (si == null) {
       si = DocValues.EMPTY_SORTED_SET;
     }
     // counts are kept in an int-indexed array, so the term count must fit in an int
     if (si.getValueCount() >= Integer.MAX_VALUE) {
       throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
     }

     DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(fieldName, null, null, false, false)));

     final int nTerms = (int) si.getValueCount();

     // count collection array only needs to be as big as the number of terms we are
     // going to collect counts for.
     final int[] counts = new int[nTerms];

     Filter filter = docs.getTopFilter();
     List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
     for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
       AtomicReaderContext leaf = leaves.get(subIndex);
       DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
       DocIdSetIterator disi = (dis == null) ? null : dis.iterator();
       if (disi == null) {
         // nothing from the doc set to visit in this segment
         continue;
       }
       int docBase = leaf.docBase;
       if (multiValued) {
         SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
         if (sub == null) {
           sub = DocValues.EMPTY_SORTED_SET;
         }
         final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
         if (singleton != null) {
           // some codecs may optimize SORTED_SET storage for single-valued fields
           accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
         } else {
           accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
         }
       } else {
         SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
         if (sub == null) {
           sub = DocValues.EMPTY_SORTED;
         }
         accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
       }
     }

     // add results in index order
     BytesRef value = new BytesRef();
     for (int ord = 0; ord < counts.length; ord++) {
       int count = counts[ord];
       if (count > 0) {
         si.lookupOrd(ord, value);
         res.accumulate(value, count);
         for (FieldFacetStats f : facetStats) {
           f.accumulateTermNum(ord, value);
         }
       }
     }

     res.addMissing(missing.size());
     // For each facet term, count how many of the "missing" docs carry that facet
     // term and record it on the facet's own stats. No-op when there are no facets.
     for (FieldFacetStats f : facetStats) {
       Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
       FieldType facetType = searcher.getSchema().getFieldType(f.name);
       for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
         String termLabel = entry.getKey();
         int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
         entry.getValue().addMissing(missingCount);
       }
       res.addFacet(f.name, facetStatsValues);
     }
     return res;
   }

   /**
    * Accumulates per-segment single-valued stats.
    * <p>
    * For every document returned by {@code disi} whose ordinal is non-negative, the
    * ordinal is translated through {@code map} (when one is given), its slot in
    * {@code counts} is incremented, and the pair (top-level doc id, ordinal) is passed
    * to {@code facetTermNum} of every entry in {@code facetStats}. Documents with a
    * negative ordinal are skipped.
    *
    * @param counts     per-term counters, indexed by (global) ordinal; modified in place
    * @param docBase    added to segment doc ids before they are handed to the facet stats
    * @param facetStats facet accumulators to notify; may be empty
    * @param si         the segment's sorted doc values
    * @param disi       iterator over the segment's documents to visit; fully consumed
    * @param subIndex   index of the segment, used for the ordinal-map lookup
    * @param map        segment-to-global ordinal map, or {@code null} to use ordinals as-is
    * @throws IOException if advancing the iterator fails
    */
   static void accumSingle(int[] counts, int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
     int doc;
     while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
       int term = si.getOrd(doc);
       if (term < 0) {
         continue;
       }
       if (map != null) {
         term = (int) map.getGlobalOrd(subIndex, term);
       }
       counts[term]++;
       for (FieldFacetStats f : facetStats) {
         f.facetTermNum(docBase + doc, term);
       }
     }
   }

   /**
    * Accumulates per-segment multi-valued stats.
    * <p>
    * For every document returned by {@code disi}, each of its ordinals is translated
    * through {@code map} (when one is given), its slot in {@code counts} is incremented,
    * and the pair (top-level doc id, ordinal) is passed to {@code facetTermNum} of every
    * entry in {@code facetStats}.
    *
    * @param counts     per-term counters, indexed by (global) ordinal; modified in place
    * @param docBase    added to segment doc ids before they are handed to the facet stats
    * @param facetStats facet accumulators to notify; may be empty
    * @param si         the segment's sorted-set doc values
    * @param disi       iterator over the segment's documents to visit; fully consumed
    * @param subIndex   index of the segment, used for the ordinal-map lookup
    * @param map        segment-to-global ordinal map, or {@code null} to use ordinals as-is
    * @throws IOException if advancing the iterator fails
    */
   static void accumMulti(int[] counts, int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
     int doc;
     while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
       si.setDocument(doc);
       long ord;
       while ((ord = si.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
         int term = (int) ord;
         if (map != null) {
           term = (int) map.getGlobalOrd(subIndex, term);
         }
         counts[term]++;
         for (FieldFacetStats f : facetStats) {
           f.facetTermNum(docBase + doc, term);
         }
       }
     }
   }
 }
	package org.apache.solr.request;

	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;
	import java.util.List;
	import java.util.Map;

	import org.apache.lucene.index.AtomicReaderContext;
	import org.apache.lucene.index.DocValues;
	import org.apache.lucene.index.Term;
	import org.apache.lucene.index.MultiDocValues.MultiSortedDocValues;
	import org.apache.lucene.index.MultiDocValues.MultiSortedSetDocValues;
	import org.apache.lucene.index.MultiDocValues.OrdinalMap;
	import org.apache.lucene.index.SortedDocValues;
	import org.apache.lucene.index.SortedSetDocValues;
	import org.apache.lucene.search.DocIdSet;
	import org.apache.lucene.search.DocIdSetIterator;
	import org.apache.lucene.search.Filter;
	import org.apache.lucene.search.TermQuery;
	import org.apache.lucene.search.TermRangeQuery;
	import org.apache.lucene.util.BytesRef;
	import org.apache.solr.handler.component.FieldFacetStats;
	import org.apache.solr.handler.component.StatsValues;
	import org.apache.solr.handler.component.StatsValuesFactory;
	import org.apache.solr.schema.FieldType;
	import org.apache.solr.schema.SchemaField;
	import org.apache.solr.search.DocSet;
	import org.apache.solr.search.SolrIndexSearcher;

	/**
	* Computes term stats for docvalues field (single or multivalued).
	* <p>
	* Instead of working on a top-level reader view (binary-search per docid),
	* it collects per-segment, but maps ordinals to global ordinal space using
	* MultiDocValues' OrdinalMap.
	*/
	public class DocValuesStats {
	private DocValuesStats() {}

	public static StatsValues getCounts(SolrIndexSearcher searcher, String fieldName, DocSet docs, boolean calcDistinct, String[] facet) throws IOException {
	SchemaField schemaField = searcher.getSchema().getField(fieldName);
	FieldType ft = schemaField.getType();
	StatsValues res = StatsValuesFactory.createStatsValues(schemaField, calcDistinct);

	//Initialize facetstats, if facets have been passed in
	final FieldFacetStats[] facetStats = new FieldFacetStats[facet.length];
	int upto = 0;
	for (String facetField : facet) {
	SchemaField facetSchemaField = searcher.getSchema().getField(facetField);
	facetStats[upto++] = new FieldFacetStats(searcher, facetField, schemaField, facetSchemaField, calcDistinct);
	}

	// TODO: remove multiValuedFieldCache(), check dv type / uninversion type?
	final boolean multiValued = schemaField.multiValued() \|\| ft.multiValuedFieldCache();

	SortedSetDocValues si; // for term lookups only
	OrdinalMap ordinalMap = null; // for mapping per-segment ords to global ones
	if (multiValued) {
	si = searcher.getAtomicReader().getSortedSetDocValues(fieldName);
	if (si instanceof MultiSortedSetDocValues) {
	ordinalMap = ((MultiSortedSetDocValues)si).mapping;
	}
	} else {
	SortedDocValues single = searcher.getAtomicReader().getSortedDocValues(fieldName);
	si = single == null ? null : DocValues.singleton(single);
	if (single instanceof MultiSortedDocValues) {
	ordinalMap = ((MultiSortedDocValues)single).mapping;
	}
	}
	if (si == null) {
	si = DocValues.EMPTY_SORTED_SET;
	}
	if (si.getValueCount() >= Integer.MAX_VALUE) {
	throw new UnsupportedOperationException("Currently this stats method is limited to " + Integer.MAX_VALUE + " unique terms");
	}

	DocSet missing = docs.andNot( searcher.getDocSet(new TermRangeQuery(fieldName, null, null, false, false)));

	final int nTerms = (int) si.getValueCount();

	// count collection array only needs to be as big as the number of terms we are
	// going to collect counts for.
	final int[] counts = new int[nTerms];

	Filter filter = docs.getTopFilter();
	List<AtomicReaderContext> leaves = searcher.getTopReaderContext().leaves();
	for (int subIndex = 0; subIndex < leaves.size(); subIndex++) {
	AtomicReaderContext leaf = leaves.get(subIndex);
	DocIdSet dis = filter.getDocIdSet(leaf, null); // solr docsets already exclude any deleted docs
	DocIdSetIterator disi = null;
	if (dis != null) {
	disi = dis.iterator();
	}
	if (disi != null) {
	int docBase = leaf.docBase;
	if (multiValued) {
	SortedSetDocValues sub = leaf.reader().getSortedSetDocValues(fieldName);
	if (sub == null) {
	sub = DocValues.EMPTY_SORTED_SET;
	}
	final SortedDocValues singleton = DocValues.unwrapSingleton(sub);
	if (singleton != null) {
	// some codecs may optimize SORTED_SET storage for single-valued fields
	accumSingle(counts, docBase, facetStats, singleton, disi, subIndex, ordinalMap);
	} else {
	accumMulti(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
	}
	} else {
	SortedDocValues sub = leaf.reader().getSortedDocValues(fieldName);
	if (sub == null) {
	sub = DocValues.EMPTY_SORTED;
	}
	accumSingle(counts, docBase, facetStats, sub, disi, subIndex, ordinalMap);
	}
	}
	}

	// add results in index order
	BytesRef value = new BytesRef();
	for (int ord = 0; ord < counts.length; ord++) {
	int count = counts[ord];
	if (count > 0) {
	si.lookupOrd(ord, value);
	res.accumulate(value, count);
	for (FieldFacetStats f : facetStats) {
	f.accumulateTermNum(ord, value);
	}
	}
	}

	res.addMissing(missing.size());
	if (facetStats.length > 0) {
	for (FieldFacetStats f : facetStats) {
	Map<String, StatsValues> facetStatsValues = f.facetStatsValues;
	FieldType facetType = searcher.getSchema().getFieldType(f.name);
	for (Map.Entry<String,StatsValues> entry : facetStatsValues.entrySet()) {
	String termLabel = entry.getKey();
	int missingCount = searcher.numDocs(new TermQuery(new Term(f.name, facetType.toInternal(termLabel))), missing);
	entry.getValue().addMissing(missingCount);
	}
	res.addFacet(f.name, facetStatsValues);
	}
	}
	return res;
	}

	/** accumulates per-segment single-valued stats */
	static void accumSingle(int counts[], int docBase, FieldFacetStats[] facetStats, SortedDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
	int doc;
	while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
	int term = si.getOrd(doc);
	if (term >= 0) {
	if (map != null) {
	term = (int) map.getGlobalOrd(subIndex, term);
	}
	counts[term]++;
	for (FieldFacetStats f : facetStats) {
	f.facetTermNum(docBase + doc, term);
	}
	}
	}
	}

	/** accumulates per-segment multi-valued stats */
	static void accumMulti(int counts[], int docBase, FieldFacetStats[] facetStats, SortedSetDocValues si, DocIdSetIterator disi, int subIndex, OrdinalMap map) throws IOException {
	int doc;
	while ((doc = disi.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
	si.setDocument(doc);
	long ord;
	while ((ord = si.nextOrd()) != SortedSetDocValues.NO_MORE_ORDS) {
	int term = (int) ord;
	if (map != null) {
	term = (int) map.getGlobalOrd(subIndex, term);
	}
	counts[term]++;
	for (FieldFacetStats f : facetStats) {
	f.facetTermNum(docBase + doc, term);
	}
	}
	}
	}
	}