package org.apache.solr.handler.component;

import org.apache.lucene.document.Document;
import org.apache.lucene.document.SetBasedFieldSelector;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.Term;
import org.apache.lucene.index.TermEnum;
import org.apache.lucene.index.TermVectorMapper;
import org.apache.lucene.index.TermVectorOffsetInfo;
import org.apache.solr.common.SolrException;
import org.apache.solr.common.params.CommonParams;
import org.apache.solr.common.params.ModifiableSolrParams;
import org.apache.solr.common.params.SolrParams;
import org.apache.solr.common.params.TermVectorParams;
import org.apache.solr.common.util.NamedList;
import org.apache.solr.common.util.StrUtils;
import org.apache.solr.core.SolrCore;
import org.apache.solr.schema.IndexSchema;
import org.apache.solr.search.DocList;
import org.apache.solr.search.DocListAndSet;
import org.apache.solr.search.SolrIndexSearcher;
import org.apache.solr.util.plugin.SolrCoreAware;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Set;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */


/**
 * Return term vectors for the documents in a query result set.
 * <p/>
 * Information available per term:
 * term frequency (tf), positions, offsets, document frequency (df), and the tf/df ratio ("tf-idf").
 * <p/>
 * <b>Note</b>: Returning document frequency (and hence tf-idf) can be expensive, as it
 * requires a term dictionary lookup for every term in the vector.
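 * <p/>
 * A minimal request sketch (the literal parameter names below assume the usual string values
 * of the {@link TermVectorParams} constants, e.g. "tv.tf" for {@link TermVectorParams#TF}):
 * <pre>
 *   /select?q=*:*&amp;fl=id&amp;tv=true&amp;tv.tf=true&amp;tv.positions=true&amp;tv.offsets=true
 * </pre>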
 */
public class TermVectorComponent extends SearchComponent implements SolrCoreAware {

  public static final String COMPONENT_NAME = "tv";

  protected NamedList initParams;
  public static final String TERM_VECTORS = "termVectors";

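  /**
   * Builds the "termVectors" section of the response: for each document in the result set
   * (or for the explicitly supplied doc ids), loads its term vector and writes the requested
   * statistics (tf, positions, offsets, df, tf-idf) along with the document's unique key value.
   */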
  public void process(ResponseBuilder rb) throws IOException {
    SolrParams params = rb.req.getParams();
    if (!params.getBool(COMPONENT_NAME, false)) {
      return;
    }

    NamedList termVectors = new NamedList();
    rb.rsp.add(TERM_VECTORS, termVectors);
    //figure out what options we have, and try to get the appropriate vector
    boolean termFreq = params.getBool(TermVectorParams.TF, false);
    boolean positions = params.getBool(TermVectorParams.POSITIONS, false);
    boolean offsets = params.getBool(TermVectorParams.OFFSETS, false);
    boolean docFreq = params.getBool(TermVectorParams.DF, false);
    boolean tfIdf = params.getBool(TermVectorParams.TF_IDF, false);
    //boolean cacheIdf = params.getBool(TermVectorParams.IDF, false);
    //short cut to all values.
    boolean all = params.getBool(TermVectorParams.ALL, false);
    if (all == true) {
      termFreq = true;
      positions = true;
      offsets = true;
      docFreq = true;
      tfIdf = true;
    }

    String[] fields = params.getParams(TermVectorParams.FIELDS);
    if (fields == null) {
      fields = params.getParams(CommonParams.FL);
    }
    DocListAndSet listAndSet = rb.getResults();
    List<Integer> docIds = getInts(params.getParams(TermVectorParams.DOC_IDS));
    Iterator<Integer> iter;
    if (docIds != null && docIds.isEmpty() == false) {
      iter = docIds.iterator();
    } else {
      DocList list = listAndSet.docList;
      iter = list.iterator();
    }
    SolrIndexSearcher searcher = rb.req.getSearcher();

    IndexReader reader = searcher.getReader();
    //the TVMapper is a TermVectorMapper which can be used to optimize loading of Term Vectors
    TVMapper mapper = new TVMapper(fields, reader, termFreq, positions, offsets, docFreq, tfIdf);
    IndexSchema schema = rb.req.getSchema();
    String uniqFieldName = schema.getUniqueKeyField().getName();
    //Only load the id field
    SetBasedFieldSelector fieldSelector = new SetBasedFieldSelector(Collections.singleton(uniqFieldName), Collections.<String>emptySet());
    while (iter.hasNext()) {
      Integer docId = iter.next();
      NamedList docNL = new NamedList();
      termVectors.add("doc-" + docId, docNL);
      mapper.docNL = docNL;
      Document document = reader.document(docId, fieldSelector);
      String uniqId = document.get(uniqFieldName);
      docNL.add("uniqueKey", uniqId);
      reader.getTermFreqVector(docId, mapper);
    }
    termVectors.add("uniqueKeyFieldName", uniqFieldName);
  }

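  /**
   * Parses the raw doc id parameter values into Integers. Returns null when no values were
   * supplied, and throws a BAD_REQUEST SolrException if any value is not a valid integer.
   */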
  private List<Integer> getInts(String[] vals) {
    List<Integer> result = null;
    if (vals != null && vals.length > 0) {
      result = new ArrayList<Integer>(vals.length);
      for (int i = 0; i < vals.length; i++) {
        try {
          result.add(Integer.valueOf(vals[i]));
        } catch (NumberFormatException e) {
          throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, e.getMessage(), e);
        }
      }
    }
    return result;
  }

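  /**
   * During the GET_FIELDS stage of a distributed request, groups the result ids by shard and
   * sends each shard a follow-up request asking for term vectors of only the documents it holds.
   */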
  @Override
  public int distributedProcess(ResponseBuilder rb) throws IOException {
    int result = ResponseBuilder.STAGE_DONE;
    if (rb.stage == ResponseBuilder.STAGE_GET_FIELDS) {
      //Go ask each shard for its vectors
      // for each shard, collect the documents for that shard.
      HashMap<String, Collection<ShardDoc>> shardMap = new HashMap<String, Collection<ShardDoc>>();
      for (ShardDoc sdoc : rb.resultIds.values()) {
        Collection<ShardDoc> shardDocs = shardMap.get(sdoc.shard);
        if (shardDocs == null) {
          shardDocs = new ArrayList<ShardDoc>();
          shardMap.put(sdoc.shard, shardDocs);
        }
        shardDocs.add(sdoc);
      }
      // Now create a request for each shard to retrieve the stored fields
      for (Collection<ShardDoc> shardDocs : shardMap.values()) {
        ShardRequest sreq = new ShardRequest();
        sreq.purpose = ShardRequest.PURPOSE_GET_FIELDS;

        sreq.shards = new String[]{shardDocs.iterator().next().shard};

        sreq.params = new ModifiableSolrParams();

        // add original params
        sreq.params.add(rb.req.getParams());
        sreq.params.remove(CommonParams.Q); //remove the query
        ArrayList<String> ids = new ArrayList<String>(shardDocs.size());
        for (ShardDoc shardDoc : shardDocs) {
          ids.add(shardDoc.id.toString());
        }
        sreq.params.add(TermVectorParams.DOC_IDS, StrUtils.join(ids, ','));

        rb.addRequest(this, sreq);
      }
      result = ResponseBuilder.STAGE_DONE;
    }
    return result;
  }

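  /**
   * TermVectorMapper that writes term vector entries directly into the response NamedList,
   * restricted to the requested fields and to the options (tf, positions, offsets, df, tf-idf)
   * chosen at construction time.
   */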
  private static class TVMapper extends TermVectorMapper {
    private NamedList docNL;
    private IndexReader reader;
    private Set<String> fields;
    private boolean termFreq, positions, offsets, docFreq, tfIdf;
    //internal vars not passed in by construction
    private boolean map, useOffsets, usePositions;
    //private Map<String, Integer> idfCache;
    private NamedList fieldNL;
    private Term currentTerm;

    /**
     * @param fields    the fields to return vectors for; if null or empty, all fields with stored vectors are mapped
     * @param reader    the IndexReader used to look up document frequencies
     * @param termFreq  true if term frequency (tf) should be returned for each term
     * @param positions true if the TVM should try to get position info from the Term Vector, assuming it is present
     * @param offsets   true if the TVM should try to get offset info from the Term Vector, assuming it is present
     * @param docFreq   true if document frequency (df) should be returned for each term
     * @param tfIdf     true if the tf/df ratio (reported as "tf-idf") should be returned for each term
     */
    public TVMapper(String[] fields, IndexReader reader, boolean termFreq, boolean positions, boolean offsets, boolean docFreq, boolean tfIdf) {
      this.reader = reader;
      this.fields = fields != null ? new HashSet<String>(Arrays.asList(fields)) : Collections.<String>emptySet();
      this.termFreq = termFreq;
      this.positions = positions;
      this.offsets = offsets;
      this.docFreq = docFreq;
      this.tfIdf = tfIdf;
    }

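    /**
     * Called once for each term in the vector of the current field; records the requested
     * statistics (tf, offsets, positions, df, tf-idf) for the term when the field is being mapped.
     */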
    public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
      if (map == true && fieldNL != null) {
        NamedList termInfo = new NamedList();
        fieldNL.add(term, termInfo);
        if (termFreq == true) {
          termInfo.add("tf", frequency);
        }
        if (useOffsets == true) {
          NamedList theOffsets = new NamedList();
          termInfo.add("offsets", theOffsets);
          for (int i = 0; i < offsets.length; i++) {
            TermVectorOffsetInfo offset = offsets[i];
            theOffsets.add("start", offset.getStartOffset());
            theOffsets.add("end", offset.getEndOffset());
          }
        }
        if (usePositions == true) {
          NamedList positionsNL = new NamedList();
          for (int i = 0; i < positions.length; i++) {
            positionsNL.add("position", positions[i]);
          }
          termInfo.add("positions", positionsNL);
        }
        if (docFreq == true) {
          termInfo.add("df", getDocFreq(term));
        }
        if (tfIdf == true) {
          double tfIdfVal = ((double) frequency) / getDocFreq(term);
          termInfo.add("tf-idf", tfIdfVal);
        }
      }
    }

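    /**
     * Looks up the document frequency of the given term text (in the current field) via a
     * TermEnum on the reader. Falls back to 1 if the term cannot be found, so the tf/df
     * division never divides by zero.
     */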
    private int getDocFreq(String term) {
      int result = 1;
      currentTerm = currentTerm.createTerm(term);
      try {
        TermEnum termEnum = reader.terms(currentTerm);
        try {
          if (termEnum != null && termEnum.term() != null && termEnum.term().equals(currentTerm)) {
            result = termEnum.docFreq();
          }
        } finally {
          if (termEnum != null) {
            termEnum.close();
          }
        }
      } catch (IOException e) {
        throw new RuntimeException(e);
      }
      return result;
    }

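    /**
     * Called once per field before its terms are mapped. Decides whether the field should be
     * mapped at all (based on the requested field set) and whether position/offset info is
     * both wanted and actually stored in the vector.
     */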
    public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
      // currentTerm is needed whenever getDocFreq() will be called, i.e. for df or tf-idf
      if ((docFreq == true || tfIdf == true) && reader != null) {
        this.currentTerm = new Term(field);
      }
      useOffsets = storeOffsets && offsets;
      usePositions = storePositions && positions;
      if (fields.isEmpty() || fields.contains(field)) {
        map = true;
        fieldNL = new NamedList();
        docNL.add(field, fieldNL);
      } else {
        map = false;
        fieldNL = null;
      }
    }

    @Override
    public boolean isIgnoringPositions() {
      return this.positions == false; // if we are not interested in positions, then return true telling Lucene to skip loading them
    }

    @Override
    public boolean isIgnoringOffsets() {
      return this.offsets == false; // if we are not interested in offsets, then return true telling Lucene to skip loading them
    }
  }

  public void prepare(ResponseBuilder rb) throws IOException {

  }

  //////////////////////// NamedListInitializedPlugin methods //////////////////////
  @Override
  public void init(NamedList args) {
    super.init(args);
    this.initParams = args;
  }

  public void inform(SolrCore core) {

  }

  public String getVersion() {
    return "$Revision$";
  }

  public String getSourceId() {
    return "$Id:$";
  }

  public String getSource() {
    return "$Revision:$";
  }

  public String getDescription() {
    return "A Component for working with Term Vectors";
  }
}