src/java/org/apache/lucene/index/TermInfosReader.java - lucene-solr - Git at Google

 package org.apache.lucene.index;

 /**
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import java.io.IOException;

 import org.apache.lucene.store.Directory;
 import org.apache.lucene.store.BufferedIndexInput;
 import org.apache.lucene.util.cache.Cache;
 import org.apache.lucene.util.cache.SimpleLRUCache;
 import org.apache.lucene.util.CloseableThreadLocal;

 /** This stores a monotonically increasing set of &lt;Term, TermInfo&gt; pairs in a
  * Directory.  Pairs are accessed either by Term or by ordinal position in the
  * set.
  *
  * <p>The in-memory term index is read lazily, the first time a lookup needs
  * it.  Each thread works on its own enumerator and TermInfo cache, held in a
  * CloseableThreadLocal.</p>
  */
 final class TermInfosReader {
   private Directory directory;
   private String segment;
   private FieldInfos fieldInfos;

   private CloseableThreadLocal threadResources = new CloseableThreadLocal();
   private SegmentTermEnum origEnum;
   private long size;

   // In-memory term index.  indexTerms stays null until ensureIndexIsRead()
   // has completely filled all three arrays.
   private Term[] indexTerms = null;
   private TermInfo[] indexInfos;
   private long[] indexPointers;

   // Enumerator over the term index file; closed and set to null by
   // ensureIndexIsRead() once it has been consumed.
   private SegmentTermEnum indexEnum;

   private int indexDivisor = 1;
   private int totalIndexInterval;

   private final static int DEFAULT_CACHE_SIZE = 1024;

   /**
    * Per-thread resources managed by ThreadLocal
    */
   private static final class ThreadResources {
     SegmentTermEnum termEnum;

     // Used for caching the most recently looked-up Terms
     Cache termInfoCache;
   }

   /**
    * Opens the reader with the default read buffer size
    * ({@link BufferedIndexInput#BUFFER_SIZE}).
    */
   TermInfosReader(Directory dir, String seg, FieldInfos fis)
        throws CorruptIndexException, IOException {
     this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
   }

   /**
    * Opens the terms file and the terms index file of the given segment.
    * If either cannot be opened, whatever was opened so far is closed again
    * before the exception propagates.
    *
    * @param dir directory holding the segment files
    * @param seg segment name, used as the file name prefix
    * @param fis field infos handed to the term enumerators
    * @param readBufferSize buffer size passed to Directory.openInput
    */
   TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize)
        throws CorruptIndexException, IOException {
     boolean success = false;

     try {
       directory = dir;
       segment = seg;
       fieldInfos = fis;

       origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
           readBufferSize), fieldInfos, false);
       size = origEnum.size;
       totalIndexInterval = origEnum.indexInterval;

       indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
           readBufferSize), fieldInfos, true);

       success = true;
     } finally {
       // With lock-less commits, it's entirely possible (and
       // fine) to hit a FileNotFound exception above. In
       // this case, we want to explicitly close any subset
       // of things that were opened so that we don't have to
       // wait for a GC to do so.
       if (!success) {
         close();
       }
     }
   }

   /** Returns the skipInterval reported by the terms enumerator. */
   public int getSkipInterval() {
     return origEnum.skipInterval;
   }

   /** Returns the maxSkipLevels reported by the terms enumerator. */
   public int getMaxSkipLevels() {
     return origEnum.maxSkipLevels;
   }

   /**
    * <p>Sets the indexDivisor, which subsamples the number
    * of indexed terms loaded into memory.  This has a
    * similar effect as {@link
    * IndexWriter#setTermIndexInterval} except that setting
    * must be done at indexing time while this setting can be
    * set per reader.  When set to N, then one in every
    * N*termIndexInterval terms in the index is loaded into
    * memory.  By setting this to a value &gt; 1 you can reduce
    * memory usage, at the expense of higher latency when
    * loading a TermInfo.  The default value is 1.</p>
    *
    * <b>NOTE:</b> you must call this before the term
    * index is loaded.  If the index is already loaded,
    * an IllegalStateException is thrown.
    *
    * @throws IllegalArgumentException if indexDivisor is
    * less than 1.
    * @throws IllegalStateException if the term index has
    * already been loaded into memory.
    */
   public void setIndexDivisor(int indexDivisor) throws IllegalStateException {
     if (indexDivisor < 1)
       throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor);

     if (indexTerms != null)
       throw new IllegalStateException("index terms are already loaded");

     this.indexDivisor = indexDivisor;
     totalIndexInterval = origEnum.indexInterval * indexDivisor;
   }

   /** Returns the indexDivisor.
    * @see #setIndexDivisor
    */
   public int getIndexDivisor() {
     return indexDivisor;
   }

   /**
    * Closes the term enumerators and the per-thread resources.  Every
    * resource gets its close attempt even when an earlier one throws.
    */
   final void close() throws IOException {
     try {
       if (origEnum != null)
         origEnum.close();
     } finally {
       try {
         if (indexEnum != null)
           indexEnum.close();
       } finally {
         threadResources.close();
       }
     }
   }

   /** Returns the number of term/value pairs in the set. */
   final long size() {
     return size;
   }

   /**
    * Returns the calling thread's resources, creating them (a clone of the
    * terms enumerator plus an LRU TermInfo cache) on first use.
    */
   private ThreadResources getThreadResources() {
     ThreadResources resources = (ThreadResources)threadResources.get();
     if (resources == null) {
       resources = new ThreadResources();
       resources.termEnum = terms();
       // Cache does not have to be thread-safe, it is only used by one thread at the same time
       resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE);
       threadResources.set(resources);
     }
     return resources;
   }

   /**
    * Loads the term index into memory unless that has already happened,
    * keeping one out of every indexDivisor entries of the index file.
    * The index arrays are only published once they are completely filled,
    * so a failed load never leaves a partially populated index behind.
    *
    * @throws IOException if reading fails, or if an earlier load attempt
    * failed and has already closed the index enumerator.
    */
   private synchronized void ensureIndexIsRead() throws IOException {
     if (indexTerms != null)                                    // index already read
       return;                                                  // do nothing

     // indexEnum is only set to null by the finally clause below, so a null
     // here means a previous load attempt did not complete.
     if (indexEnum == null)
       throw new IOException("term index could not be loaded: index enumerator is already closed");

     try {
       int indexSize = 1+((int)indexEnum.size-1)/indexDivisor;  // otherwise read index

       Term[] terms = new Term[indexSize];
       TermInfo[] infos = new TermInfo[indexSize];
       long[] pointers = new long[indexSize];

       for (int i = 0; indexEnum.next(); i++) {
         terms[i] = indexEnum.term();
         infos[i] = indexEnum.termInfo();
         pointers[i] = indexEnum.indexPointer;

         // skip the entries that the divisor subsamples away
         for (int j = 1; j < indexDivisor; j++)
           if (!indexEnum.next())
             break;
       }

       // Publish last: a non-null indexTerms is what marks the index as loaded.
       indexInfos = infos;
       indexPointers = pointers;
       indexTerms = terms;
     } finally {
       indexEnum.close();
       indexEnum = null;
     }
   }

   /** Returns the offset of the greatest index entry which is less than or equal to term.
    * The result is -1 when term sorts before every index entry. */
   private final int getIndexOffset(Term term) {
     int lo = 0;                                   // binary search indexTerms[]
     int hi = indexTerms.length - 1;

     while (hi >= lo) {
       int mid = (lo + hi) >>> 1;
       int delta = term.compareTo(indexTerms[mid]);
       if (delta < 0)
         hi = mid - 1;
       else if (delta > 0)
         lo = mid + 1;
       else
         return mid;
     }
     return hi;
   }

   /** Positions enumerator on the index entry at indexOffset.  The term
    * index must already be loaded. */
   private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
     enumerator.seek(indexPointers[indexOffset],
                    (indexOffset * totalIndexInterval) - 1,
                    indexTerms[indexOffset], indexInfos[indexOffset]);
   }

   /** Returns the TermInfo for a Term in the set, or null. */
   TermInfo get(Term term) throws IOException {
     return get(term, true);
   }

   /** Returns the TermInfo for a Term in the set, or null.
    * @param useCache when false, the per-thread TermInfo cache is neither
    * consulted nor updated.
    */
   private TermInfo get(Term term, boolean useCache) throws IOException {
     if (size == 0) return null;

     ensureIndexIsRead();

     TermInfo ti;
     ThreadResources resources = getThreadResources();
     Cache cache = null;

     if (useCache) {
       cache = resources.termInfoCache;
       // check the cache first if the term was recently looked up
       ti = (TermInfo) cache.get(term);
       if (ti != null) {
         return ti;
       }
     }

     // optimize sequential access: first try scanning cached enum w/o seeking
     SegmentTermEnum enumerator = resources.termEnum;
     if (enumerator.term() != null                 // term is at or past current
         && ((enumerator.prev() != null && term.compareTo(enumerator.prev()) > 0)
             || term.compareTo(enumerator.term()) >= 0)) {
       int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
       if (indexTerms.length == enumOffset         // but before end of block
           || term.compareTo(indexTerms[enumOffset]) < 0) {
         // no need to seek

         int numScans = enumerator.scanTo(term);
         if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
           ti = enumerator.termInfo();
           if (cache != null && numScans > 1) {
             // we only want to put this TermInfo into the cache if
             // scanEnum skipped more than one dictionary entry.
             // This prevents RangeQueries or WildcardQueries from
             // wiping out the cache when they iterate over a large number
             // of terms in order
             cache.put(term, ti);
           }
         } else {
           ti = null;
         }

         return ti;
       }
     }

     // random-access: must seek
     seekEnum(enumerator, getIndexOffset(term));
     enumerator.scanTo(term);
     if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
       ti = enumerator.termInfo();
       if (cache != null) {
         cache.put(term, ti);
       }
     } else {
       ti = null;
     }
     return ti;
   }

   /** Returns the nth term in the set, or null if the set is empty or the
    * enumeration ends before reaching position. */
   final Term get(int position) throws IOException {
     if (size == 0) return null;

     SegmentTermEnum enumerator = getThreadResources().termEnum;
     if (enumerator != null && enumerator.term() != null &&
         position >= enumerator.position &&
         position < (enumerator.position + totalIndexInterval))
       return scanEnum(enumerator, position);      // can avoid seek

     // seekEnum reads the in-memory index arrays, so they must be loaded first
     ensureIndexIsRead();
     seekEnum(enumerator, position/totalIndexInterval); // must seek
     return scanEnum(enumerator, position);
   }

   /** Advances enumerator until it reaches position and returns the term
    * there, or null if the enumeration ends first. */
   private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
     while(enumerator.position < position)
       if (!enumerator.next())
         return null;

     return enumerator.term();
   }

   /** Returns the position of a Term in the set or -1. */
   final long getPosition(Term term) throws IOException {
     if (size == 0) return -1;

     ensureIndexIsRead();
     int indexOffset = getIndexOffset(term);

     SegmentTermEnum enumerator = getThreadResources().termEnum;
     seekEnum(enumerator, indexOffset);

     // Scan forward while the enumerator is still before term.  The current
     // term is treated as possibly null (as get(Term) does), so running off
     // the end of the enumeration yields -1 rather than a failed comparison.
     Term current = enumerator.term();
     while (current != null && term.compareTo(current) > 0) {
       current = enumerator.next() ? enumerator.term() : null;
     }

     if (current != null && term.compareTo(current) == 0)
       return enumerator.position;
     else
       return -1;
   }

   /** Returns an enumeration of all the Terms and TermInfos in the set. */
   public SegmentTermEnum terms() {
     return (SegmentTermEnum)origEnum.clone();
   }

   /** Returns an enumeration of terms starting at or after the named term. */
   public SegmentTermEnum terms(Term term) throws IOException {
     // don't use the cache in this call because we want to reposition the
     // enumeration
     get(term, false);
     return (SegmentTermEnum)getThreadResources().termEnum.clone();
   }
 }
	package org.apache.lucene.index;

	/**
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import java.io.IOException;

	import org.apache.lucene.store.Directory;
	import org.apache.lucene.store.BufferedIndexInput;
	import org.apache.lucene.util.cache.Cache;
	import org.apache.lucene.util.cache.SimpleLRUCache;
	import org.apache.lucene.util.CloseableThreadLocal;

	/** This stores a monotonically increasing set of <Term, TermInfo> pairs in a
	* Directory. Pairs are accessed either by Term or by ordinal position the
	* set. */

	final class TermInfosReader {
	private Directory directory;
	private String segment;
	private FieldInfos fieldInfos;

	private CloseableThreadLocal threadResources = new CloseableThreadLocal();
	private SegmentTermEnum origEnum;
	private long size;

	private Term[] indexTerms = null;
	private TermInfo[] indexInfos;
	private long[] indexPointers;

	private SegmentTermEnum indexEnum;

	private int indexDivisor = 1;
	private int totalIndexInterval;

	private final static int DEFAULT_CACHE_SIZE = 1024;

	/**
	* Per-thread resources managed by ThreadLocal
	*/
	private static final class ThreadResources {
	SegmentTermEnum termEnum;

	// Used for caching the least recently looked-up Terms
	Cache termInfoCache;
	}

	TermInfosReader(Directory dir, String seg, FieldInfos fis)
	throws CorruptIndexException, IOException {
	this(dir, seg, fis, BufferedIndexInput.BUFFER_SIZE);
	}

	TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize)
	throws CorruptIndexException, IOException {
	boolean success = false;

	try {
	directory = dir;
	segment = seg;
	fieldInfos = fis;

	origEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_EXTENSION,
	readBufferSize), fieldInfos, false);
	size = origEnum.size;
	totalIndexInterval = origEnum.indexInterval;

	indexEnum = new SegmentTermEnum(directory.openInput(segment + "." + IndexFileNames.TERMS_INDEX_EXTENSION,
	readBufferSize), fieldInfos, true);

	success = true;
	} finally {
	// With lock-less commits, it's entirely possible (and
	// fine) to hit a FileNotFound exception above. In
	// this case, we want to explicitly close any subset
	// of things that were opened so that we don't have to
	// wait for a GC to do so.
	if (!success) {
	close();
	}
	}
	}

	public int getSkipInterval() {
	return origEnum.skipInterval;
	}

	public int getMaxSkipLevels() {
	return origEnum.maxSkipLevels;
	}

	/**
	* <p>Sets the indexDivisor, which subsamples the number
	* of indexed terms loaded into memory. This has a
	* similar effect as {@link
	* IndexWriter#setTermIndexInterval} except that setting
	* must be done at indexing time while this setting can be
	* set per reader. When set to N, then one in every
	* N*termIndexInterval terms in the index is loaded into
	* memory. By setting this to a value > 1 you can reduce
	* memory usage, at the expense of higher latency when
	* loading a TermInfo. The default value is 1.</p>
	*
	* <b>NOTE:</b> you must call this before the term
	* index is loaded. If the index is already loaded,
	* an IllegalStateException is thrown.
	*
	+ @throws IllegalStateException if the term index has
	* already been loaded into memory.
	*/
	public void setIndexDivisor(int indexDivisor) throws IllegalStateException {
	if (indexDivisor < 1)
	throw new IllegalArgumentException("indexDivisor must be > 0: got " + indexDivisor);

	if (indexTerms != null)
	throw new IllegalStateException("index terms are already loaded");

	this.indexDivisor = indexDivisor;
	totalIndexInterval = origEnum.indexInterval * indexDivisor;
	}

	/** Returns the indexDivisor.
	* @see #setIndexDivisor
	*/
	public int getIndexDivisor() {
	return indexDivisor;
	}

	final void close() throws IOException {
	if (origEnum != null)
	origEnum.close();
	if (indexEnum != null)
	indexEnum.close();
	threadResources.close();
	}

	/** Returns the number of term/value pairs in the set. */
	final long size() {
	return size;
	}

	private ThreadResources getThreadResources() {
	ThreadResources resources = (ThreadResources)threadResources.get();
	if (resources == null) {
	resources = new ThreadResources();
	resources.termEnum = terms();
	// Cache does not have to be thread-safe, it is only used by one thread at the same time
	resources.termInfoCache = new SimpleLRUCache(DEFAULT_CACHE_SIZE);
	threadResources.set(resources);
	}
	return resources;
	}

	private synchronized void ensureIndexIsRead() throws IOException {
	if (indexTerms != null) // index already read
	return; // do nothing
	try {
	int indexSize = 1+((int)indexEnum.size-1)/indexDivisor; // otherwise read index

	indexTerms = new Term[indexSize];
	indexInfos = new TermInfo[indexSize];
	indexPointers = new long[indexSize];

	for (int i = 0; indexEnum.next(); i++) {
	indexTerms[i] = indexEnum.term();
	indexInfos[i] = indexEnum.termInfo();
	indexPointers[i] = indexEnum.indexPointer;

	for (int j = 1; j < indexDivisor; j++)
	if (!indexEnum.next())
	break;
	}
	} finally {
	indexEnum.close();
	indexEnum = null;
	}
	}

	/** Returns the offset of the greatest index entry which is less than or equal to term.*/
	private final int getIndexOffset(Term term) {
	int lo = 0; // binary search indexTerms[]
	int hi = indexTerms.length - 1;

	while (hi >= lo) {
	int mid = (lo + hi) >>> 1;
	int delta = term.compareTo(indexTerms[mid]);
	if (delta < 0)
	hi = mid - 1;
	else if (delta > 0)
	lo = mid + 1;
	else
	return mid;
	}
	return hi;
	}

	private final void seekEnum(SegmentTermEnum enumerator, int indexOffset) throws IOException {
	enumerator.seek(indexPointers[indexOffset],
	(indexOffset * totalIndexInterval) - 1,
	indexTerms[indexOffset], indexInfos[indexOffset]);
	}

	/** Returns the TermInfo for a Term in the set, or null. */
	TermInfo get(Term term) throws IOException {
	return get(term, true);
	}

	/** Returns the TermInfo for a Term in the set, or null. */
	private TermInfo get(Term term, boolean useCache) throws IOException {
	if (size == 0) return null;

	ensureIndexIsRead();

	TermInfo ti;
	ThreadResources resources = getThreadResources();
	Cache cache = null;

	if (useCache) {
	cache = resources.termInfoCache;
	// check the cache first if the term was recently looked up
	ti = (TermInfo) cache.get(term);
	if (ti != null) {
	return ti;
	}
	}

	// optimize sequential access: first try scanning cached enum w/o seeking
	SegmentTermEnum enumerator = resources.termEnum;
	if (enumerator.term() != null // term is at or past current
	&& ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
	\|\| term.compareTo(enumerator.term()) >= 0)) {
	int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
	if (indexTerms.length == enumOffset // but before end of block
	\|\| term.compareTo(indexTerms[enumOffset]) < 0) {
	// no need to seek

	int numScans = enumerator.scanTo(term);
	if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
	ti = enumerator.termInfo();
	if (cache != null && numScans > 1) {
	// we only want to put this TermInfo into the cache if
	// scanEnum skipped more than one dictionary entry.
	// This prevents RangeQueries or WildcardQueries to
	// wipe out the cache when they iterate over a large numbers
	// of terms in order
	cache.put(term, ti);
	}
	} else {
	ti = null;
	}

	return ti;
	}
	}

	// random-access: must seek
	seekEnum(enumerator, getIndexOffset(term));
	enumerator.scanTo(term);
	if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
	ti = enumerator.termInfo();
	if (cache != null) {
	cache.put(term, ti);
	}
	} else {
	ti = null;
	}
	return ti;
	}

	/** Returns the nth term in the set. */
	final Term get(int position) throws IOException {
	if (size == 0) return null;

	SegmentTermEnum enumerator = getThreadResources().termEnum;
	if (enumerator != null && enumerator.term() != null &&
	position >= enumerator.position &&
	position < (enumerator.position + totalIndexInterval))
	return scanEnum(enumerator, position); // can avoid seek

	seekEnum(enumerator, position/totalIndexInterval); // must seek
	return scanEnum(enumerator, position);
	}

	private final Term scanEnum(SegmentTermEnum enumerator, int position) throws IOException {
	while(enumerator.position < position)
	if (!enumerator.next())
	return null;

	return enumerator.term();
	}

	/** Returns the position of a Term in the set or -1. */
	final long getPosition(Term term) throws IOException {
	if (size == 0) return -1;

	ensureIndexIsRead();
	int indexOffset = getIndexOffset(term);

	SegmentTermEnum enumerator = getThreadResources().termEnum;
	seekEnum(enumerator, indexOffset);

	while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}

	if (term.compareTo(enumerator.term()) == 0)
	return enumerator.position;
	else
	return -1;
	}

	/** Returns an enumeration of all the Terms and TermInfos in the set. */
	public SegmentTermEnum terms() {
	return (SegmentTermEnum)origEnum.clone();
	}

	/** Returns an enumeration of terms starting at or after the named term. */
	public SegmentTermEnum terms(Term term) throws IOException {
	// don't use the cache in this call because we want to reposition the
	// enumeration
	get(term, false);
	return (SegmentTermEnum)getThreadResources().termEnum.clone();
	}
	}