blob: c66e0584dee95cb654e5b63674ca9c4fe93bb973 [file] [log] [blame]
package org.apache.lucene.codecs.lucene3x;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.Arrays;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Map;
import java.util.Set;

import org.apache.lucene.codecs.TermVectorsReader;
import org.apache.lucene.index.CorruptIndexException;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldInfos;
import org.apache.lucene.index.Fields;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexFormatTooNewException;
import org.apache.lucene.index.IndexFormatTooOldException;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.CompoundFileDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.IOUtils;
/** @deprecated */
class Lucene3xTermVectorsReader extends TermVectorsReader {
// NOTE: if you make a new format, it must be larger than
// the current format

// Changed strings to UTF8 with length-in-bytes not length-in-chars
static final int FORMAT_UTF8_LENGTH_IN_BYTES = 4;

// NOTE: always change this if you switch to a new format!
// whenever you add a new format, make it 1 larger (positive version logic)!
// Upper bound accepted by checkValidFormat.
public static final int FORMAT_CURRENT = FORMAT_UTF8_LENGTH_IN_BYTES;

// when removing support for old versions, leave the last supported version here
// Lower bound accepted by checkValidFormat.
public static final int FORMAT_MINIMUM = FORMAT_UTF8_LENGTH_IN_BYTES;

// The size in bytes that the FORMAT_VERSION will take up at the beginning of each file
static final int FORMAT_SIZE = 4;
// Bit flags tested against the per-field header byte that TVTermsEnum.reset
// reads from the tvf input.
/** Flag bit: positions are stored for the field's term vector. */
public static final byte STORE_POSITIONS_WITH_TERMVECTOR = 0x1;
/** Flag bit: start/end offsets are stored for the field's term vector. */
public static final byte STORE_OFFSET_WITH_TERMVECTOR = 0x2;
/** Extension of vectors fields file */
public static final String VECTORS_FIELDS_EXTENSION = "tvf";
/** Extension of vectors documents file */
public static final String VECTORS_DOCUMENTS_EXTENSION = "tvd";
/** Extension of vectors index file */
public static final String VECTORS_INDEX_EXTENSION = "tvx";
// Used to map the field numbers read from tvd to field names.
private FieldInfos fieldInfos;
// Inputs over the vectors index (tvx), documents (tvd) and fields (tvf) files.
// clone() treats any of these being null as "no term vectors available".
private IndexInput tvx;
private IndexInput tvd;
private IndexInput tvf;
// Number of documents served by this reader (returned by size()).
private int size;
// Number of 16-byte entries in the tvx file (computed as tvx.length() >> 4);
// get() bounds-checks docIDs against this value.
private int numTotalDocs;
// The docID offset where our docs begin in the index
// file. This will be 0 if we have our own private file.
private int docStoreOffset;
// when we are inside a compound share doc store (CFX),
// (lucene 3.0 indexes only), we privately open our own fd.
// TODO: if we are worried, maybe we could eliminate the
// extra fd somehow when you also have vectors...
private final CompoundFileDirectory storeCFSReader;
// Format version read from the header of the tvx file by checkValidFormat.
private final int format;
/**
 * Builds a reader directly from already-opened inputs and precomputed
 * metadata. Used by {@link #clone()}; no file is opened and no header is
 * validated here.
 */
Lucene3xTermVectorsReader(FieldInfos fieldInfos, IndexInput tvx, IndexInput tvd, IndexInput tvf, int size, int numTotalDocs, int docStoreOffset, int format) {
  // A reader built this way never holds a compound doc-store directory.
  this.storeCFSReader = null;
  this.format = format;

  // Metadata copied from the source reader.
  this.fieldInfos = fieldInfos;
  this.docStoreOffset = docStoreOffset;
  this.numTotalDocs = numTotalDocs;
  this.size = size;

  // Inputs handed in by the caller (may be null).
  this.tvf = tvf;
  this.tvd = tvd;
  this.tvx = tvx;
}
/**
 * Opens the term vector files (tvx, tvd, tvf) of a segment and validates
 * the format header of each one.
 *
 * @param d directory to open the files from; replaced by a privately opened
 *          compound doc-store directory when the segment shares a compound
 *          doc store
 * @param si segment whose vectors are read
 * @param fieldInfos field metadata used to resolve field numbers to names
 * @param context IO context passed to every {@code openInput} call
 * @throws IndexFormatTooOldException / IndexFormatTooNewException (via
 *         {@code checkValidFormat}) if a header is outside the supported range
 * @throws IOException if a file cannot be opened or read; anything opened so
 *         far is closed before the exception propagates
 */
public Lucene3xTermVectorsReader(Directory d, SegmentInfo si, FieldInfos fieldInfos, IOContext context)
    throws CorruptIndexException, IOException {
  final String segment = si.getDocStoreSegment();
  final int docStoreOffset = si.getDocStoreOffset();
  final int size = si.docCount;

  boolean success = false;

  try {
    if (docStoreOffset != -1 && si.getDocStoreIsCompoundFile()) {
      // Shared compound doc store: all three files live inside it.
      d = storeCFSReader = new CompoundFileDirectory(si.dir,
          IndexFileNames.segmentFileName(segment, "", Lucene3xCodec.COMPOUND_FILE_STORE_EXTENSION), context, false);
    } else {
      storeCFSReader = null;
    }

    String idxName = IndexFileNames.segmentFileName(segment, "", VECTORS_INDEX_EXTENSION);
    tvx = d.openInput(idxName, context);
    format = checkValidFormat(tvx);

    String fn = IndexFileNames.segmentFileName(segment, "", VECTORS_DOCUMENTS_EXTENSION);
    tvd = d.openInput(fn, context);
    final int tvdFormat = checkValidFormat(tvd);

    fn = IndexFileNames.segmentFileName(segment, "", VECTORS_FIELDS_EXTENSION);
    tvf = d.openInput(fn, context);
    final int tvfFormat = checkValidFormat(tvf);

    assert format == tvdFormat;
    assert format == tvfFormat;

    // tvx holds one 16 byte entry per document.
    numTotalDocs = (int) (tvx.length() >> 4);

    if (-1 == docStoreOffset) {
      this.docStoreOffset = 0;
      this.size = numTotalDocs;
      assert size == 0 || numTotalDocs == size;
    } else {
      this.docStoreOffset = docStoreOffset;
      this.size = size;
      // Verify the file is long enough to hold all of our
      // docs
      assert numTotalDocs >= size + docStoreOffset: "numTotalDocs=" + numTotalDocs + " size=" + size + " docStoreOffset=" + docStoreOffset;
    }

    this.fieldInfos = fieldInfos;
    success = true;
  } finally {
    // With lock-less commits, it's entirely possible (and
    // fine) to hit a FileNotFound exception above. In
    // this case, we want to explicitly close any subset
    // of things that were opened so that we don't have to
    // wait for a GC to do so.
    if (!success) {
      try {
        close();
      } catch (Throwable ignored) {
        // Deliberately ignored so that the original exception is the one thrown.
      }
    }
  }
}
// Not private to avoid synthetic access$NNN methods
void seekTvx(final int docNum) throws IOException { + docStoreOffset) * 16L + FORMAT_SIZE);
private int checkValidFormat(IndexInput in) throws CorruptIndexException, IOException
int format = in.readInt();
if (format < FORMAT_MINIMUM)
throw new IndexFormatTooOldException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
if (format > FORMAT_CURRENT)
throw new IndexFormatTooNewException(in, format, FORMAT_MINIMUM, FORMAT_CURRENT);
return format;
public void close() throws IOException {
IOUtils.close(tvx, tvd, tvf, storeCFSReader);
* @return The number of documents in the reader
int size() {
return size;
private class TVFields extends Fields {
private final int[] fieldNumbers;
private final long[] fieldFPs;
private final Map<Integer,Integer> fieldNumberToIndex = new HashMap<Integer,Integer>();
public TVFields(int docID) throws IOException {
final int fieldCount = tvd.readVInt();
assert fieldCount >= 0;
if (fieldCount != 0) {
fieldNumbers = new int[fieldCount];
fieldFPs = new long[fieldCount];
for(int fieldUpto=0;fieldUpto<fieldCount;fieldUpto++) {
final int fieldNumber = tvd.readVInt();
fieldNumbers[fieldUpto] = fieldNumber;
fieldNumberToIndex.put(fieldNumber, fieldUpto);
long position = tvx.readLong();
fieldFPs[0] = position;
for(int fieldUpto=1;fieldUpto<fieldCount;fieldUpto++) {
position += tvd.readVLong();
fieldFPs[fieldUpto] = position;
} else {
// TODO: we can improve writer here, eg write 0 into
// tvx file, so we know on first read from tvx that
// this doc has no TVs
fieldNumbers = null;
fieldFPs = null;
public FieldsEnum iterator() throws IOException {
return new FieldsEnum() {
private int fieldUpto;
public String next() throws IOException {
if (fieldNumbers != null && fieldUpto < fieldNumbers.length) {
return fieldInfos.fieldName(fieldNumbers[fieldUpto++]);
} else {
return null;
public Terms terms() throws IOException {
return TVFields.this.terms(fieldInfos.fieldName(fieldNumbers[fieldUpto-1]));
public Terms terms(String field) throws IOException {
final FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
if (fieldInfo == null) {
// No such field
return null;
final Integer fieldIndex = fieldNumberToIndex.get(fieldInfo.number);
if (fieldIndex == null) {
// Term vectors were not indexed for this field
return null;
return new TVTerms(fieldFPs[fieldIndex]);
public int size() {
if (fieldNumbers == null) {
return 0;
} else {
return fieldNumbers.length;
private class TVTerms extends Terms {
private final int numTerms;
private final long tvfFPStart;
private final boolean unicodeSortOrder;
public TVTerms(long tvfFP) throws IOException {;
numTerms = tvf.readVInt();
tvfFPStart = tvf.getFilePointer();
unicodeSortOrder = sortTermsByUnicode();
public TermsEnum iterator(TermsEnum reuse) throws IOException {
TVTermsEnum termsEnum;
if (reuse instanceof TVTermsEnum) {
termsEnum = (TVTermsEnum) reuse;
if (!termsEnum.canReuse(tvf)) {
termsEnum = new TVTermsEnum();
} else {
termsEnum = new TVTermsEnum();
termsEnum.reset(numTerms, tvfFPStart, unicodeSortOrder);
return termsEnum;
public long size() {
return numTerms;
public long getSumTotalTermFreq() {
return -1;
public long getSumDocFreq() {
// Every term occurs in just one doc:
return numTerms;
public int getDocCount() {
return 1;
public Comparator<BytesRef> getComparator() {
if (unicodeSortOrder) {
return BytesRef.getUTF8SortedAsUnicodeComparator();
} else {
return BytesRef.getUTF8SortedAsUTF16Comparator();
static class TermAndPostings {
BytesRef term;
int freq;
int[] positions;
int[] startOffsets;
int[] endOffsets;
private class TVTermsEnum extends TermsEnum {
private boolean unicodeSortOrder;
private final IndexInput origTVF;
private final IndexInput tvf;
private int numTerms;
private int currentTerm;
private boolean storePositions;
private boolean storeOffsets;
private TermAndPostings[] termAndPostings;
// NOTE: tvf is pre-positioned by caller
public TVTermsEnum() throws IOException {
this.origTVF = Lucene3xTermVectorsReader.this.tvf;
tvf = (IndexInput) origTVF.clone();
public boolean canReuse(IndexInput tvf) {
return tvf == origTVF;
public void reset(int numTerms, long tvfFPStart, boolean unicodeSortOrder) throws IOException {
this.numTerms = numTerms;
currentTerm = -1;;
final byte bits = tvf.readByte();
storePositions = (bits & STORE_POSITIONS_WITH_TERMVECTOR) != 0;
storeOffsets = (bits & STORE_OFFSET_WITH_TERMVECTOR) != 0;
this.unicodeSortOrder = unicodeSortOrder;
if (unicodeSortOrder) {
Arrays.sort(termAndPostings, new Comparator<TermAndPostings>() {
public int compare(TermAndPostings left, TermAndPostings right) {
return left.term.compareTo(right.term);
private void readVectors() throws IOException {
termAndPostings = new TermAndPostings[numTerms];
BytesRef lastTerm = new BytesRef();
for (int i = 0; i < numTerms; i++) {
TermAndPostings t = new TermAndPostings();
BytesRef term = new BytesRef();
final int start = tvf.readVInt();
final int deltaLen = tvf.readVInt();
term.length = start + deltaLen;
tvf.readBytes(term.bytes, start, deltaLen);
t.term = term;
int freq = tvf.readVInt();
t.freq = freq;
if (storePositions) {
int positions[] = new int[freq];
int pos = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
pos += tvf.readVInt();
positions[posUpto] = pos;
t.positions = positions;
if (storeOffsets) {
int startOffsets[] = new int[freq];
int endOffsets[] = new int[freq];
int offset = 0;
for(int posUpto=0;posUpto<freq;posUpto++) {
startOffsets[posUpto] = offset + tvf.readVInt();
offset = endOffsets[posUpto] = startOffsets[posUpto] + tvf.readVInt();
t.startOffsets = startOffsets;
t.endOffsets = endOffsets;
termAndPostings[i] = t;
// NOTE: slow! (linear scan)
public SeekStatus seekCeil(BytesRef text, boolean useCache) throws IOException {
Comparator<BytesRef> comparator = getComparator();
for (int i = 0; i < numTerms; i++) {
int cmp =, termAndPostings[i].term);
if (cmp < 0) {
currentTerm = i;
return SeekStatus.NOT_FOUND;
} else if (cmp == 0) {
currentTerm = i;
return SeekStatus.FOUND;
currentTerm = termAndPostings.length;
return SeekStatus.END;
public void seekExact(long ord) {
throw new UnsupportedOperationException();
public BytesRef next() throws IOException {
if (++currentTerm >= numTerms) {
return null;
return term();
public BytesRef term() {
return termAndPostings[currentTerm].term;
public long ord() {
throw new UnsupportedOperationException();
public int docFreq() {
return 1;
public long totalTermFreq() {
return termAndPostings[currentTerm].freq;
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs /* ignored */) throws IOException {
TVDocsEnum docsEnum;
if (reuse != null && reuse instanceof TVDocsEnum) {
docsEnum = (TVDocsEnum) reuse;
} else {
docsEnum = new TVDocsEnum();
docsEnum.reset(liveDocs, termAndPostings[currentTerm]);
return docsEnum;
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) throws IOException {
if (needsOffsets && !storeOffsets) {
return null;
if (!storePositions && !storeOffsets) {
return null;
TVDocsAndPositionsEnum docsAndPositionsEnum;
if (reuse != null && reuse instanceof TVDocsAndPositionsEnum) {
docsAndPositionsEnum = (TVDocsAndPositionsEnum) reuse;
} else {
docsAndPositionsEnum = new TVDocsAndPositionsEnum();
docsAndPositionsEnum.reset(liveDocs, termAndPostings[currentTerm]);
return docsAndPositionsEnum;
public Comparator<BytesRef> getComparator() {
if (unicodeSortOrder) {
return BytesRef.getUTF8SortedAsUnicodeComparator();
} else {
return BytesRef.getUTF8SortedAsUTF16Comparator();
// NOTE: sort of a silly class, since you can get the
// freq() already by TermsEnum.totalTermFreq
private static class TVDocsEnum extends DocsEnum {
private boolean didNext;
private int doc = -1;
private int freq;
private Bits liveDocs;
public int freq() {
return freq;
public int docID() {
return doc;
public int nextDoc() {
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
didNext = true;
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
public int advance(int target) {
if (!didNext && target == 0) {
return nextDoc();
} else {
return (doc = NO_MORE_DOCS);
public void reset(Bits liveDocs, TermAndPostings termAndPostings) {
this.liveDocs = liveDocs;
this.freq = termAndPostings.freq;
this.doc = -1;
didNext = false;
private static class TVDocsAndPositionsEnum extends DocsAndPositionsEnum {
private boolean didNext;
private int doc = -1;
private int nextPos;
private Bits liveDocs;
private int[] positions;
private int[] startOffsets;
private int[] endOffsets;
public int freq() {
if (positions != null) {
return positions.length;
} else {
assert startOffsets != null;
return startOffsets.length;
public int docID() {
return doc;
public int nextDoc() {
if (!didNext && (liveDocs == null || liveDocs.get(0))) {
didNext = true;
return (doc = 0);
} else {
return (doc = NO_MORE_DOCS);
public int advance(int target) {
if (!didNext && target == 0) {
return nextDoc();
} else {
return (doc = NO_MORE_DOCS);
public void reset(Bits liveDocs, TermAndPostings termAndPostings) {
this.liveDocs = liveDocs;
this.positions = termAndPostings.positions;
this.startOffsets = termAndPostings.startOffsets;
this.endOffsets = termAndPostings.endOffsets;
this.doc = -1;
didNext = false;
nextPos = 0;
public BytesRef getPayload() {
return null;
public boolean hasPayload() {
return false;
public int nextPosition() {
assert (positions != null && nextPos < positions.length) ||
startOffsets != null && nextPos < startOffsets.length;
if (positions != null) {
return positions[nextPos++];
} else {
return -1;
public int startOffset() {
assert startOffsets != null;
return startOffsets[nextPos-1];
public int endOffset() {
assert endOffsets != null;
return endOffsets[nextPos-1];
public Fields get(int docID) throws IOException {
if (docID < 0 || docID >= numTotalDocs) {
throw new IllegalArgumentException("doID=" + docID + " is out of bounds [0.." + (numTotalDocs-1) + "]");
if (tvx != null) {
Fields fields = new TVFields(docID);
if (fields.size() == 0) {
// TODO: we can improve writer here, eg write 0 into
// tvx file, so we know on first read from tvx that
// this doc has no TVs
return null;
} else {
return fields;
} else {
return null;
public TermVectorsReader clone() {
IndexInput cloneTvx = null;
IndexInput cloneTvd = null;
IndexInput cloneTvf = null;
// These are null when a TermVectorsReader was created
// on a segment that did not have term vectors saved
if (tvx != null && tvd != null && tvf != null) {
cloneTvx = (IndexInput) tvx.clone();
cloneTvd = (IndexInput) tvd.clone();
cloneTvf = (IndexInput) tvf.clone();
return new Lucene3xTermVectorsReader(fieldInfos, cloneTvx, cloneTvd, cloneTvf, size, numTotalDocs, docStoreOffset, format);
// note: if there are shared docstores, we are also called by Lucene3xCodec even in
// the CFS case. so logic here must handle this.
public static void files(SegmentInfo info, Set<String> files) throws IOException {
if (info.getHasVectors()) {
if (info.getDocStoreOffset() != -1) {
assert info.getDocStoreSegment() != null;
if (info.getDocStoreIsCompoundFile()) {
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", Lucene3xCodec.COMPOUND_FILE_STORE_EXTENSION));
} else {
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_INDEX_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_FIELDS_EXTENSION));
files.add(IndexFileNames.segmentFileName(info.getDocStoreSegment(), "", VECTORS_DOCUMENTS_EXTENSION));
} else if (!info.getUseCompoundFile()) {
files.add(IndexFileNames.segmentFileName(, "", VECTORS_INDEX_EXTENSION));
files.add(IndexFileNames.segmentFileName(, "", VECTORS_FIELDS_EXTENSION));
files.add(IndexFileNames.segmentFileName(, "", VECTORS_DOCUMENTS_EXTENSION));
// If this returns, we do the surrogates shuffle so that the
// terms are sorted by unicode sort order. This should be
// true when segments are used for "normal" searching;
// it's only false during testing, to create a pre-flex
// index, using the test-only PreFlexRW.
protected boolean sortTermsByUnicode() {
return true;