blob: 4de9ef7fa13c9674ac7aeae6cbc18f26036c4626 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.END;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.FIELD;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.LENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MAXLENGTH;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.MINVALUE;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.NUMVALUES;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.ORDPATTERN;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.PATTERN;
import static org.apache.lucene.codecs.simpletext.SimpleTextDocValuesWriter.TYPE;
import java.io.IOException;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.charset.StandardCharsets;
import java.text.DecimalFormat;
import java.text.DecimalFormatSymbols;
import java.text.ParseException;
import java.util.HashMap;
import java.util.Locale;
import java.util.Map;
import java.util.function.IntFunction;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.index.*;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.BufferedChecksumIndexInput;
import org.apache.lucene.store.ChecksumIndexInput;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.BytesRefBuilder;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.StringHelper;
class SimpleTextDocValuesReader extends DocValuesProducer {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(SimpleTextDocValuesReader.class)
+ RamUsageEstimator.shallowSizeOfInstance(BytesRef.class);
static class OneField {
private static final long BASE_RAM_BYTES_USED =
RamUsageEstimator.shallowSizeOfInstance(OneField.class);
long dataStartFilePointer;
String pattern;
String ordPattern;
int maxLength;
boolean fixedLength;
long minValue;
long numValues;
}
final int maxDoc;
final IndexInput data;
final BytesRefBuilder scratch = new BytesRefBuilder();
final Map<String, OneField> fields = new HashMap<>();
public SimpleTextDocValuesReader(SegmentReadState state, String ext) throws IOException {
// System.out.println("dir=" + state.directory + " seg=" + state.segmentInfo.name + " file=" +
// IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext));
data =
state.directory.openInput(
IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, ext),
state.context);
maxDoc = state.segmentInfo.maxDoc();
while (true) {
readLine();
// System.out.println("READ field=" + scratch.utf8ToString());
if (scratch.get().equals(END)) {
break;
}
assert startsWith(FIELD) : scratch.get().utf8ToString();
String fieldName = stripPrefix(FIELD);
// System.out.println(" field=" + fieldName);
OneField field = new OneField();
fields.put(fieldName, field);
readLine();
assert startsWith(TYPE) : scratch.get().utf8ToString();
DocValuesType dvType = DocValuesType.valueOf(stripPrefix(TYPE));
assert dvType != DocValuesType.NONE;
if (dvType == DocValuesType.NUMERIC) {
readLine();
assert startsWith(MINVALUE)
: "got " + scratch.get().utf8ToString() + " field=" + fieldName + " ext=" + ext;
field.minValue = Long.parseLong(stripPrefix(MINVALUE));
readLine();
assert startsWith(PATTERN);
field.pattern = stripPrefix(PATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(data.getFilePointer() + (1 + field.pattern.length() + 2) * maxDoc);
} else if (dvType == DocValuesType.BINARY) {
readLine();
assert startsWith(MAXLENGTH);
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
readLine();
assert startsWith(PATTERN);
field.pattern = stripPrefix(PATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(
data.getFilePointer() + (9 + field.pattern.length() + field.maxLength + 2) * maxDoc);
} else if (dvType == DocValuesType.SORTED || dvType == DocValuesType.SORTED_SET) {
readLine();
assert startsWith(NUMVALUES);
field.numValues = Long.parseLong(stripPrefix(NUMVALUES));
readLine();
assert startsWith(MAXLENGTH);
field.maxLength = Integer.parseInt(stripPrefix(MAXLENGTH));
readLine();
assert startsWith(PATTERN);
field.pattern = stripPrefix(PATTERN);
readLine();
assert startsWith(ORDPATTERN);
field.ordPattern = stripPrefix(ORDPATTERN);
field.dataStartFilePointer = data.getFilePointer();
data.seek(
data.getFilePointer()
+ (9 + field.pattern.length() + field.maxLength) * field.numValues
+ (1 + field.ordPattern.length()) * maxDoc);
} else {
throw new AssertionError();
}
}
// We should only be called from above if at least one
// field has DVs:
assert !fields.isEmpty();
}
@Override
public NumericDocValues getNumeric(FieldInfo fieldInfo) throws IOException {
IntFunction<Long> values = getNumericNonIterator(fieldInfo);
if (values == null) {
return null;
} else {
DocValuesIterator docsWithField = getNumericDocsWithField(fieldInfo);
return new NumericDocValues() {
@Override
public int nextDoc() throws IOException {
return docsWithField.nextDoc();
}
@Override
public int docID() {
return docsWithField.docID();
}
@Override
public long cost() {
return docsWithField.cost();
}
@Override
public int advance(int target) throws IOException {
return docsWithField.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return docsWithField.advanceExact(target);
}
@Override
public long longValue() throws IOException {
return values.apply(docsWithField.docID());
}
};
}
}
IntFunction<Long> getNumericNonIterator(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
assert field != null;
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null : "field=" + fieldInfo.name + " fields=" + fields;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder =
new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
decoder.setParseBigDecimal(true);
return new IntFunction<Long>() {
@Override
public Long apply(int docID) {
try {
// System.out.println(Thread.currentThread().getName() + ": get docID=" + docID + " in=" +
// in);
if (docID < 0 || docID >= maxDoc) {
throw new IndexOutOfBoundsException(
"docID must be 0 .. " + (maxDoc - 1) + "; got " + docID);
}
in.seek(field.dataStartFilePointer + (1 + field.pattern.length() + 2) * docID);
SimpleTextUtil.readLine(in, scratch);
// System.out.println("parsing delta: " + scratch.utf8ToString());
BigDecimal bd;
try {
bd = (BigDecimal) decoder.parse(scratch.get().utf8ToString());
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse BigDecimal value", in, pe);
}
SimpleTextUtil.readLine(in, scratch); // read the line telling us if it's real or not
return BigInteger.valueOf(field.minValue).add(bd.toBigIntegerExact()).longValue();
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
};
}
private abstract static class DocValuesIterator extends DocIdSetIterator {
abstract boolean advanceExact(int target) throws IOException;
}
private DocValuesIterator getNumericDocsWithField(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
return new DocValuesIterator() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(field.dataStartFilePointer + (1 + field.pattern.length() + 2) * i);
SimpleTextUtil.readLine(in, scratch); // data
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
if (scratch.byteAt(0) == (byte) 'T') {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
boolean advanceExact(int target) throws IOException {
this.doc = target;
in.seek(field.dataStartFilePointer + (1 + field.pattern.length() + 2) * target);
SimpleTextUtil.readLine(in, scratch); // data
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
return scratch.byteAt(0) == (byte) 'T';
}
};
}
@Override
public synchronized BinaryDocValues getBinary(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder =
new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
DocValuesIterator docsWithField = getBinaryDocsWithField(fieldInfo);
IntFunction<BytesRef> values =
new IntFunction<BytesRef>() {
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public BytesRef apply(int docID) {
try {
if (docID < 0 || docID >= maxDoc) {
throw new IndexOutOfBoundsException(
"docID must be 0 .. " + (maxDoc - 1) + "; got " + docID);
}
in.seek(
field.dataStartFilePointer
+ (9 + field.pattern.length() + field.maxLength + 2) * docID);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH);
int len;
try {
len =
decoder
.parse(
new String(
scratch.bytes(),
LENGTH.length,
scratch.length() - LENGTH.length,
StandardCharsets.UTF_8))
.intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
} catch (IOException ioe) {
throw new RuntimeException(ioe);
}
}
};
return new BinaryDocValues() {
@Override
public int nextDoc() throws IOException {
return docsWithField.nextDoc();
}
@Override
public int docID() {
return docsWithField.docID();
}
@Override
public long cost() {
return docsWithField.cost();
}
@Override
public int advance(int target) throws IOException {
return docsWithField.advance(target);
}
@Override
public boolean advanceExact(int target) throws IOException {
return docsWithField.advanceExact(target);
}
@Override
public BytesRef binaryValue() throws IOException {
return values.apply(docsWithField.docID());
}
};
}
private DocValuesIterator getBinaryDocsWithField(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder =
new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
return new DocValuesIterator() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(
field.dataStartFilePointer + (9 + field.pattern.length() + field.maxLength + 2) * i);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH);
int len;
try {
len =
decoder
.parse(
new String(
scratch.bytes(),
LENGTH.length,
scratch.length() - LENGTH.length,
StandardCharsets.UTF_8))
.intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
// skip past bytes
byte bytes[] = new byte[len];
in.readBytes(bytes, 0, len);
SimpleTextUtil.readLine(in, scratch); // newline
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
if (scratch.byteAt(0) == (byte) 'T') {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
boolean advanceExact(int target) throws IOException {
this.doc = target;
in.seek(
field.dataStartFilePointer
+ (9 + field.pattern.length() + field.maxLength + 2) * target);
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH);
int len;
try {
len =
decoder
.parse(
new String(
scratch.bytes(),
LENGTH.length,
scratch.length() - LENGTH.length,
StandardCharsets.UTF_8))
.intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
// skip past bytes
byte bytes[] = new byte[len];
in.readBytes(bytes, 0, len);
SimpleTextUtil.readLine(in, scratch); // newline
SimpleTextUtil.readLine(in, scratch); // 'T' or 'F'
return scratch.byteAt(0) == (byte) 'T';
}
};
}
@Override
public SortedDocValues getSorted(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder =
new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
final DecimalFormat ordDecoder =
new DecimalFormat(field.ordPattern, new DecimalFormatSymbols(Locale.ROOT));
return new SortedDocValues() {
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(docID() + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
int ord;
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(
field.dataStartFilePointer
+ field.numValues * (9 + field.pattern.length() + field.maxLength)
+ i * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
try {
ord = (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue() - 1;
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse ord", in, pe);
}
if (ord >= 0) {
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
public boolean advanceExact(int target) throws IOException {
this.doc = target;
in.seek(
field.dataStartFilePointer
+ field.numValues * (9 + field.pattern.length() + field.maxLength)
+ target * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
try {
ord = (int) ordDecoder.parse(scratch.get().utf8ToString()).longValue() - 1;
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse ord", in, pe);
}
return ord >= 0;
}
@Override
public int ordValue() {
return ord;
}
final BytesRefBuilder term = new BytesRefBuilder();
@Override
public BytesRef lookupOrd(int ord) throws IOException {
if (ord < 0 || ord >= field.numValues) {
throw new IndexOutOfBoundsException(
"ord must be 0 .. " + (field.numValues - 1) + "; got " + ord);
}
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH)
: "got " + scratch.get().utf8ToString() + " in=" + in;
int len;
try {
len =
decoder
.parse(
new String(
scratch.bytes(),
LENGTH.length,
scratch.length() - LENGTH.length,
StandardCharsets.UTF_8))
.intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
}
@Override
public int getValueCount() {
return (int) field.numValues;
}
};
}
@Override
public SortedNumericDocValues getSortedNumeric(FieldInfo field) throws IOException {
final BinaryDocValues binary = getBinary(field);
return new SortedNumericDocValues() {
@Override
public int nextDoc() throws IOException {
int doc = binary.nextDoc();
setCurrentDoc();
return doc;
}
@Override
public int docID() {
return binary.docID();
}
@Override
public long cost() {
return binary.cost();
}
@Override
public int advance(int target) throws IOException {
int doc = binary.advance(target);
setCurrentDoc();
return doc;
}
@Override
public boolean advanceExact(int target) throws IOException {
if (binary.advanceExact(target)) {
setCurrentDoc();
return true;
}
return false;
}
long values[];
int index;
private void setCurrentDoc() throws IOException {
if (docID() == NO_MORE_DOCS) {
return;
}
String csv = binary.binaryValue().utf8ToString();
if (csv.length() == 0) {
values = new long[0];
} else {
String s[] = csv.split(",");
values = new long[s.length];
for (int i = 0; i < values.length; i++) {
values[i] = Long.parseLong(s[i]);
}
}
index = 0;
}
@Override
public long nextValue() throws IOException {
return values[index++];
}
@Override
public int docValueCount() {
return values.length;
}
};
}
@Override
public SortedSetDocValues getSortedSet(FieldInfo fieldInfo) throws IOException {
final OneField field = fields.get(fieldInfo.name);
// SegmentCoreReaders already verifies this field is
// valid:
assert field != null;
final IndexInput in = data.clone();
final BytesRefBuilder scratch = new BytesRefBuilder();
final DecimalFormat decoder =
new DecimalFormat(field.pattern, new DecimalFormatSymbols(Locale.ROOT));
return new SortedSetDocValues() {
String[] currentOrds = new String[0];
int currentIndex = 0;
final BytesRefBuilder term = new BytesRefBuilder();
int doc = -1;
@Override
public int nextDoc() throws IOException {
return advance(doc + 1);
}
@Override
public int docID() {
return doc;
}
@Override
public long cost() {
return maxDoc;
}
@Override
public int advance(int target) throws IOException {
for (int i = target; i < maxDoc; ++i) {
in.seek(
field.dataStartFilePointer
+ field.numValues * (9 + field.pattern.length() + field.maxLength)
+ i * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
String ordList = scratch.get().utf8ToString().trim();
if (ordList.isEmpty() == false) {
currentOrds = ordList.split(",");
currentIndex = 0;
return doc = i;
}
}
return doc = NO_MORE_DOCS;
}
@Override
public boolean advanceExact(int target) throws IOException {
in.seek(
field.dataStartFilePointer
+ field.numValues * (9 + field.pattern.length() + field.maxLength)
+ target * (1 + field.ordPattern.length()));
SimpleTextUtil.readLine(in, scratch);
String ordList = scratch.get().utf8ToString().trim();
doc = target;
if (ordList.isEmpty() == false) {
currentOrds = ordList.split(",");
currentIndex = 0;
return true;
}
return false;
}
@Override
public long nextOrd() throws IOException {
if (currentIndex == currentOrds.length) {
return NO_MORE_ORDS;
} else {
return Long.parseLong(currentOrds[currentIndex++]);
}
}
@Override
public BytesRef lookupOrd(long ord) throws IOException {
if (ord < 0 || ord >= field.numValues) {
throw new IndexOutOfBoundsException(
"ord must be 0 .. " + (field.numValues - 1) + "; got " + ord);
}
in.seek(field.dataStartFilePointer + ord * (9 + field.pattern.length() + field.maxLength));
SimpleTextUtil.readLine(in, scratch);
assert StringHelper.startsWith(scratch.get(), LENGTH)
: "got " + scratch.get().utf8ToString() + " in=" + in;
int len;
try {
len =
decoder
.parse(
new String(
scratch.bytes(),
LENGTH.length,
scratch.length() - LENGTH.length,
StandardCharsets.UTF_8))
.intValue();
} catch (ParseException pe) {
throw new CorruptIndexException("failed to parse int length", in, pe);
}
term.grow(len);
term.setLength(len);
in.readBytes(term.bytes(), 0, len);
return term.get();
}
@Override
public long getValueCount() {
return field.numValues;
}
};
}
@Override
public void close() throws IOException {
data.close();
}
/** Used only in ctor: */
private void readLine() throws IOException {
SimpleTextUtil.readLine(data, scratch);
// System.out.println("line: " + scratch.utf8ToString());
}
/** Used only in ctor: */
private boolean startsWith(BytesRef prefix) {
return StringHelper.startsWith(scratch.get(), prefix);
}
/** Used only in ctor: */
private String stripPrefix(BytesRef prefix) {
return new String(
scratch.bytes(), prefix.length, scratch.length() - prefix.length, StandardCharsets.UTF_8);
}
@Override
public long ramBytesUsed() {
return BASE_RAM_BYTES_USED
+ RamUsageEstimator.sizeOf(scratch.bytes())
+ fields.size()
* (RamUsageEstimator.NUM_BYTES_OBJECT_REF * 2L + OneField.BASE_RAM_BYTES_USED);
}
@Override
public String toString() {
return getClass().getSimpleName() + "(fields=" + fields.size() + ")";
}
@Override
public void checkIntegrity() throws IOException {
BytesRefBuilder scratch = new BytesRefBuilder();
IndexInput clone = data.clone();
clone.seek(0);
// checksum is fixed-width encoded with 20 bytes, plus 1 byte for newline (the space is included
// in SimpleTextUtil.CHECKSUM):
long footerStartPos = data.length() - (SimpleTextUtil.CHECKSUM.length + 21);
ChecksumIndexInput input = new BufferedChecksumIndexInput(clone);
while (true) {
SimpleTextUtil.readLine(input, scratch);
if (input.getFilePointer() >= footerStartPos) {
// Make sure we landed at precisely the right location:
if (input.getFilePointer() != footerStartPos) {
throw new CorruptIndexException(
"SimpleText failure: footer does not start at expected position current="
+ input.getFilePointer()
+ " vs expected="
+ footerStartPos,
input);
}
SimpleTextUtil.checkFooter(input);
break;
}
}
}
}