blob: a846dc9d36a6f77970634ec1981e40817739061e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.simpletext;
import java.io.IOException;
import org.apache.lucene.codecs.DocValuesConsumer;
import org.apache.lucene.codecs.DocValuesProducer;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
/**
* plain text doc values format.
* <p>
* <b>FOR RECREATIONAL USE ONLY</b>
* <p>
* the .dat file contains the data.
* for numbers this is a "fixed-width" file, for example a single byte range:
* <pre>
* field myField
* type NUMERIC
* minvalue 0
* pattern 000
* 005
* T
* 234
* T
* 123
* T
* ...
* </pre>
* so a document's value (delta encoded from minvalue) can be retrieved by
* seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline.
* The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
*
* for bytes this is also a "fixed-width" file, for example:
* <pre>
* field myField
* type BINARY
* maxlength 6
* pattern 0
* length 6
* foobar[space][space]
* T
* length 3
* baz[space][space][space][space][space]
* T
* ...
* </pre>
* so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc
* the extra 9 is 2 newlines, plus "length " itself.
* the extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
*
* for sorted bytes this is a fixed-width file, for example:
* <pre>
* field myField
* type SORTED
* numvalues 10
* maxLength 8
* pattern 0
* ordpattern 00
* length 6
* foobar[space][space]
* length 3
* baz[space][space][space][space][space]
* ...
* 03
* 06
* 01
* 10
* ...
* </pre>
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
* a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
*
* for sorted set this is a fixed-width file very similar to the SORTED case, for example:
* <pre>
* field myField
* type SORTED_SET
* numvalues 10
* maxLength 8
* pattern 0
* ordpattern XXXXX
* length 6
* foobar[space][space]
* length 3
* baz[space][space][space][space][space]
* ...
* 0,3,5
* 1,2
*
* 10
* ...
* </pre>
* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
* a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
* this is a comma-separated list, and it's padded with spaces to be fixed width. so trim() and split() it.
* and beware the empty string!
* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
*
* for sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings the same as binary.
* beware the empty string!
*
* the reader can just scan this file when it opens, skipping over the data blocks
* and saving the offset/etc for each field.
* @lucene.experimental
*/
class SimpleTextDocValuesFormat extends DocValuesFormat {
public SimpleTextDocValuesFormat() {
super("SimpleText");
}
@Override
public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
return new SimpleTextDocValuesWriter(state, "dat");
}
@Override
public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
return new SimpleTextDocValuesReader(state, "dat");
}
}