lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.codecs.simpletext;

 import java.io.IOException;
 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;

 /**
  * plain text doc values format.
  *
  * <p><b>FOR RECREATIONAL USE ONLY</b>
  *
  * <p>the .dat file contains the data. for numbers this is a "fixed-width" file, for example a
  * single byte range:
  *
  * <pre>
  *  field myField
  *    type NUMERIC
  *    minvalue 0
  *    pattern 000
  *  005
  *  T
  *  234
  *  T
  *  123
  *  T
  *  ...
  *  </pre>
  *
  * so a document's value (delta encoded from minvalue) can be retrieved by seeking to startOffset +
  * (1+pattern.length()+2)*docid. The extra 1 is the newline. The extra 2 is another newline and 'T'
  * or 'F': true if the value is real, false if missing.
  *
  * <p>for bytes this is also a "fixed-width" file, for example:
  *
  * <pre>
  *  field myField
  *    type BINARY
  *    maxlength 8
  *    pattern 0
  *  length 6
  *  foobar[space][space]
  *  T
  *  length 3
  *  baz[space][space][space][space][space]
  *  T
  *  ...
  *  </pre>
  *
  * so a doc's value can be retrieved by seeking to startOffset +
  * (9+pattern.length+maxlength+2)*doc. The extra 9 is 2 newlines, plus "length " itself. The extra
  * 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  * <p>for sorted bytes this is a fixed-width file, for example:
  *
  * <pre>
  *  field myField
  *    type SORTED
  *    numvalues 10
  *    maxLength 8
  *    pattern 0
  *    ordpattern 00
  *  length 6
  *  foobar[space][space]
  *  length 3
  *  baz[space][space][space][space][space]
  *  ...
  *  03
  *  06
  *  01
  *  10
  *  ...
  *  </pre>
  *
  * so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. A document's
  * ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. An ord's value
  * can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.
  *
  * <p>for sorted set this is a fixed-width file very similar to the SORTED case, for example:
  *
  * <pre>
  *  field myField
  *    type SORTED_SET
  *    numvalues 10
  *    maxLength 8
  *    pattern 0
  *    ordpattern XXXXX
  *  length 6
  *  foobar[space][space]
  *  length 3
  *  baz[space][space][space][space][space]
  *  ...
  *  0,3,5
  *  1,2
  *
  *  10
  *  ...
  *  </pre>
  *
  * so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. A document's
  * ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. This is a
  * comma-separated list, and it's padded with spaces to be fixed width, so trim() and split() it,
  * and beware the empty string! An ord's value can be retrieved by seeking to startOffset +
  * (9+pattern.length+maxlength)*ord.
  *
  * <p>for sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings
  * the same as binary. beware the empty string!
  *
  * <p>the reader can just scan this file when it opens, skipping over the data blocks and saving the
  * offset/etc for each field.
  *
  * @lucene.experimental
  */
 class SimpleTextDocValuesFormat extends DocValuesFormat {

   /** Name handed to the {@link DocValuesFormat} superclass constructor. */
   private static final String FORMAT_NAME = "SimpleText";

   /** Second argument given to both the writer and the reader (the data lives in the .dat file). */
   private static final String DATA_EXTENSION = "dat";

   /** Creates the format, passing {@code "SimpleText"} to the superclass as its name. */
   public SimpleTextDocValuesFormat() {
     super(FORMAT_NAME);
   }

   /**
    * Builds a new {@link SimpleTextDocValuesWriter} for the supplied write state.
    *
    * @param state segment write state forwarded unchanged to the writer
    * @return a freshly constructed writer created with the {@code "dat"} argument
    * @throws IOException declared by the overridden method and propagated from the writer
    */
   @Override
   public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
     return new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
   }

   /**
    * Builds a new {@link SimpleTextDocValuesReader} for the supplied read state.
    *
    * @param state segment read state forwarded unchanged to the reader
    * @return a freshly constructed reader created with the {@code "dat"} argument
    * @throws IOException declared by the overridden method and propagated from the reader
    */
   @Override
   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
     return new SimpleTextDocValuesReader(state, DATA_EXTENSION);
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.codecs.simpletext;

	import java.io.IOException;
	import org.apache.lucene.codecs.DocValuesConsumer;
	import org.apache.lucene.codecs.DocValuesFormat;
	import org.apache.lucene.codecs.DocValuesProducer;
	import org.apache.lucene.index.SegmentReadState;
	import org.apache.lucene.index.SegmentWriteState;

	/**
	* plain text doc values format.
	*
	* <p><b>FOR RECREATIONAL USE ONLY</b>
	*
	* <p>the .dat file contains the data. for numbers this is a "fixed-width" file, for example a
	* single byte range:
	*
	* <pre>
	* field myField
	* type NUMERIC
	* minvalue 0
	* pattern 000
	* 005
	* T
	* 234
	* T
	* 123
	* T
	* ...
	* </pre>
	*
	* so a document's value (delta encoded from minvalue) can be retrieved by seeking to startOffset +
	* (1+pattern.length()+2)*docid. The extra 1 is the newline. The extra 2 is another newline and 'T'
	* or 'F': true if the value is real, false if missing.
	*
	* <p>for bytes this is also a "fixed-width" file, for example:
	*
	* <pre>
	* field myField
	* type BINARY
	* maxlength 8
	* pattern 0
	* length 6
	* foobar[space][space]
	* T
	* length 3
	* baz[space][space][space][space][space]
	* T
	* ...
	* </pre>
	*
	* so a doc's value can be retrieved by seeking to startOffset +
	* (9+pattern.length+maxlength+2)*doc. The extra 9 is 2 newlines, plus "length " itself. The extra
	* 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
	*
	* <p>for sorted bytes this is a fixed-width file, for example:
	*
	* <pre>
	* field myField
	* type SORTED
	* numvalues 10
	* maxLength 8
	* pattern 0
	* ordpattern 00
	* length 6
	* foobar[space][space]
	* length 3
	* baz[space][space][space][space][space]
	* ...
	* 03
	* 06
	* 01
	* 10
	* ...
	* </pre>
	*
	* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. A document's
	* ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. An ord's value
	* can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord.
	*
	* <p>for sorted set this is a fixed-width file very similar to the SORTED case, for example:
	*
	* <pre>
	* field myField
	* type SORTED_SET
	* numvalues 10
	* maxLength 8
	* pattern 0
	* ordpattern XXXXX
	* length 6
	* foobar[space][space]
	* length 3
	* baz[space][space][space][space][space]
	* ...
	* 0,3,5
	* 1,2
	*
	* 10
	* ...
	* </pre>
	*
	* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues. A document's
	* ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid. This is a
	* comma-separated list, and it's padded with spaces to be fixed width, so trim() and split() it,
	* and beware the empty string! An ord's value can be retrieved by seeking to startOffset +
	* (9+pattern.length+maxlength)*ord.
	*
	* <p>for sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings
	* the same as binary. beware the empty string!
	*
	* <p>the reader can just scan this file when it opens, skipping over the data blocks and saving the
	* offset/etc for each field.
	*
	* @lucene.experimental
	*/
	class SimpleTextDocValuesFormat extends DocValuesFormat {

		/** Value passed to the superclass constructor as this format's name. */
		private static final String NAME = "SimpleText";

		/** Value passed as the second constructor argument of the reader and the writer. */
		private static final String EXTENSION = "dat";

		/** Constructs the format by forwarding {@code "SimpleText"} to {@link DocValuesFormat}. */
		public SimpleTextDocValuesFormat() {
			super(NAME);
		}

		/**
		 * Creates the reading side: a new {@link SimpleTextDocValuesReader} built from {@code state}
		 * and {@code "dat"}.
		 */
		@Override
		public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
			return new SimpleTextDocValuesReader(state, EXTENSION);
		}

		/**
		 * Creates the writing side: a new {@link SimpleTextDocValuesWriter} built from {@code state}
		 * and {@code "dat"}.
		 */
		@Override
		public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
			return new SimpleTextDocValuesWriter(state, EXTENSION);
		}
	}