// lucene/codecs/src/java/org/apache/lucene/codecs/simpletext/SimpleTextDocValuesFormat.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.codecs.simpletext;


 import java.io.IOException;

 import org.apache.lucene.codecs.DocValuesConsumer;
 import org.apache.lucene.codecs.DocValuesProducer;
 import org.apache.lucene.codecs.DocValuesFormat;
 import org.apache.lucene.index.SegmentReadState;
 import org.apache.lucene.index.SegmentWriteState;

 /**
  * plain text doc values format.
  * <p>
  * <b>FOR RECREATIONAL USE ONLY</b>
  * <p>
  * the .dat file contains the data.
  *  for numbers this is a "fixed-width" file, for example a single byte range:
  *  <pre>
  *  field myField
  *    type NUMERIC
  *    minvalue 0
  *    pattern 000
  *  005
  *  T
  *  234
  *  T
  *  123
  *  T
  *  ...
  *  </pre>
  *  so a document's value (delta encoded from minvalue) can be retrieved by
  *  seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline.
  *  The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  *  for bytes this is also a "fixed-width" file, for example:
  *  <pre>
  *  field myField
  *    type BINARY
  *    maxlength 8
  *    pattern 0
  *  length 6
  *  foobar[space][space]
  *  T
  *  length 3
  *  baz[space][space][space][space][space]
  *  T
  *  ...
  *  </pre>
  *  so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc
  *  the extra 9 is 2 newlines, plus "length " itself.
  *  the extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
  *
  *  for sorted bytes this is a fixed-width file, for example:
  *  <pre>
  *  field myField
  *    type SORTED
  *    numvalues 10
  *    maxlength 8
  *    pattern 0
  *    ordpattern 00
  *  length 6
  *  foobar[space][space]
  *  length 3
  *  baz[space][space][space][space][space]
  *  ...
  *  03
  *  06
  *  01
  *  10
  *  ...
  *  </pre>
  *  so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
  *  a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
  *  an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
  *
  *  for sorted set this is a fixed-width file very similar to the SORTED case, for example:
  *  <pre>
  *  field myField
  *    type SORTED_SET
  *    numvalues 10
  *    maxLength 8
  *    pattern 0
  *    ordpattern XXXXX
  *  length 6
  *  foobar[space][space]
  *  length 3
  *  baz[space][space][space][space][space]
  *  ...
  *  0,3,5
  *  1,2
  *
  *  10
  *  ...
  *  </pre>
  *  so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
  *  a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
  *  this is a comma-separated list, and it's padded with spaces to be fixed width. so trim() and split() it.
  *  and beware the empty string!
  *  an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
  *
  *  for sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings the same as binary.
  *  beware the empty string!
  *
  *  the reader can just scan this file when it opens, skipping over the data blocks
  *  and saving the offset/etc for each field.
  *  @lucene.experimental
  */
 class SimpleTextDocValuesFormat extends DocValuesFormat {

   /** Extension string handed to both the writer and the reader created by this format. */
   private static final String DATA_EXTENSION = "dat";

   /** Passes the name {@code "SimpleText"} to the {@link DocValuesFormat} constructor. */
   public SimpleTextDocValuesFormat() {
     super("SimpleText");
   }

   /**
    * Builds the consumer used to write doc values.
    *
    * @param state write state forwarded unchanged to the writer
    * @return a new {@link SimpleTextDocValuesWriter} constructed with the {@code "dat"} extension
    * @throws IOException if constructing the writer fails
    */
   @Override
   public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
     final DocValuesConsumer writer = new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
     return writer;
   }

   /**
    * Builds the producer used to read doc values.
    *
    * @param state read state forwarded unchanged to the reader
    * @return a new {@link SimpleTextDocValuesReader} constructed with the {@code "dat"} extension
    * @throws IOException if constructing the reader fails
    */
   @Override
   public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
     final DocValuesProducer reader = new SimpleTextDocValuesReader(state, DATA_EXTENSION);
     return reader;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.codecs.simpletext;


	import java.io.IOException;

	import org.apache.lucene.codecs.DocValuesConsumer;
	import org.apache.lucene.codecs.DocValuesProducer;
	import org.apache.lucene.codecs.DocValuesFormat;
	import org.apache.lucene.index.SegmentReadState;
	import org.apache.lucene.index.SegmentWriteState;

	/**
	* plain text doc values format.
	* <p>
	* <b>FOR RECREATIONAL USE ONLY</b>
	* <p>
	* the .dat file contains the data.
	* for numbers this is a "fixed-width" file, for example a single byte range:
	* <pre>
	* field myField
	* type NUMERIC
	* minvalue 0
	* pattern 000
	* 005
	* T
	* 234
	* T
	* 123
	* T
	* ...
	* </pre>
	* so a document's value (delta encoded from minvalue) can be retrieved by
	* seeking to startOffset + (1+pattern.length()+2)*docid. The extra 1 is the newline.
	* The extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
	*
	* for bytes this is also a "fixed-width" file, for example:
	* <pre>
	* field myField
	* type BINARY
	* maxlength 8
	* pattern 0
	* length 6
	* foobar[space][space]
	* T
	* length 3
	* baz[space][space][space][space][space]
	* T
	* ...
	* </pre>
	* so a doc's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength+2)*doc
	* the extra 9 is 2 newlines, plus "length " itself.
	* the extra 2 is another newline and 'T' or 'F': true if the value is real, false if missing.
	*
	* for sorted bytes this is a fixed-width file, for example:
	* <pre>
	* field myField
	* type SORTED
	* numvalues 10
	* maxlength 8
	* pattern 0
	* ordpattern 00
	* length 6
	* foobar[space][space]
	* length 3
	* baz[space][space][space][space][space]
	* ...
	* 03
	* 06
	* 01
	* 10
	* ...
	* </pre>
	* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
	* a document's ord can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
	* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
	*
	* for sorted set this is a fixed-width file very similar to the SORTED case, for example:
	* <pre>
	* field myField
	* type SORTED_SET
	* numvalues 10
	* maxLength 8
	* pattern 0
	* ordpattern XXXXX
	* length 6
	* foobar[space][space]
	* length 3
	* baz[space][space][space][space][space]
	* ...
	* 0,3,5
	* 1,2
	*
	* 10
	* ...
	* </pre>
	* so the "ord section" begins at startOffset + (9+pattern.length+maxlength)*numValues.
	* a document's ord list can be retrieved by seeking to "ord section" + (1+ordpattern.length())*docid
	* this is a comma-separated list, and it's padded with spaces to be fixed width. so trim() and split() it.
	* and beware the empty string!
	* an ord's value can be retrieved by seeking to startOffset + (9+pattern.length+maxlength)*ord
	*
	* for sorted numerics, it's encoded (not very creatively) as a comma-separated list of strings the same as binary.
	* beware the empty string!
	*
	* the reader can just scan this file when it opens, skipping over the data blocks
	* and saving the offset/etc for each field.
	* @lucene.experimental
	*/
	class SimpleTextDocValuesFormat extends DocValuesFormat {

	  /** Extension string handed to both the writer and the reader created by this format. */
	  private static final String DATA_EXTENSION = "dat";

	  /** Passes the name {@code "SimpleText"} to the {@link DocValuesFormat} constructor. */
	  public SimpleTextDocValuesFormat() {
	    super("SimpleText");
	  }

	  /**
	   * Builds the consumer used to write doc values.
	   *
	   * @param state write state forwarded unchanged to the writer
	   * @return a new {@link SimpleTextDocValuesWriter} constructed with the {@code "dat"} extension
	   * @throws IOException if constructing the writer fails
	   */
	  @Override
	  public DocValuesConsumer fieldsConsumer(SegmentWriteState state) throws IOException {
	    final DocValuesConsumer writer = new SimpleTextDocValuesWriter(state, DATA_EXTENSION);
	    return writer;
	  }

	  /**
	   * Builds the producer used to read doc values.
	   *
	   * @param state read state forwarded unchanged to the reader
	   * @return a new {@link SimpleTextDocValuesReader} constructed with the {@code "dat"} extension
	   * @throws IOException if constructing the reader fails
	   */
	  @Override
	  public DocValuesProducer fieldsProducer(SegmentReadState state) throws IOException {
	    final DocValuesProducer reader = new SimpleTextDocValuesReader(state, DATA_EXTENSION);
	    return reader;
	  }
	}