lucene/core/src/java/org/apache/lucene/codecs/StoredFieldsWriter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.codecs;

 import java.io.Closeable;
 import java.io.IOException;
 import java.io.Reader;
 import java.nio.charset.StandardCharsets;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.lucene.analysis.Analyzer;
 import org.apache.lucene.analysis.TokenStream;
 import org.apache.lucene.document.StoredField;
 import org.apache.lucene.index.DocIDMerger;
 import org.apache.lucene.index.FieldInfo;
 import org.apache.lucene.index.FieldInfos;
 import org.apache.lucene.index.IndexableField;
 import org.apache.lucene.index.IndexableFieldType;
 import org.apache.lucene.index.MergeState;
 import org.apache.lucene.index.StoredFieldVisitor;
 import org.apache.lucene.util.Accountable;
 import org.apache.lucene.util.BytesRef;

 import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

 /**
  * Codec API for writing stored fields:
  * <ol>
  *   <li>For every document, {@link #startDocument()} is called,
  *       informing the Codec that a new document has started.
  *   <li>{@link #writeField(FieldInfo, IndexableField)} is called for
  *       each field in the document.
  *   <li>After all documents have been written, {@link #finish(FieldInfos, int)}
  *       is called for verification/sanity-checks.
  *   <li>Finally the writer is closed ({@link #close()})
  * </ol>
  *
  * @lucene.experimental
  */
 public abstract class StoredFieldsWriter implements Closeable, Accountable {

   /** Sole constructor. (For invocation by subclass
    *  constructors, typically implicit.) */
   protected StoredFieldsWriter() {
   }

   /** Called before writing the stored fields of the document.
    *  {@link #writeField(FieldInfo, IndexableField)} will be called
    *  for each stored field. Note that this is
    *  called even if the document has no stored fields. */
   public abstract void startDocument() throws IOException;

   /** Called when a document and all its fields have been added. */
   public void finishDocument() throws IOException {}

   /** Writes a single stored field. */
   public abstract void writeField(FieldInfo info, IndexableField field) throws IOException;

   /** Called before {@link #close()}, passing in the number
    *  of documents that were written. Note that this is
    *  intentionally redundant (equivalent to the number of
    *  calls to {@link #startDocument()}, but a Codec should
    *  check that this is the case to detect the JRE bug described
    *  in LUCENE-1282. */
   public abstract void finish(FieldInfos fis, int numDocs) throws IOException;

   private static class StoredFieldsMergeSub extends DocIDMerger.Sub {
     private final StoredFieldsReader reader;
     private final int maxDoc;
     private final MergeVisitor visitor;
     int docID = -1;

     public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, StoredFieldsReader reader, int maxDoc) {
       super(docMap);
       this.maxDoc = maxDoc;
       this.reader = reader;
       this.visitor = visitor;
     }

     @Override
     public int nextDoc() {
       docID++;
       if (docID == maxDoc) {
         return NO_MORE_DOCS;
       } else {
         return docID;
       }
     }
   }

   /** Merges in the stored fields from the readers in
    *  <code>mergeState</code>. The default implementation skips
    *  over deleted documents, and uses {@link #startDocument()},
    *  {@link #writeField(FieldInfo, IndexableField)}, and {@link #finish(FieldInfos, int)},
    *  returning the number of documents that were written.
    *  Implementations can override this method for more sophisticated
    *  merging (bulk-byte copying, etc). */
   public int merge(MergeState mergeState) throws IOException {
     List<StoredFieldsMergeSub> subs = new ArrayList<>();
     for(int i=0;i<mergeState.storedFieldsReaders.length;i++) {
       StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
       storedFieldsReader.checkIntegrity();
       subs.add(new StoredFieldsMergeSub(new MergeVisitor(mergeState, i), mergeState.docMaps[i], storedFieldsReader, mergeState.maxDocs[i]));
     }

     final DocIDMerger<StoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

     int docCount = 0;
     while (true) {
       StoredFieldsMergeSub sub = docIDMerger.next();
       if (sub == null) {
         break;
       }
       assert sub.mappedDocID == docCount;
       startDocument();
       sub.reader.visitDocument(sub.docID, sub.visitor);
       finishDocument();
       docCount++;
     }
     finish(mergeState.mergeFieldInfos, docCount);
     return docCount;
   }

   /**
    * A visitor that adds every field it sees.
    * <p>
    * Use like this:
    * <pre>
    * MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
    * for (...) {
    *   startDocument();
    *   storedFieldsReader.visitDocument(docID, visitor);
    *   finishDocument();
    * }
    * </pre>
    */
   protected class MergeVisitor extends StoredFieldVisitor implements IndexableField {
     BytesRef binaryValue;
     String stringValue;
     Number numericValue;
     FieldInfo currentField;
     FieldInfos remapper;

     /**
      * Create new merge visitor.
      */
     public MergeVisitor(MergeState mergeState, int readerIndex) {
       // if field numbers are aligned, we can save hash lookups
       // on every field access. Otherwise, we need to lookup
       // fieldname each time, and remap to a new number.
       for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
         FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
         if (other == null || !other.name.equals(fi.name)) {
           remapper = mergeState.mergeFieldInfos;
           break;
         }
       }
     }

     @Override
     public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
       reset(fieldInfo);
       // TODO: can we avoid new BR here?
       binaryValue = new BytesRef(value);
       write();
     }

     @Override
     public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
       reset(fieldInfo);
       // TODO: can we avoid new String here?
       stringValue = new String(value, StandardCharsets.UTF_8);
       write();
     }

     @Override
     public void intField(FieldInfo fieldInfo, int value) throws IOException {
       reset(fieldInfo);
       numericValue = value;
       write();
     }

     @Override
     public void longField(FieldInfo fieldInfo, long value) throws IOException {
       reset(fieldInfo);
       numericValue = value;
       write();
     }

     @Override
     public void floatField(FieldInfo fieldInfo, float value) throws IOException {
       reset(fieldInfo);
       numericValue = value;
       write();
     }

     @Override
     public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
       reset(fieldInfo);
       numericValue = value;
       write();
     }

     @Override
     public Status needsField(FieldInfo fieldInfo) throws IOException {
       return Status.YES;
     }

     @Override
     public String name() {
       return currentField.name;
     }

     @Override
     public IndexableFieldType fieldType() {
       return StoredField.TYPE;
     }

     @Override
     public BytesRef binaryValue() {
       return binaryValue;
     }

     @Override
     public String stringValue() {
       return stringValue;
     }

     @Override
     public Number numericValue() {
       return numericValue;
     }

     @Override
     public Reader readerValue() {
       return null;
     }

     @Override
     public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
       return null;
     }

     void reset(FieldInfo field) {
       if (remapper != null) {
         // field numbers are not aligned, we need to remap to the new field number
         currentField = remapper.fieldInfo(field.name);
       } else {
         currentField = field;
       }
       binaryValue = null;
       stringValue = null;
       numericValue = null;
     }

     void write() throws IOException {
       writeField(currentField, this);
     }
   }

   @Override
   public abstract void close() throws IOException;
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.codecs;

	import java.io.Closeable;
	import java.io.IOException;
	import java.io.Reader;
	import java.nio.charset.StandardCharsets;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.lucene.analysis.Analyzer;
	import org.apache.lucene.analysis.TokenStream;
	import org.apache.lucene.document.StoredField;
	import org.apache.lucene.index.DocIDMerger;
	import org.apache.lucene.index.FieldInfo;
	import org.apache.lucene.index.FieldInfos;
	import org.apache.lucene.index.IndexableField;
	import org.apache.lucene.index.IndexableFieldType;
	import org.apache.lucene.index.MergeState;
	import org.apache.lucene.index.StoredFieldVisitor;
	import org.apache.lucene.util.Accountable;
	import org.apache.lucene.util.BytesRef;

	import static org.apache.lucene.search.DocIdSetIterator.NO_MORE_DOCS;

	/**
	* Codec API for writing stored fields:
	* <ol>
	* <li>For every document, {@link #startDocument()} is called,
	* informing the Codec that a new document has started.
	* <li>{@link #writeField(FieldInfo, IndexableField)} is called for
	* each field in the document.
	* <li>After all documents have been written, {@link #finish(FieldInfos, int)}
	* is called for verification/sanity-checks.
	* <li>Finally the writer is closed ({@link #close()})
	* </ol>
	*
	* @lucene.experimental
	*/
	public abstract class StoredFieldsWriter implements Closeable, Accountable {

	/** Sole constructor. (For invocation by subclass
	* constructors, typically implicit.) */
	protected StoredFieldsWriter() {
	}

	/** Called before writing the stored fields of the document.
	* {@link #writeField(FieldInfo, IndexableField)} will be called
	* for each stored field. Note that this is
	* called even if the document has no stored fields. */
	public abstract void startDocument() throws IOException;

	/** Called when a document and all its fields have been added. */
	public void finishDocument() throws IOException {}

	/** Writes a single stored field. */
	public abstract void writeField(FieldInfo info, IndexableField field) throws IOException;

	/** Called before {@link #close()}, passing in the number
	* of documents that were written. Note that this is
	* intentionally redundant (equivalent to the number of
	* calls to {@link #startDocument()}, but a Codec should
	* check that this is the case to detect the JRE bug described
	* in LUCENE-1282. */
	public abstract void finish(FieldInfos fis, int numDocs) throws IOException;

	private static class StoredFieldsMergeSub extends DocIDMerger.Sub {
	private final StoredFieldsReader reader;
	private final int maxDoc;
	private final MergeVisitor visitor;
	int docID = -1;

	public StoredFieldsMergeSub(MergeVisitor visitor, MergeState.DocMap docMap, StoredFieldsReader reader, int maxDoc) {
	super(docMap);
	this.maxDoc = maxDoc;
	this.reader = reader;
	this.visitor = visitor;
	}

	@Override
	public int nextDoc() {
	docID++;
	if (docID == maxDoc) {
	return NO_MORE_DOCS;
	} else {
	return docID;
	}
	}
	}

	/** Merges in the stored fields from the readers in
	* <code>mergeState</code>. The default implementation skips
	* over deleted documents, and uses {@link #startDocument()},
	* {@link #writeField(FieldInfo, IndexableField)}, and {@link #finish(FieldInfos, int)},
	* returning the number of documents that were written.
	* Implementations can override this method for more sophisticated
	* merging (bulk-byte copying, etc). */
	public int merge(MergeState mergeState) throws IOException {
	List<StoredFieldsMergeSub> subs = new ArrayList<>();
	for(int i=0;i<mergeState.storedFieldsReaders.length;i++) {
	StoredFieldsReader storedFieldsReader = mergeState.storedFieldsReaders[i];
	storedFieldsReader.checkIntegrity();
	subs.add(new StoredFieldsMergeSub(new MergeVisitor(mergeState, i), mergeState.docMaps[i], storedFieldsReader, mergeState.maxDocs[i]));
	}

	final DocIDMerger<StoredFieldsMergeSub> docIDMerger = DocIDMerger.of(subs, mergeState.needsIndexSort);

	int docCount = 0;
	while (true) {
	StoredFieldsMergeSub sub = docIDMerger.next();
	if (sub == null) {
	break;
	}
	assert sub.mappedDocID == docCount;
	startDocument();
	sub.reader.visitDocument(sub.docID, sub.visitor);
	finishDocument();
	docCount++;
	}
	finish(mergeState.mergeFieldInfos, docCount);
	return docCount;
	}

	/**
	* A visitor that adds every field it sees.
	* <p>
	* Use like this:
	* <pre>
	* MergeVisitor visitor = new MergeVisitor(mergeState, readerIndex);
	* for (...) {
	* startDocument();
	* storedFieldsReader.visitDocument(docID, visitor);
	* finishDocument();
	* }
	* </pre>
	*/
	protected class MergeVisitor extends StoredFieldVisitor implements IndexableField {
	BytesRef binaryValue;
	String stringValue;
	Number numericValue;
	FieldInfo currentField;
	FieldInfos remapper;

	/**
	* Create new merge visitor.
	*/
	public MergeVisitor(MergeState mergeState, int readerIndex) {
	// if field numbers are aligned, we can save hash lookups
	// on every field access. Otherwise, we need to lookup
	// fieldname each time, and remap to a new number.
	for (FieldInfo fi : mergeState.fieldInfos[readerIndex]) {
	FieldInfo other = mergeState.mergeFieldInfos.fieldInfo(fi.number);
	if (other == null \|\| !other.name.equals(fi.name)) {
	remapper = mergeState.mergeFieldInfos;
	break;
	}
	}
	}

	@Override
	public void binaryField(FieldInfo fieldInfo, byte[] value) throws IOException {
	reset(fieldInfo);
	// TODO: can we avoid new BR here?
	binaryValue = new BytesRef(value);
	write();
	}

	@Override
	public void stringField(FieldInfo fieldInfo, byte[] value) throws IOException {
	reset(fieldInfo);
	// TODO: can we avoid new String here?
	stringValue = new String(value, StandardCharsets.UTF_8);
	write();
	}

	@Override
	public void intField(FieldInfo fieldInfo, int value) throws IOException {
	reset(fieldInfo);
	numericValue = value;
	write();
	}

	@Override
	public void longField(FieldInfo fieldInfo, long value) throws IOException {
	reset(fieldInfo);
	numericValue = value;
	write();
	}

	@Override
	public void floatField(FieldInfo fieldInfo, float value) throws IOException {
	reset(fieldInfo);
	numericValue = value;
	write();
	}

	@Override
	public void doubleField(FieldInfo fieldInfo, double value) throws IOException {
	reset(fieldInfo);
	numericValue = value;
	write();
	}

	@Override
	public Status needsField(FieldInfo fieldInfo) throws IOException {
	return Status.YES;
	}

	@Override
	public String name() {
	return currentField.name;
	}

	@Override
	public IndexableFieldType fieldType() {
	return StoredField.TYPE;
	}

	@Override
	public BytesRef binaryValue() {
	return binaryValue;
	}

	@Override
	public String stringValue() {
	return stringValue;
	}

	@Override
	public Number numericValue() {
	return numericValue;
	}

	@Override
	public Reader readerValue() {
	return null;
	}

	@Override
	public TokenStream tokenStream(Analyzer analyzer, TokenStream reuse) {
	return null;
	}

	void reset(FieldInfo field) {
	if (remapper != null) {
	// field numbers are not aligned, we need to remap to the new field number
	currentField = remapper.fieldInfo(field.name);
	} else {
	currentField = field;
	}
	binaryValue = null;
	stringValue = null;
	numericValue = null;
	}

	void write() throws IOException {
	writeField(currentField, this);
	}
	}

	@Override
	public abstract void close() throws IOException;
	}