| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.suggest.document; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.Map; |
| |
| import org.apache.lucene.codecs.CodecUtil; |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.NormsProducer; |
| import org.apache.lucene.codecs.PostingsFormat; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.Fields; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.DocIdSetIterator; |
| import org.apache.lucene.store.ByteArrayDataInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.IOUtils; |
| |
| import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.COMPLETION_VERSION_CURRENT; |
| import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.DICT_EXTENSION; |
| import static org.apache.lucene.search.suggest.document.CompletionPostingsFormat.INDEX_EXTENSION; |
| |
| /** |
| * <p> |
| * Weighted FSTs for any indexed {@link SuggestField} is built on {@link #write(Fields,NormsProducer)}. |
| * A weighted FST maps the analyzed forms of a field to its |
| * surface form and document id. FSTs are stored in the CompletionDictionary (.lkp). |
| * </p> |
| * <p> |
| * The file offsets of a field's FST are stored in the CompletionIndex (.cmp) |
| * along with the field's internal number {@link FieldInfo#number} on {@link #close()}. |
| * </p> |
| * |
| */ |
| final class CompletionFieldsConsumer extends FieldsConsumer { |
| |
| private final String delegatePostingsFormatName; |
| private final Map<String, CompletionMetaData> seenFields = new HashMap<>(); |
| private final SegmentWriteState state; |
| private IndexOutput dictOut; |
| private FieldsConsumer delegateFieldsConsumer; |
| private final String codecName; |
| |
| CompletionFieldsConsumer(String codecName, PostingsFormat delegatePostingsFormat, SegmentWriteState state) throws IOException { |
| this.codecName = codecName; |
| this.delegatePostingsFormatName = delegatePostingsFormat.getName(); |
| this.state = state; |
| String dictFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, DICT_EXTENSION); |
| boolean success = false; |
| try { |
| this.delegateFieldsConsumer = delegatePostingsFormat.fieldsConsumer(state); |
| dictOut = state.directory.createOutput(dictFile, state.context); |
| CodecUtil.writeIndexHeader(dictOut, codecName, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); |
| success = true; |
| } finally { |
| if (success == false) { |
| IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer); |
| } |
| } |
| } |
| |
| @Override |
| public void write(Fields fields, NormsProducer norms) throws IOException { |
| delegateFieldsConsumer.write(fields, norms); |
| |
| for (String field : fields) { |
| CompletionTermWriter termWriter = new CompletionTermWriter(); |
| Terms terms = fields.terms(field); |
| if (terms == null) { |
| // this can happen from ghost fields, where the incoming Fields iterator claims a field exists but it does not |
| continue; |
| } |
| TermsEnum termsEnum = terms.iterator(); |
| |
| // write terms |
| BytesRef term; |
| while ((term = termsEnum.next()) != null) { |
| termWriter.write(term, termsEnum); |
| } |
| |
| // store lookup, if needed |
| long filePointer = dictOut.getFilePointer(); |
| if (termWriter.finish(dictOut)) { |
| seenFields.put(field, new CompletionMetaData(filePointer, |
| termWriter.minWeight, |
| termWriter.maxWeight, |
| termWriter.type)); |
| } |
| } |
| } |
| |
| private boolean closed = false; |
| |
| @Override |
| public void close() throws IOException { |
| if (closed) { |
| return; |
| } |
| closed = true; |
| String indexFile = IndexFileNames.segmentFileName(state.segmentInfo.name, state.segmentSuffix, INDEX_EXTENSION); |
| boolean success = false; |
| try (IndexOutput indexOut = state.directory.createOutput(indexFile, state.context)) { |
| delegateFieldsConsumer.close(); |
| CodecUtil.writeIndexHeader(indexOut, codecName, COMPLETION_VERSION_CURRENT, state.segmentInfo.getId(), state.segmentSuffix); |
| /* |
| * we write the delegate postings format name so we can load it |
| * without getting an instance in the ctor |
| */ |
| indexOut.writeString(delegatePostingsFormatName); |
| // write # of seen fields |
| indexOut.writeVInt(seenFields.size()); |
| // write field numbers and dictOut offsets |
| for (Map.Entry<String, CompletionMetaData> seenField : seenFields.entrySet()) { |
| FieldInfo fieldInfo = state.fieldInfos.fieldInfo(seenField.getKey()); |
| indexOut.writeVInt(fieldInfo.number); |
| CompletionMetaData metaData = seenField.getValue(); |
| indexOut.writeVLong(metaData.filePointer); |
| indexOut.writeVLong(metaData.minWeight); |
| indexOut.writeVLong(metaData.maxWeight); |
| indexOut.writeByte(metaData.type); |
| } |
| CodecUtil.writeFooter(indexOut); |
| CodecUtil.writeFooter(dictOut); |
| IOUtils.close(dictOut); |
| success = true; |
| } finally { |
| if (success == false) { |
| IOUtils.closeWhileHandlingException(dictOut, delegateFieldsConsumer); |
| } |
| } |
| } |
| |
| private static class CompletionMetaData { |
| private final long filePointer; |
| private final long minWeight; |
| private final long maxWeight; |
| private final byte type; |
| |
| private CompletionMetaData(long filePointer, long minWeight, long maxWeight, byte type) { |
| this.filePointer = filePointer; |
| this.minWeight = minWeight; |
| this.maxWeight = maxWeight; |
| this.type = type; |
| } |
| } |
| |
| // builds an FST based on the terms written |
| private static class CompletionTermWriter { |
| |
| private PostingsEnum postingsEnum = null; |
| private int docCount = 0; |
| private long maxWeight = 0; |
| private long minWeight = Long.MAX_VALUE; |
| private byte type; |
| private boolean first; |
| |
| private final BytesRefBuilder scratch = new BytesRefBuilder(); |
| private final NRTSuggesterBuilder builder; |
| |
| public CompletionTermWriter() { |
| builder = new NRTSuggesterBuilder(); |
| first = true; |
| } |
| |
| /** |
| * Stores the built FST in <code>output</code> |
| * Returns true if there was anything stored, false otherwise |
| */ |
| public boolean finish(IndexOutput output) throws IOException { |
| boolean stored = builder.store(output); |
| assert stored || docCount == 0 : "the FST is null but docCount is != 0 actual value: [" + docCount + "]"; |
| if (docCount == 0) { |
| minWeight = 0; |
| } |
| return stored; |
| } |
| |
| /** |
| * Writes all postings (surface form, weight, document id) for <code>term</code> |
| */ |
| public void write(BytesRef term, TermsEnum termsEnum) throws IOException { |
| postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.PAYLOADS); |
| builder.startTerm(term); |
| int docFreq = 0; |
| while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) { |
| int docID = postingsEnum.docID(); |
| for (int i = 0; i < postingsEnum.freq(); i++) { |
| postingsEnum.nextPosition(); |
| assert postingsEnum.getPayload() != null; |
| BytesRef payload = postingsEnum.getPayload(); |
| ByteArrayDataInput input = new ByteArrayDataInput(payload.bytes, payload.offset, payload.length); |
| int len = input.readVInt(); |
| scratch.grow(len); |
| scratch.setLength(len); |
| input.readBytes(scratch.bytes(), 0, scratch.length()); |
| long weight = input.readVInt() - 1; |
| maxWeight = Math.max(maxWeight, weight); |
| minWeight = Math.min(minWeight, weight); |
| byte type = input.readByte(); |
| if (first) { |
| this.type = type; |
| first = false; |
| } else if (this.type != type) { |
| throw new IllegalArgumentException("single field name has mixed types"); |
| } |
| builder.addEntry(docID, scratch.get(), weight); |
| } |
| docFreq++; |
| docCount = Math.max(docCount, docFreq + 1); |
| } |
| builder.finishTerm(); |
| } |
| } |
| } |