| package org.apache.lucene.codecs.ramonly; |
| |
| /** |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Comparator; |
| import java.util.HashMap; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| import java.util.SortedMap; |
| import java.util.TreeMap; |
| import java.util.concurrent.atomic.AtomicInteger; |
| |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.FieldsProducer; |
| import org.apache.lucene.codecs.PostingsConsumer; |
| import org.apache.lucene.codecs.PostingsFormat; |
| import org.apache.lucene.codecs.TermStats; |
| import org.apache.lucene.codecs.TermsConsumer; |
| import org.apache.lucene.index.DocsAndPositionsEnum; |
| import org.apache.lucene.index.DocsEnum; |
| import org.apache.lucene.index.FieldInfo.IndexOptions; |
| import org.apache.lucene.index.FieldInfo; |
| import org.apache.lucene.index.FieldsEnum; |
| import org.apache.lucene.index.IndexFileNames; |
| import org.apache.lucene.index.SegmentInfo; |
| import org.apache.lucene.index.SegmentReadState; |
| import org.apache.lucene.index.SegmentWriteState; |
| import org.apache.lucene.index.Terms; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.IndexOutput; |
| import org.apache.lucene.util.Bits; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CodecUtil; |
| import org.apache.lucene.util.IOUtils; |
| |
| /** Stores all postings data in RAM, but writes a small |
| * token (header + single int) to identify which "slot" the |
| * index is using in RAM HashMap. |
| * |
| * NOTE: this codec sorts terms by reverse-unicode-order! */ |
| |
| public class RAMOnlyPostingsFormat extends PostingsFormat { |
| |
| // For fun, test that we can override how terms are |
| // sorted, and basic things still work -- this comparator |
| // sorts in reversed unicode code point order: |
| private static final Comparator<BytesRef> reverseUnicodeComparator = new Comparator<BytesRef>() { |
| public int compare(BytesRef t1, BytesRef t2) { |
| byte[] b1 = t1.bytes; |
| byte[] b2 = t2.bytes; |
| int b1Stop; |
| int b1Upto = t1.offset; |
| int b2Upto = t2.offset; |
| if (t1.length < t2.length) { |
| b1Stop = t1.offset + t1.length; |
| } else { |
| b1Stop = t1.offset + t2.length; |
| } |
| while(b1Upto < b1Stop) { |
| final int bb1 = b1[b1Upto++] & 0xff; |
| final int bb2 = b2[b2Upto++] & 0xff; |
| if (bb1 != bb2) { |
| //System.out.println("cmp 1=" + t1 + " 2=" + t2 + " return " + (bb2-bb1)); |
| return bb2 - bb1; |
| } |
| } |
| |
| // One is prefix of another, or they are equal |
| return t2.length-t1.length; |
| } |
| |
| @Override |
| public boolean equals(Object other) { |
| return this == other; |
| } |
| }; |
| |
  /** Sole constructor; registers this format under the name "RAMOnly". */
  public RAMOnlyPostingsFormat() {
    super("RAMOnly");
  }
| |
| // Postings state: |
| static class RAMPostings extends FieldsProducer { |
| final Map<String,RAMField> fieldToTerms = new TreeMap<String,RAMField>(); |
| |
| @Override |
| public Terms terms(String field) { |
| return fieldToTerms.get(field); |
| } |
| |
| @Override |
| public int getUniqueFieldCount() { |
| return fieldToTerms.size(); |
| } |
| |
| @Override |
| public FieldsEnum iterator() { |
| return new RAMFieldsEnum(this); |
| } |
| |
| @Override |
| public void close() { |
| } |
| } |
| |
| static class RAMField extends Terms { |
| final String field; |
| final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>(); |
| long sumTotalTermFreq; |
| long sumDocFreq; |
| int docCount; |
| |
| RAMField(String field) { |
| this.field = field; |
| } |
| |
| @Override |
| public long getUniqueTermCount() { |
| return termToDocs.size(); |
| } |
| |
| @Override |
| public long getSumTotalTermFreq() { |
| return sumTotalTermFreq; |
| } |
| |
| @Override |
| public long getSumDocFreq() throws IOException { |
| return sumDocFreq; |
| } |
| |
| @Override |
| public int getDocCount() throws IOException { |
| return docCount; |
| } |
| |
| @Override |
| public TermsEnum iterator(TermsEnum reuse) { |
| return new RAMTermsEnum(RAMOnlyPostingsFormat.RAMField.this); |
| } |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| return reverseUnicodeComparator; |
| } |
| } |
| |
  // Holds all in-RAM postings for a single term
  static class RAMTerm {
    final String term;
    // Total occurrence count of this term across all docs
    long totalTermFreq;
    // One entry per document containing the term, in docID order
    final List<RAMDoc> docs = new ArrayList<RAMDoc>();
    public RAMTerm(String term) {
      this.term = term;
    }
  }
| |
  // Postings for one term within one document
  static class RAMDoc {
    final int docID;
    // One entry per occurrence; length == freq for this doc
    final int[] positions;
    // Parallel to positions; lazily allocated, null when no payloads
    byte[][] payloads;

    public RAMDoc(int docID, int freq) {
      this.docID = docID;
      positions = new int[freq];
    }
  }
| |
| // Classes for writing to the postings state |
| private static class RAMFieldsConsumer extends FieldsConsumer { |
| |
| private final RAMPostings postings; |
| private final RAMTermsConsumer termsConsumer = new RAMTermsConsumer(); |
| |
| public RAMFieldsConsumer(RAMPostings postings) { |
| this.postings = postings; |
| } |
| |
| @Override |
| public TermsConsumer addField(FieldInfo field) { |
| if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) { |
| throw new UnsupportedOperationException("this codec cannot index offsets"); |
| } |
| RAMField ramField = new RAMField(field.name); |
| postings.fieldToTerms.put(field.name, ramField); |
| termsConsumer.reset(ramField); |
| return termsConsumer; |
| } |
| |
| @Override |
| public void close() { |
| // TODO: finalize stuff |
| } |
| } |
| |
| private static class RAMTermsConsumer extends TermsConsumer { |
| private RAMField field; |
| private final RAMPostingsWriterImpl postingsWriter = new RAMPostingsWriterImpl(); |
| RAMTerm current; |
| |
| void reset(RAMField field) { |
| this.field = field; |
| } |
| |
| @Override |
| public PostingsConsumer startTerm(BytesRef text) { |
| final String term = text.utf8ToString(); |
| current = new RAMTerm(term); |
| postingsWriter.reset(current); |
| return postingsWriter; |
| } |
| |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| public void finishTerm(BytesRef text, TermStats stats) { |
| assert stats.docFreq > 0; |
| assert stats.docFreq == current.docs.size(); |
| current.totalTermFreq = stats.totalTermFreq; |
| field.termToDocs.put(current.term, current); |
| } |
| |
| @Override |
| public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) { |
| field.sumTotalTermFreq = sumTotalTermFreq; |
| field.sumDocFreq = sumDocFreq; |
| field.docCount = docCount; |
| } |
| } |
| |
| public static class RAMPostingsWriterImpl extends PostingsConsumer { |
| private RAMTerm term; |
| private RAMDoc current; |
| private int posUpto = 0; |
| |
| public void reset(RAMTerm term) { |
| this.term = term; |
| } |
| |
| @Override |
| public void startDoc(int docID, int freq) { |
| current = new RAMDoc(docID, freq); |
| term.docs.add(current); |
| posUpto = 0; |
| } |
| |
| @Override |
| public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) { |
| assert startOffset == -1; |
| assert endOffset == -1; |
| current.positions[posUpto] = position; |
| if (payload != null && payload.length > 0) { |
| if (current.payloads == null) { |
| current.payloads = new byte[current.positions.length][]; |
| } |
| byte[] bytes = current.payloads[posUpto] = new byte[payload.length]; |
| System.arraycopy(payload.bytes, payload.offset, bytes, 0, payload.length); |
| } |
| posUpto++; |
| } |
| |
| @Override |
| public void finishDoc() { |
| assert posUpto == current.positions.length; |
| } |
| } |
| |
| // Classes for reading from the postings state |
| static class RAMFieldsEnum extends FieldsEnum { |
| private final RAMPostings postings; |
| private final Iterator<String> it; |
| private String current; |
| |
| public RAMFieldsEnum(RAMPostings postings) { |
| this.postings = postings; |
| this.it = postings.fieldToTerms.keySet().iterator(); |
| } |
| |
| @Override |
| public String next() { |
| if (it.hasNext()) { |
| current = it.next(); |
| } else { |
| current = null; |
| } |
| return current; |
| } |
| |
| @Override |
| public Terms terms() { |
| return postings.fieldToTerms.get(current); |
| } |
| } |
| |
| static class RAMTermsEnum extends TermsEnum { |
| Iterator<String> it; |
| String current; |
| private final RAMField ramField; |
| |
| public RAMTermsEnum(RAMField field) { |
| this.ramField = field; |
| } |
| |
| @Override |
| public Comparator<BytesRef> getComparator() { |
| return BytesRef.getUTF8SortedAsUnicodeComparator(); |
| } |
| |
| @Override |
| public BytesRef next() { |
| if (it == null) { |
| if (current == null) { |
| it = ramField.termToDocs.keySet().iterator(); |
| } else { |
| it = ramField.termToDocs.tailMap(current).keySet().iterator(); |
| } |
| } |
| if (it.hasNext()) { |
| current = it.next(); |
| return new BytesRef(current); |
| } else { |
| return null; |
| } |
| } |
| |
| @Override |
| public SeekStatus seekCeil(BytesRef term, boolean useCache) { |
| current = term.utf8ToString(); |
| it = null; |
| if (ramField.termToDocs.containsKey(current)) { |
| return SeekStatus.FOUND; |
| } else { |
| if (current.compareTo(ramField.termToDocs.lastKey()) > 0) { |
| return SeekStatus.END; |
| } else { |
| return SeekStatus.NOT_FOUND; |
| } |
| } |
| } |
| |
| @Override |
| public void seekExact(long ord) { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public long ord() { |
| throw new UnsupportedOperationException(); |
| } |
| |
| @Override |
| public BytesRef term() { |
| // TODO: reuse BytesRef |
| return new BytesRef(current); |
| } |
| |
| @Override |
| public int docFreq() { |
| return ramField.termToDocs.get(current).docs.size(); |
| } |
| |
| @Override |
| public long totalTermFreq() { |
| return ramField.termToDocs.get(current).totalTermFreq; |
| } |
| |
| @Override |
| public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) { |
| return new RAMDocsEnum(ramField.termToDocs.get(current), liveDocs); |
| } |
| |
| @Override |
| public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) { |
| if (needsOffsets) { |
| return null; |
| } |
| return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), liveDocs); |
| } |
| } |
| |
| private static class RAMDocsEnum extends DocsEnum { |
| private final RAMTerm ramTerm; |
| private final Bits liveDocs; |
| private RAMDoc current; |
| int upto = -1; |
| int posUpto = 0; |
| |
| public RAMDocsEnum(RAMTerm ramTerm, Bits liveDocs) { |
| this.ramTerm = ramTerm; |
| this.liveDocs = liveDocs; |
| } |
| |
| @Override |
| public int advance(int targetDocID) { |
| do { |
| nextDoc(); |
| } while (upto < ramTerm.docs.size() && current.docID < targetDocID); |
| return NO_MORE_DOCS; |
| } |
| |
| // TODO: override bulk read, for better perf |
| @Override |
| public int nextDoc() { |
| while(true) { |
| upto++; |
| if (upto < ramTerm.docs.size()) { |
| current = ramTerm.docs.get(upto); |
| if (liveDocs == null || liveDocs.get(current.docID)) { |
| posUpto = 0; |
| return current.docID; |
| } |
| } else { |
| return NO_MORE_DOCS; |
| } |
| } |
| } |
| |
| @Override |
| public int freq() { |
| return current.positions.length; |
| } |
| |
| @Override |
| public int docID() { |
| return current.docID; |
| } |
| } |
| |
| private static class RAMDocsAndPositionsEnum extends DocsAndPositionsEnum { |
| private final RAMTerm ramTerm; |
| private final Bits liveDocs; |
| private RAMDoc current; |
| int upto = -1; |
| int posUpto = 0; |
| |
| public RAMDocsAndPositionsEnum(RAMTerm ramTerm, Bits liveDocs) { |
| this.ramTerm = ramTerm; |
| this.liveDocs = liveDocs; |
| } |
| |
| @Override |
| public int advance(int targetDocID) { |
| do { |
| nextDoc(); |
| } while (upto < ramTerm.docs.size() && current.docID < targetDocID); |
| return NO_MORE_DOCS; |
| } |
| |
| // TODO: override bulk read, for better perf |
| @Override |
| public int nextDoc() { |
| while(true) { |
| upto++; |
| if (upto < ramTerm.docs.size()) { |
| current = ramTerm.docs.get(upto); |
| if (liveDocs == null || liveDocs.get(current.docID)) { |
| posUpto = 0; |
| return current.docID; |
| } |
| } else { |
| return NO_MORE_DOCS; |
| } |
| } |
| } |
| |
| @Override |
| public int freq() { |
| return current.positions.length; |
| } |
| |
| @Override |
| public int docID() { |
| return current.docID; |
| } |
| |
| @Override |
| public int nextPosition() { |
| return current.positions[posUpto++]; |
| } |
| |
| @Override |
| public int startOffset() { |
| return -1; |
| } |
| |
| @Override |
| public int endOffset() { |
| return -1; |
| } |
| |
| @Override |
| public boolean hasPayload() { |
| return current.payloads != null && current.payloads[posUpto-1] != null; |
| } |
| |
| @Override |
| public BytesRef getPayload() { |
| return new BytesRef(current.payloads[posUpto-1]); |
| } |
| } |
| |
| // Holds all indexes created, keyed by the ID assigned in fieldsConsumer |
| private final Map<Integer,RAMPostings> state = new HashMap<Integer,RAMPostings>(); |
| |
| private final AtomicInteger nextID = new AtomicInteger(); |
| |
| private final String RAM_ONLY_NAME = "RAMOnly"; |
| private final static int VERSION_START = 0; |
| private final static int VERSION_LATEST = VERSION_START; |
| |
| private static final String ID_EXTENSION = "id"; |
| |
| @Override |
| public FieldsConsumer fieldsConsumer(SegmentWriteState writeState) throws IOException { |
| final int id = nextID.getAndIncrement(); |
| |
| // TODO -- ok to do this up front instead of |
| // on close....? should be ok? |
| // Write our ID: |
| final String idFileName = IndexFileNames.segmentFileName(writeState.segmentName, writeState.segmentSuffix, ID_EXTENSION); |
| IndexOutput out = writeState.directory.createOutput(idFileName, writeState.context); |
| boolean success = false; |
| try { |
| CodecUtil.writeHeader(out, RAM_ONLY_NAME, VERSION_LATEST); |
| out.writeVInt(id); |
| success = true; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(out); |
| } else { |
| IOUtils.close(out); |
| } |
| } |
| |
| final RAMPostings postings = new RAMPostings(); |
| final RAMFieldsConsumer consumer = new RAMFieldsConsumer(postings); |
| |
| synchronized(state) { |
| state.put(id, postings); |
| } |
| return consumer; |
| } |
| |
| @Override |
| public FieldsProducer fieldsProducer(SegmentReadState readState) |
| throws IOException { |
| |
| // Load our ID: |
| final String idFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, ID_EXTENSION); |
| IndexInput in = readState.dir.openInput(idFileName, readState.context); |
| boolean success = false; |
| final int id; |
| try { |
| CodecUtil.checkHeader(in, RAM_ONLY_NAME, VERSION_START, VERSION_LATEST); |
| id = in.readVInt(); |
| success = true; |
| } finally { |
| if (!success) { |
| IOUtils.closeWhileHandlingException(in); |
| } else { |
| IOUtils.close(in); |
| } |
| } |
| |
| synchronized(state) { |
| return state.get(id); |
| } |
| } |
| |
| @Override |
| public void files(SegmentInfo segmentInfo, String segmentSuffix, Set<String> files) { |
| final String idFileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, ID_EXTENSION); |
| files.add(idFileName); |
| } |
| } |