// blob: 824b6b5a7b23cd27ae05f62a13187468fd6fbf29 [file] [log] [blame]
package org.apache.lucene.codecs.ramonly;
/**
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
import java.io.IOException;
import java.util.ArrayList;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.SortedMap;
import java.util.TreeMap;
import java.util.concurrent.atomic.AtomicInteger;
import org.apache.lucene.codecs.FieldsConsumer;
import org.apache.lucene.codecs.FieldsProducer;
import org.apache.lucene.codecs.PostingsConsumer;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.TermStats;
import org.apache.lucene.codecs.TermsConsumer;
import org.apache.lucene.index.DocsAndPositionsEnum;
import org.apache.lucene.index.DocsEnum;
import org.apache.lucene.index.FieldInfo.IndexOptions;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.FieldsEnum;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.SegmentInfo;
import org.apache.lucene.index.SegmentReadState;
import org.apache.lucene.index.SegmentWriteState;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.IndexInput;
import org.apache.lucene.store.IndexOutput;
import org.apache.lucene.util.Bits;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CodecUtil;
import org.apache.lucene.util.IOUtils;
/** Stores all postings data in RAM, but writes a small
* token (header + single int) to identify which "slot" the
* index is using in RAM HashMap.
*
* NOTE: this codec sorts terms by reverse-unicode-order! */
public class RAMOnlyPostingsFormat extends PostingsFormat {
// For fun, test that we can override how terms are
// sorted, and basic things still work -- this comparator
// sorts in reversed unicode code point order:
private static final Comparator<BytesRef> reverseUnicodeComparator = new Comparator<BytesRef>() {
public int compare(BytesRef t1, BytesRef t2) {
byte[] b1 = t1.bytes;
byte[] b2 = t2.bytes;
int b1Stop;
int b1Upto = t1.offset;
int b2Upto = t2.offset;
if (t1.length < t2.length) {
b1Stop = t1.offset + t1.length;
} else {
b1Stop = t1.offset + t2.length;
}
while(b1Upto < b1Stop) {
final int bb1 = b1[b1Upto++] & 0xff;
final int bb2 = b2[b2Upto++] & 0xff;
if (bb1 != bb2) {
//System.out.println("cmp 1=" + t1 + " 2=" + t2 + " return " + (bb2-bb1));
return bb2 - bb1;
}
}
// One is prefix of another, or they are equal
return t2.length-t1.length;
}
@Override
public boolean equals(Object other) {
return this == other;
}
};
  /** Sole constructor; registers this format under the name {@code "RAMOnly"}. */
  public RAMOnlyPostingsFormat() {
    super("RAMOnly");
  }
// Postings state:
  /**
   * In-memory per-segment postings: a sorted map from field name to its
   * {@link RAMField}. Populated at index time by {@link RAMFieldsConsumer}
   * and served back, unchanged, at search time.
   */
  static class RAMPostings extends FieldsProducer {
    // TreeMap so iterator() enumerates fields in sorted name order.
    final Map<String,RAMField> fieldToTerms = new TreeMap<String,RAMField>();

    @Override
    public Terms terms(String field) {
      // null when the field was never indexed with this codec.
      return fieldToTerms.get(field);
    }

    @Override
    public int getUniqueFieldCount() {
      return fieldToTerms.size();
    }

    @Override
    public FieldsEnum iterator() {
      return new RAMFieldsEnum(this);
    }

    @Override
    public void close() {
      // Nothing to release: all state lives on the heap.
    }
  }
  /**
   * One indexed field: its terms (sorted by natural String order in the
   * TreeMap) and the aggregate statistics filled in by
   * {@link RAMTermsConsumer#finish}.
   */
  static class RAMField extends Terms {
    final String field;
    final SortedMap<String,RAMTerm> termToDocs = new TreeMap<String,RAMTerm>();
    // Aggregate stats; written once by RAMTermsConsumer.finish().
    long sumTotalTermFreq;
    long sumDocFreq;
    int docCount;

    RAMField(String field) {
      this.field = field;
    }

    @Override
    public long getUniqueTermCount() {
      return termToDocs.size();
    }

    @Override
    public long getSumTotalTermFreq() {
      return sumTotalTermFreq;
    }

    @Override
    public long getSumDocFreq() throws IOException {
      return sumDocFreq;
    }

    @Override
    public int getDocCount() throws IOException {
      return docCount;
    }

    @Override
    public TermsEnum iterator(TermsEnum reuse) {
      // reuse is ignored; a fresh enum is cheap here.
      return new RAMTermsEnum(RAMOnlyPostingsFormat.RAMField.this);
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // NOTE(review): advertises reverse-unicode order, but the TreeMap
      // above stores terms in natural String order and RAMTermsEnum
      // enumerates in that order -- confirm which ordering callers rely on.
      return reverseUnicodeComparator;
    }
  }
  /** One term: its text, total frequency, and per-document postings in docID order. */
  static class RAMTerm {
    final String term;
    // Set by RAMTermsConsumer.finishTerm from the TermStats.
    long totalTermFreq;
    final List<RAMDoc> docs = new ArrayList<RAMDoc>();

    public RAMTerm(String term) {
      this.term = term;
    }
  }
  /** One document's postings for a term: positions (length == freq) and optional per-position payloads. */
  static class RAMDoc {
    final int docID;
    final int[] positions;
    // Lazily allocated by RAMPostingsWriterImpl only when a payload is seen;
    // entries may be null for positions without payloads.
    byte[][] payloads;

    public RAMDoc(int docID, int freq) {
      this.docID = docID;
      positions = new int[freq];
    }
  }
// Classes for writing to the postings state
private static class RAMFieldsConsumer extends FieldsConsumer {
private final RAMPostings postings;
private final RAMTermsConsumer termsConsumer = new RAMTermsConsumer();
public RAMFieldsConsumer(RAMPostings postings) {
this.postings = postings;
}
@Override
public TermsConsumer addField(FieldInfo field) {
if (field.indexOptions.compareTo(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS) >= 0) {
throw new UnsupportedOperationException("this codec cannot index offsets");
}
RAMField ramField = new RAMField(field.name);
postings.fieldToTerms.put(field.name, ramField);
termsConsumer.reset(ramField);
return termsConsumer;
}
@Override
public void close() {
// TODO: finalize stuff
}
}
  /**
   * Accumulates the terms of one field. Stateful protocol: reset(field), then
   * per term startTerm -> (postings written) -> finishTerm, and finally finish
   * with the field-level stats.
   */
  private static class RAMTermsConsumer extends TermsConsumer {
    private RAMField field;
    private final RAMPostingsWriterImpl postingsWriter = new RAMPostingsWriterImpl();
    // Term currently being written, between startTerm and finishTerm.
    RAMTerm current;

    void reset(RAMField field) {
      this.field = field;
    }

    @Override
    public PostingsConsumer startTerm(BytesRef text) {
      final String term = text.utf8ToString();
      current = new RAMTerm(term);
      postingsWriter.reset(current);
      return postingsWriter;
    }

    @Override
    public Comparator<BytesRef> getComparator() {
      // Terms arrive in UTF8/Unicode order, matching the TreeMap's String order.
      return BytesRef.getUTF8SortedAsUnicodeComparator();
    }

    @Override
    public void finishTerm(BytesRef text, TermStats stats) {
      assert stats.docFreq > 0;
      assert stats.docFreq == current.docs.size();
      current.totalTermFreq = stats.totalTermFreq;
      field.termToDocs.put(current.term, current);
    }

    @Override
    public void finish(long sumTotalTermFreq, long sumDocFreq, int docCount) {
      field.sumTotalTermFreq = sumTotalTermFreq;
      field.sumDocFreq = sumDocFreq;
      field.docCount = docCount;
    }
  }
public static class RAMPostingsWriterImpl extends PostingsConsumer {
private RAMTerm term;
private RAMDoc current;
private int posUpto = 0;
public void reset(RAMTerm term) {
this.term = term;
}
@Override
public void startDoc(int docID, int freq) {
current = new RAMDoc(docID, freq);
term.docs.add(current);
posUpto = 0;
}
@Override
public void addPosition(int position, BytesRef payload, int startOffset, int endOffset) {
assert startOffset == -1;
assert endOffset == -1;
current.positions[posUpto] = position;
if (payload != null && payload.length > 0) {
if (current.payloads == null) {
current.payloads = new byte[current.positions.length][];
}
byte[] bytes = current.payloads[posUpto] = new byte[payload.length];
System.arraycopy(payload.bytes, payload.offset, bytes, 0, payload.length);
}
posUpto++;
}
@Override
public void finishDoc() {
assert posUpto == current.positions.length;
}
}
// Classes for reading from the postings state
static class RAMFieldsEnum extends FieldsEnum {
private final RAMPostings postings;
private final Iterator<String> it;
private String current;
public RAMFieldsEnum(RAMPostings postings) {
this.postings = postings;
this.it = postings.fieldToTerms.keySet().iterator();
}
@Override
public String next() {
if (it.hasNext()) {
current = it.next();
} else {
current = null;
}
return current;
}
@Override
public Terms terms() {
return postings.fieldToTerms.get(current);
}
}
static class RAMTermsEnum extends TermsEnum {
Iterator<String> it;
String current;
private final RAMField ramField;
public RAMTermsEnum(RAMField field) {
this.ramField = field;
}
@Override
public Comparator<BytesRef> getComparator() {
return BytesRef.getUTF8SortedAsUnicodeComparator();
}
@Override
public BytesRef next() {
if (it == null) {
if (current == null) {
it = ramField.termToDocs.keySet().iterator();
} else {
it = ramField.termToDocs.tailMap(current).keySet().iterator();
}
}
if (it.hasNext()) {
current = it.next();
return new BytesRef(current);
} else {
return null;
}
}
@Override
public SeekStatus seekCeil(BytesRef term, boolean useCache) {
current = term.utf8ToString();
it = null;
if (ramField.termToDocs.containsKey(current)) {
return SeekStatus.FOUND;
} else {
if (current.compareTo(ramField.termToDocs.lastKey()) > 0) {
return SeekStatus.END;
} else {
return SeekStatus.NOT_FOUND;
}
}
}
@Override
public void seekExact(long ord) {
throw new UnsupportedOperationException();
}
@Override
public long ord() {
throw new UnsupportedOperationException();
}
@Override
public BytesRef term() {
// TODO: reuse BytesRef
return new BytesRef(current);
}
@Override
public int docFreq() {
return ramField.termToDocs.get(current).docs.size();
}
@Override
public long totalTermFreq() {
return ramField.termToDocs.get(current).totalTermFreq;
}
@Override
public DocsEnum docs(Bits liveDocs, DocsEnum reuse, boolean needsFreqs) {
return new RAMDocsEnum(ramField.termToDocs.get(current), liveDocs);
}
@Override
public DocsAndPositionsEnum docsAndPositions(Bits liveDocs, DocsAndPositionsEnum reuse, boolean needsOffsets) {
if (needsOffsets) {
return null;
}
return new RAMDocsAndPositionsEnum(ramField.termToDocs.get(current), liveDocs);
}
}
private static class RAMDocsEnum extends DocsEnum {
private final RAMTerm ramTerm;
private final Bits liveDocs;
private RAMDoc current;
int upto = -1;
int posUpto = 0;
public RAMDocsEnum(RAMTerm ramTerm, Bits liveDocs) {
this.ramTerm = ramTerm;
this.liveDocs = liveDocs;
}
@Override
public int advance(int targetDocID) {
do {
nextDoc();
} while (upto < ramTerm.docs.size() && current.docID < targetDocID);
return NO_MORE_DOCS;
}
// TODO: override bulk read, for better perf
@Override
public int nextDoc() {
while(true) {
upto++;
if (upto < ramTerm.docs.size()) {
current = ramTerm.docs.get(upto);
if (liveDocs == null || liveDocs.get(current.docID)) {
posUpto = 0;
return current.docID;
}
} else {
return NO_MORE_DOCS;
}
}
}
@Override
public int freq() {
return current.positions.length;
}
@Override
public int docID() {
return current.docID;
}
}
private static class RAMDocsAndPositionsEnum extends DocsAndPositionsEnum {
private final RAMTerm ramTerm;
private final Bits liveDocs;
private RAMDoc current;
int upto = -1;
int posUpto = 0;
public RAMDocsAndPositionsEnum(RAMTerm ramTerm, Bits liveDocs) {
this.ramTerm = ramTerm;
this.liveDocs = liveDocs;
}
@Override
public int advance(int targetDocID) {
do {
nextDoc();
} while (upto < ramTerm.docs.size() && current.docID < targetDocID);
return NO_MORE_DOCS;
}
// TODO: override bulk read, for better perf
@Override
public int nextDoc() {
while(true) {
upto++;
if (upto < ramTerm.docs.size()) {
current = ramTerm.docs.get(upto);
if (liveDocs == null || liveDocs.get(current.docID)) {
posUpto = 0;
return current.docID;
}
} else {
return NO_MORE_DOCS;
}
}
}
@Override
public int freq() {
return current.positions.length;
}
@Override
public int docID() {
return current.docID;
}
@Override
public int nextPosition() {
return current.positions[posUpto++];
}
@Override
public int startOffset() {
return -1;
}
@Override
public int endOffset() {
return -1;
}
@Override
public boolean hasPayload() {
return current.payloads != null && current.payloads[posUpto-1] != null;
}
@Override
public BytesRef getPayload() {
return new BytesRef(current.payloads[posUpto-1]);
}
}
// Holds all indexes created, keyed by the ID assigned in fieldsConsumer
private final Map<Integer,RAMPostings> state = new HashMap<Integer,RAMPostings>();
private final AtomicInteger nextID = new AtomicInteger();
private final String RAM_ONLY_NAME = "RAMOnly";
private final static int VERSION_START = 0;
private final static int VERSION_LATEST = VERSION_START;
private static final String ID_EXTENSION = "id";
  /**
   * Allocates a fresh RAM slot for the new segment, persists only the slot ID
   * (header + vint) to disk, and returns a consumer that writes the actual
   * postings into the in-heap {@link RAMPostings}.
   */
  @Override
  public FieldsConsumer fieldsConsumer(SegmentWriteState writeState) throws IOException {
    final int id = nextID.getAndIncrement();

    // TODO -- ok to do this up front instead of
    // on close....? should be ok?

    // Write our ID:
    final String idFileName = IndexFileNames.segmentFileName(writeState.segmentName, writeState.segmentSuffix, ID_EXTENSION);
    IndexOutput out = writeState.directory.createOutput(idFileName, writeState.context);
    boolean success = false;
    try {
      CodecUtil.writeHeader(out, RAM_ONLY_NAME, VERSION_LATEST);
      out.writeVInt(id);
      success = true;
    } finally {
      // On failure, close without letting a secondary close() exception
      // mask the original one; on success, propagate close() failures.
      if (!success) {
        IOUtils.closeWhileHandlingException(out);
      } else {
        IOUtils.close(out);
      }
    }

    final RAMPostings postings = new RAMPostings();
    final RAMFieldsConsumer consumer = new RAMFieldsConsumer(postings);

    // Register the slot so fieldsProducer can find it by ID later.
    synchronized(state) {
      state.put(id, postings);
    }
    return consumer;
  }
  /**
   * Reads the slot ID back from the segment's .id file and returns the
   * in-heap postings registered under that ID.
   */
  @Override
  public FieldsProducer fieldsProducer(SegmentReadState readState)
    throws IOException {

    // Load our ID:
    final String idFileName = IndexFileNames.segmentFileName(readState.segmentInfo.name, readState.segmentSuffix, ID_EXTENSION);
    IndexInput in = readState.dir.openInput(idFileName, readState.context);
    boolean success = false;
    final int id;
    try {
      CodecUtil.checkHeader(in, RAM_ONLY_NAME, VERSION_START, VERSION_LATEST);
      id = in.readVInt();
      success = true;
    } finally {
      if (!success) {
        IOUtils.closeWhileHandlingException(in);
      } else {
        IOUtils.close(in);
      }
    }

    synchronized(state) {
      // NOTE(review): returns null if no slot was registered under this ID
      // (e.g. the index was written by a different JVM/instance) -- confirm
      // callers tolerate a null FieldsProducer.
      return state.get(id);
    }
  }
@Override
public void files(SegmentInfo segmentInfo, String segmentSuffix, Set<String> files) {
final String idFileName = IndexFileNames.segmentFileName(segmentInfo.name, segmentSuffix, ID_EXTENSION);
files.add(idFileName);
}
}