| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| import java.io.IOException; |
| import java.util.Objects; |
| |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.RAMFile; |
| import org.apache.lucene.store.RAMInputStream; |
| import org.apache.lucene.store.RAMOutputStream; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.StringHelper; |
| |
| /** |
| * Prefix codes term instances (prefixes are shared). This is expected to be |
| * faster to build than a FST and might also be more compact if there are no |
| * common suffixes. |
| * @lucene.internal |
| */ |
| public class PrefixCodedTerms implements Accountable { |
| final RAMFile buffer; |
| private final long size; |
| private long delGen; |
| |
| private PrefixCodedTerms(RAMFile buffer, long size) { |
| this.buffer = Objects.requireNonNull(buffer); |
| this.size = size; |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return buffer.ramBytesUsed() + 2 * Long.BYTES; |
| } |
| |
| /** Records del gen for this packet. */ |
| public void setDelGen(long delGen) { |
| this.delGen = delGen; |
| } |
| |
| /** Builds a PrefixCodedTerms: call add repeatedly, then finish. */ |
| public static class Builder { |
| private RAMFile buffer = new RAMFile(); |
| private RAMOutputStream output = new RAMOutputStream(buffer, false); |
| private Term lastTerm = new Term(""); |
| private BytesRefBuilder lastTermBytes = new BytesRefBuilder(); |
| private long size; |
| |
| /** Sole constructor. */ |
| public Builder() {} |
| |
| /** add a term */ |
| public void add(Term term) { |
| add(term.field(), term.bytes()); |
| } |
| |
| /** add a term. This fully consumes in the incoming {@link BytesRef}. */ |
| public void add(String field, BytesRef bytes) { |
| assert lastTerm.equals(new Term("")) || new Term(field, bytes).compareTo(lastTerm) > 0; |
| |
| try { |
| final int prefix; |
| if (size > 0 && field.equals(lastTerm.field)) { |
| // same field as the last term |
| prefix = StringHelper.bytesDifference(lastTerm.bytes, bytes); |
| output.writeVInt(prefix << 1); |
| } else { |
| // field change |
| prefix = 0; |
| output.writeVInt(1); |
| output.writeString(field); |
| } |
| |
| int suffix = bytes.length - prefix; |
| output.writeVInt(suffix); |
| output.writeBytes(bytes.bytes, bytes.offset + prefix, suffix); |
| lastTermBytes.copyBytes(bytes); |
| lastTerm.bytes = lastTermBytes.get(); |
| lastTerm.field = field; |
| size += 1; |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| /** return finalized form */ |
| public PrefixCodedTerms finish() { |
| try { |
| output.close(); |
| return new PrefixCodedTerms(buffer, size); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| } |
| |
| /** An iterator over the list of terms stored in a {@link PrefixCodedTerms}. */ |
| public static class TermIterator extends FieldTermIterator { |
| final IndexInput input; |
| final BytesRefBuilder builder = new BytesRefBuilder(); |
| final BytesRef bytes = builder.get(); |
| final long end; |
| final long delGen; |
| String field = ""; |
| |
| private TermIterator(long delGen, RAMFile buffer) { |
| try { |
| input = new RAMInputStream("PrefixCodedTermsIterator", buffer); |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| end = input.length(); |
| this.delGen = delGen; |
| } |
| |
| @Override |
| public BytesRef next() { |
| if (input.getFilePointer() < end) { |
| try { |
| int code = input.readVInt(); |
| boolean newField = (code & 1) != 0; |
| if (newField) { |
| field = input.readString(); |
| } |
| int prefix = code >>> 1; |
| int suffix = input.readVInt(); |
| readTermBytes(prefix, suffix); |
| return bytes; |
| } catch (IOException e) { |
| throw new RuntimeException(e); |
| } |
| } else { |
| field = null; |
| return null; |
| } |
| } |
| |
| // TODO: maybe we should freeze to FST or automaton instead? |
| private void readTermBytes(int prefix, int suffix) throws IOException { |
| builder.grow(prefix + suffix); |
| input.readBytes(builder.bytes(), prefix, suffix); |
| builder.setLength(prefix + suffix); |
| } |
| |
| @Override |
| public String field() { |
| return field; |
| } |
| |
| @Override |
| public long delGen() { |
| return delGen; |
| } |
| } |
| |
| /** Return an iterator over the terms stored in this {@link PrefixCodedTerms}. */ |
| public TermIterator iterator() { |
| return new TermIterator(delGen, buffer); |
| } |
| |
| /** Return the number of terms stored in this {@link PrefixCodedTerms}. */ |
| public long size() { |
| return size; |
| } |
| |
| @Override |
| public int hashCode() { |
| int h = buffer.hashCode(); |
| h = 31 * h + (int) (delGen ^ (delGen >>> 32)); |
| return h; |
| } |
| |
| @Override |
| public boolean equals(Object obj) { |
| if (this == obj) return true; |
| if (obj == null) return false; |
| if (getClass() != obj.getClass()) return false; |
| PrefixCodedTerms other = (PrefixCodedTerms) obj; |
| return buffer.equals(other.buffer) && delGen == other.delGen; |
| } |
| } |