| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.index; |
| |
| import java.io.IOException; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.BytesRefBuilder; |
| import org.apache.lucene.util.automaton.CompiledAutomaton; |
| |
| /** |
| * Access to the terms in a specific field. See {@link Fields}. |
| * |
| * @lucene.experimental |
| */ |
| public abstract class Terms { |
| |
| /** Sole constructor. (For invocation by subclass constructors, typically implicit.) */ |
| protected Terms() {} |
| |
| /** Returns an iterator that will step through all terms. This method will not return null. */ |
| public abstract TermsEnum iterator() throws IOException; |
| |
| /** |
| * Returns a TermsEnum that iterates over all terms and documents that are accepted by the |
| * provided {@link CompiledAutomaton}. If the <code>startTerm</code> is provided then the returned |
| * enum will only return terms {@code > startTerm}, but you still must call next() first to get to |
| * the first term. Note that the provided <code>startTerm</code> must be accepted by the |
| * automaton. |
| * |
| * <p>This is an expert low-level API and will only work for {@code NORMAL} compiled automata. To |
| * handle any compiled automata you should instead use {@link CompiledAutomaton#getTermsEnum} |
| * instead. |
| * |
| * <p><b>NOTE</b>: the returned TermsEnum cannot seek. |
| */ |
| public TermsEnum intersect(CompiledAutomaton compiled, final BytesRef startTerm) |
| throws IOException { |
| |
| // TODO: could we factor out a common interface b/w |
| // CompiledAutomaton and FST? Then we could pass FST there too, |
| // and likely speed up resolving terms to deleted docs ... but |
| // AutomatonTermsEnum makes this tricky because of its on-the-fly cycle |
| // detection |
| |
| // TODO: eventually we could support seekCeil/Exact on |
| // the returned enum, instead of only being able to seek |
| // at the start |
| |
| TermsEnum termsEnum = iterator(); |
| |
| if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) { |
| throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead"); |
| } |
| |
| if (startTerm == null) { |
| return new AutomatonTermsEnum(termsEnum, compiled); |
| } else { |
| return new AutomatonTermsEnum(termsEnum, compiled) { |
| @Override |
| protected BytesRef nextSeekTerm(BytesRef term) throws IOException { |
| if (term == null) { |
| term = startTerm; |
| } |
| return super.nextSeekTerm(term); |
| } |
| }; |
| } |
| } |
| |
| /** |
| * Returns the number of terms for this field, or -1 if this measure isn't stored by the codec. |
| * Note that, just like other term measures, this measure does not take deleted documents into |
| * account. |
| */ |
| public abstract long size() throws IOException; |
| |
| /** |
| * Returns the sum of {@link TermsEnum#totalTermFreq} for all terms in this field. Note that, just |
| * like other term measures, this measure does not take deleted documents into account. |
| */ |
| public abstract long getSumTotalTermFreq() throws IOException; |
| |
| /** |
| * Returns the sum of {@link TermsEnum#docFreq()} for all terms in this field. Note that, just |
| * like other term measures, this measure does not take deleted documents into account. |
| */ |
| public abstract long getSumDocFreq() throws IOException; |
| |
| /** |
| * Returns the number of documents that have at least one term for this field. Note that, just |
| * like other term measures, this measure does not take deleted documents into account. |
| */ |
| public abstract int getDocCount() throws IOException; |
| |
| /** |
| * Returns true if documents in this field store per-document term frequency ({@link |
| * PostingsEnum#freq}). |
| */ |
| public abstract boolean hasFreqs(); |
| |
| /** Returns true if documents in this field store offsets. */ |
| public abstract boolean hasOffsets(); |
| |
| /** Returns true if documents in this field store positions. */ |
| public abstract boolean hasPositions(); |
| |
| /** Returns true if documents in this field store payloads. */ |
| public abstract boolean hasPayloads(); |
| |
| /** Zero-length array of {@link Terms}. */ |
| public static final Terms[] EMPTY_ARRAY = new Terms[0]; |
| |
| /** |
| * Returns the smallest term (in lexicographic order) in the field. Note that, just like other |
| * term measures, this measure does not take deleted documents into account. This returns null |
| * when there are no terms. |
| */ |
| public BytesRef getMin() throws IOException { |
| return iterator().next(); |
| } |
| |
| /** |
| * Returns the largest term (in lexicographic order) in the field. Note that, just like other term |
| * measures, this measure does not take deleted documents into account. This returns null when |
| * there are no terms. |
| */ |
| @SuppressWarnings("fallthrough") |
| public BytesRef getMax() throws IOException { |
| long size = size(); |
| |
| if (size == 0) { |
| // empty: only possible from a FilteredTermsEnum... |
| return null; |
| } else if (size >= 0) { |
| // try to seek-by-ord |
| try { |
| TermsEnum iterator = iterator(); |
| iterator.seekExact(size - 1); |
| return iterator.term(); |
| } catch (UnsupportedOperationException e) { |
| // ok |
| } |
| } |
| |
| // otherwise: binary search |
| TermsEnum iterator = iterator(); |
| BytesRef v = iterator.next(); |
| if (v == null) { |
| // empty: only possible from a FilteredTermsEnum... |
| return v; |
| } |
| |
| BytesRefBuilder scratch = new BytesRefBuilder(); |
| scratch.append((byte) 0); |
| |
| // Iterates over digits: |
| while (true) { |
| |
| int low = 0; |
| int high = 256; |
| |
| // Binary search current digit to find the highest |
| // digit before END: |
| while (low != high) { |
| int mid = (low + high) >>> 1; |
| scratch.setByteAt(scratch.length() - 1, (byte) mid); |
| if (iterator.seekCeil(scratch.get()) == TermsEnum.SeekStatus.END) { |
| // Scratch was too high |
| if (mid == 0) { |
| scratch.setLength(scratch.length() - 1); |
| return scratch.get(); |
| } |
| high = mid; |
| } else { |
| // Scratch was too low; there is at least one term |
| // still after it: |
| if (low == mid) { |
| break; |
| } |
| low = mid; |
| } |
| } |
| |
| // Recurse to next digit: |
| scratch.setLength(scratch.length() + 1); |
| scratch.grow(scratch.length()); |
| } |
| } |
| |
| /** Expert: returns additional information about this Terms instance for debugging purposes. */ |
| public Object getStats() throws IOException { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("impl=").append(getClass().getSimpleName()); |
| sb.append(",size=").append(size()); |
| sb.append(",docCount=").append(getDocCount()); |
| sb.append(",sumTotalTermFreq=").append(getSumTotalTermFreq()); |
| sb.append(",sumDocFreq=").append(getSumDocFreq()); |
| return sb.toString(); |
| } |
| } |