| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.IndexReaderContext; |
| import org.apache.lucene.index.LeafReaderContext; |
| import org.apache.lucene.index.PostingsEnum; |
| import org.apache.lucene.index.ReaderUtil; |
| import org.apache.lucene.index.Term; |
| import org.apache.lucene.index.TermState; |
| import org.apache.lucene.index.TermStates; |
| import org.apache.lucene.index.TermsEnum; |
| import org.apache.lucene.search.similarities.Similarity; |
| import org.apache.lucene.search.spans.SpanNearQuery; |
| import org.apache.lucene.util.Accountable; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.IntsRef; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.automaton.Automaton; |
| import org.apache.lucene.util.automaton.Operations; |
| import org.apache.lucene.util.automaton.Transition; |
| |
| import static org.apache.lucene.util.automaton.Operations.DEFAULT_DETERMINIZE_WORK_LIMIT; |
| |
| // TODO |
| // - compare perf to PhraseQuery exact and sloppy |
// - optimize: find terms that are in fact MUST (because all paths
//   through the automaton include that term)
| // - if we ever store posLength in the index, it would be easy[ish] |
| // to take it into account here |
| |
| /** A proximity query that lets you express an automaton, whose |
| * transitions are terms, to match documents. This is a generalization |
| * of other proximity queries like {@link PhraseQuery}, {@link |
| * MultiPhraseQuery} and {@link SpanNearQuery}. It is likely |
 * slow, since it visits every document containing any of the terms (i.e. it
| * acts like a disjunction, not a conjunction like {@link |
| * PhraseQuery}), and then it must merge-sort all positions within each |
| * document to test whether/how many times the automaton matches. |
| * |
| * <p>After creating the query, use {@link #createState}, {@link |
| * #setAccept}, {@link #addTransition} and {@link #addAnyTransition} to |
| * build up the automaton. Once you are done, call {@link #finish} and |
| * then execute the query. |
| * |
| * <p>This code is very new and likely has exciting bugs! |
| * |
| * @lucene.experimental */ |
| |
| public class TermAutomatonQuery extends Query implements Accountable { |
| private static final long BASE_RAM_BYTES = RamUsageEstimator.shallowSizeOfInstance(TermAutomatonQuery.class); |
| |
| private final String field; |
| private final Automaton.Builder builder; |
| Automaton det; |
| private final Map<BytesRef,Integer> termToID = new HashMap<>(); |
| private final Map<Integer,BytesRef> idToTerm = new HashMap<>(); |
| private int anyTermID = -1; |
| |
  /** Sole constructor; the automaton will be matched against the provided field. */
  public TermAutomatonQuery(String field) {
| this.field = field; |
| this.builder = new Automaton.Builder(); |
| } |
| |
| /** Returns a new state; state 0 is always the initial state. */ |
| public int createState() { |
| return builder.createState(); |
| } |
| |
| /** Marks the specified state as accept or not. */ |
| public void setAccept(int state, boolean accept) { |
| builder.setAccept(state, accept); |
| } |
| |
| /** Adds a transition to the automaton. */ |
| public void addTransition(int source, int dest, String term) { |
| addTransition(source, dest, new BytesRef(term)); |
| } |
| |
| /** Adds a transition to the automaton. */ |
| public void addTransition(int source, int dest, BytesRef term) { |
| if (term == null) { |
| throw new NullPointerException("term should not be null"); |
| } |
| builder.addTransition(source, dest, getTermID(term)); |
| } |
| |
| /** Adds a transition matching any term. */ |
| public void addAnyTransition(int source, int dest) { |
| builder.addTransition(source, dest, getTermID(null)); |
| } |
| |
| /** Call this once you are done adding states/transitions. */ |
| public void finish() { |
| finish(DEFAULT_DETERMINIZE_WORK_LIMIT); |
| } |
| |
| /** |
| * Call this once you are done adding states/transitions. |
   * @param determinizeWorkLimit Maximum effort to spend determinizing the automaton. Higher limits
   *     allow this operation to consume more memory but permit more complex automata. Use {@link
| * Operations#DEFAULT_DETERMINIZE_WORK_LIMIT} as a decent default if you don't otherwise know |
| * what to specify. |
| */ |
| public void finish(int determinizeWorkLimit) { |
| Automaton automaton = builder.finish(); |
| |
| // System.out.println("before det:\n" + automaton.toDot()); |
| |
| Transition t = new Transition(); |
| |
    // TODO: should we add "eps back to initial node" for all states,
    // and determinize that?  then we don't need to revisit the initial node at
    // every position?  but the automaton could blow up?  And, this makes it
    // harder to skip useless positions at search time?
| |
| if (anyTermID != -1) { |
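      // ANY transitions were recorded under the sentinel anyTermID; we first
      // reject automata that lead or end with ANY, then widen the remaining
      // ANY transitions so they match the full term-id range.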
| |
| // Make sure there are no leading or trailing ANY: |
| int count = automaton.initTransition(0, t); |
| for(int i=0;i<count;i++) { |
| automaton.getNextTransition(t); |
| if (anyTermID >= t.min && anyTermID <= t.max) { |
| throw new IllegalStateException("automaton cannot lead with an ANY transition"); |
| } |
| } |
| |
| int numStates = automaton.getNumStates(); |
| for(int i=0;i<numStates;i++) { |
| count = automaton.initTransition(i, t); |
| for(int j=0;j<count;j++) { |
| automaton.getNextTransition(t); |
| if (automaton.isAccept(t.dest) && anyTermID >= t.min && anyTermID <= t.max) { |
| throw new IllegalStateException("automaton cannot end with an ANY transition"); |
| } |
| } |
| } |
| |
| int termCount = termToID.size(); |
| |
| // We have to carefully translate these transitions so automaton |
| // realizes they also match all other terms: |
| Automaton newAutomaton = new Automaton(); |
| for(int i=0;i<numStates;i++) { |
| newAutomaton.createState(); |
| newAutomaton.setAccept(i, automaton.isAccept(i)); |
| } |
| |
| for(int i=0;i<numStates;i++) { |
| count = automaton.initTransition(i, t); |
| for(int j=0;j<count;j++) { |
| automaton.getNextTransition(t); |
| int min, max; |
| if (t.min <= anyTermID && anyTermID <= t.max) { |
| // Match any term |
| min = 0; |
| max = termCount-1; |
| } else { |
| min = t.min; |
| max = t.max; |
| } |
| newAutomaton.addTransition(t.source, t.dest, min, max); |
| } |
| } |
| newAutomaton.finishState(); |
| automaton = newAutomaton; |
| } |
| |
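    // Determinize so each (state, term) pair has at most one destination, and
    // prune dead states (those unreachable from the start state or unable to
    // reach an accept state):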
| det = Operations.removeDeadStates(Operations.determinize(automaton, |
| determinizeWorkLimit)); |
| |
| if (det.isAccept(0)) { |
| throw new IllegalStateException("cannot accept the empty string"); |
| } |
| } |
| |
| @Override |
| public Weight createWeight(IndexSearcher searcher, ScoreMode scoreMode, float boost) throws IOException { |
| IndexReaderContext context = searcher.getTopReaderContext(); |
| Map<Integer,TermStates> termStates = new HashMap<>(); |
| |
| for (Map.Entry<BytesRef,Integer> ent : termToID.entrySet()) { |
| if (ent.getKey() != null) { |
| termStates.put(ent.getValue(), TermStates.build(context, new Term(field, ent.getKey()), scoreMode.needsScores())); |
| } |
| } |
| |
| return new TermAutomatonWeight(det, searcher, termStates, boost); |
| } |
| |
| @Override |
| public String toString(String field) { |
| // TODO: what really am I supposed to do with the incoming field... |
| StringBuilder sb = new StringBuilder(); |
| sb.append("TermAutomatonQuery(field="); |
| sb.append(this.field); |
| if (det != null) { |
| sb.append(" numStates="); |
| sb.append(det.getNumStates()); |
| } |
| sb.append(')'); |
| return sb.toString(); |
| } |
| |
  /** Returns the id for this term, assigning the next free id if the term is
   *  new; a null term is the internal sentinel for the ANY transition. */
  private int getTermID(BytesRef term) {
| Integer id = termToID.get(term); |
| if (id == null) { |
| id = termToID.size(); |
| if (term != null) { |
| term = BytesRef.deepCopyOf(term); |
| } |
| termToID.put(term, id); |
| idToTerm.put(id, term); |
| if (term == null) { |
| anyTermID = id; |
| } |
| } |
| |
| return id; |
| } |
| |
  /** Returns true iff <code>other</code> is equal to this. */
| @Override |
| public boolean equals(Object other) { |
| return sameClassAs(other) && |
| equalsTo(getClass().cast(other)); |
| } |
| |
| private static boolean checkFinished(TermAutomatonQuery q) { |
| if (q.det == null) { |
| throw new IllegalStateException("Call finish first on: " + q); |
| } |
| return true; |
| } |
| |
| private boolean equalsTo(TermAutomatonQuery other) { |
| return checkFinished(this) && |
| checkFinished(other) && |
| other == this; |
| } |
| |
| @Override |
| public int hashCode() { |
| checkFinished(this); |
    // LUCENE-7295: this used to be a very awkward toDot() call; it is safer to assume
    // that no two instances are equivalent instead (until somebody finds a better way
    // to quickly check automaton equivalence).
| return System.identityHashCode(this); |
| } |
| |
| @Override |
| public long ramBytesUsed() { |
| return BASE_RAM_BYTES + |
| RamUsageEstimator.sizeOfObject(builder) + |
| RamUsageEstimator.sizeOfObject(det) + |
| RamUsageEstimator.sizeOfObject(field) + |
| RamUsageEstimator.sizeOfObject(idToTerm) + |
| RamUsageEstimator.sizeOfObject(termToID); |
| } |
| |
  /** Returns the dot (graphviz) representation of this automaton, which is
   *  extremely useful for visualizing it; render it with e.g. {@code dot -Tpng}. */
| public String toDot() { |
| |
| // TODO: refactor & share with Automaton.toDot! |
| |
| StringBuilder b = new StringBuilder(); |
| b.append("digraph Automaton {\n"); |
| b.append(" rankdir = LR\n"); |
| final int numStates = det.getNumStates(); |
| if (numStates > 0) { |
| b.append(" initial [shape=plaintext,label=\"0\"]\n"); |
| b.append(" initial -> 0\n"); |
| } |
| |
| Transition t = new Transition(); |
| for(int state=0;state<numStates;state++) { |
| b.append(" "); |
| b.append(state); |
| if (det.isAccept(state)) { |
| b.append(" [shape=doublecircle,label=\"").append(state).append("\"]\n"); |
| } else { |
| b.append(" [shape=circle,label=\"").append(state).append("\"]\n"); |
| } |
| int numTransitions = det.initTransition(state, t); |
| for(int i=0;i<numTransitions;i++) { |
| det.getNextTransition(t); |
| assert t.max >= t.min; |
| for(int j=t.min;j<=t.max;j++) { |
| b.append(" "); |
| b.append(state); |
| b.append(" -> "); |
| b.append(t.dest); |
| b.append(" [label=\""); |
| if (j == anyTermID) { |
| b.append('*'); |
| } else { |
| b.append(idToTerm.get(j).utf8ToString()); |
| } |
| b.append("\"]\n"); |
| } |
| } |
| } |
| b.append('}'); |
| return b.toString(); |
| } |
| |
| // TODO: should we impl rewrite to return BooleanQuery of PhraseQuery, |
| // when 1) automaton is finite, 2) doesn't use ANY transition, 3) is |
| // "small enough"? |
| |
  /** Pairs one term's {@link PostingsEnum} with its current position state, so
   *  the scorer can merge-sort positions across all terms within a document. */
  static class EnumAndScorer {
| public final int termID; |
| public final PostingsEnum posEnum; |
| |
| // How many positions left in the current document: |
| public int posLeft; |
| |
| // Current position |
| public int pos; |
| |
| public EnumAndScorer(int termID, PostingsEnum posEnum) { |
| this.termID = termID; |
| this.posEnum = posEnum; |
| } |
| } |
| |
  /** {@link Weight} for a {@link TermAutomatonQuery}; gathers term statistics
   *  up front and creates a {@link TermAutomatonScorer} per leaf. */
  final class TermAutomatonWeight extends Weight {
| final Automaton automaton; |
| private final Map<Integer,TermStates> termStates; |
| private final Similarity.SimScorer stats; |
| private final Similarity similarity; |
| |
| public TermAutomatonWeight(Automaton automaton, IndexSearcher searcher, Map<Integer,TermStates> termStates, float boost) throws IOException { |
| super(TermAutomatonQuery.this); |
| this.automaton = automaton; |
| this.termStates = termStates; |
| this.similarity = searcher.getSimilarity(); |
| List<TermStatistics> allTermStats = new ArrayList<>(); |
| for(Map.Entry<Integer,BytesRef> ent : idToTerm.entrySet()) { |
| Integer termID = ent.getKey(); |
| if (ent.getValue() != null) { |
| TermStates ts = termStates.get(termID); |
| if (ts.docFreq() > 0) { |
| allTermStats.add(searcher.termStatistics(new Term(field, ent.getValue()), ts.docFreq(), ts.totalTermFreq())); |
| } |
| } |
| } |
| |
| if (allTermStats.isEmpty()) { |
| stats = null; // no terms matched at all, will not use sim |
| } else { |
| stats = similarity.scorer(boost, searcher.collectionStatistics(field), |
| allTermStats.toArray(new TermStatistics[allTermStats.size()])); |
| } |
| } |
| |
| @Override |
| public void extractTerms(Set<Term> terms) { |
| for(BytesRef text : termToID.keySet()) { |
| if (text != null) { |
| terms.add(new Term(field, text)); |
| } |
| } |
| } |
| |
| @Override |
| public String toString() { |
| return "weight(" + TermAutomatonQuery.this + ")"; |
| } |
| |
| @Override |
| public Scorer scorer(LeafReaderContext context) throws IOException { |
| |
| // Initialize the enums; null for a given slot means that term didn't appear in this reader |
| EnumAndScorer[] enums = new EnumAndScorer[idToTerm.size()]; |
| |
| boolean any = false; |
| for(Map.Entry<Integer,TermStates> ent : termStates.entrySet()) { |
        TermStates ts = ent.getValue();
        assert ts.wasBuiltFor(ReaderUtil.getTopLevelContext(context)) : "The top-reader used to create Weight is not the same as the current reader's top-reader (" + ReaderUtil.getTopLevelContext(context) + ")";
        BytesRef term = idToTerm.get(ent.getKey());
        TermState state = ts.get(context);
| if (state != null) { |
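          // Seek with the cached TermState so we don't pay for a second
          // term-dictionary lookup: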
| TermsEnum termsEnum = context.reader().terms(field).iterator(); |
| termsEnum.seekExact(term, state); |
| enums[ent.getKey()] = new EnumAndScorer(ent.getKey(), termsEnum.postings(null, PostingsEnum.POSITIONS)); |
| any = true; |
| } |
| } |
| |
| if (any) { |
| return new TermAutomatonScorer(this, enums, anyTermID, idToTerm, new LeafSimScorer(stats, context.reader(), field, true)); |
| } else { |
| return null; |
| } |
| } |
| |
| @Override |
| public boolean isCacheable(LeafReaderContext ctx) { |
| return true; |
| } |
| |
| @Override |
| public Explanation explain(LeafReaderContext context, int doc) throws IOException { |
| // TODO |
| return null; |
| } |
| } |
| |
  @Override
  public Query rewrite(IndexReader reader) throws IOException {
| if (Operations.isEmpty(det)) { |
| return new MatchNoDocsQuery(); |
| } |
| |
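    // If the automaton accepts exactly one string and it is a single term, an
    // ordinary TermQuery is equivalent and cheaper: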
| IntsRef single = Operations.getSingleton(det); |
| if (single != null && single.length == 1) { |
| return new TermQuery(new Term(field, idToTerm.get(single.ints[single.offset]))); |
| } |
| |
| // TODO: can PhraseQuery really handle multiple terms at the same position? If so, why do we even have MultiPhraseQuery? |
| |
| // Try for either PhraseQuery or MultiPhraseQuery, which only works when the automaton is a sausage: |
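    // (A "sausage" is a linear chain of positions: every transition leaving a
    // given state must arrive at the same next state.)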
| MultiPhraseQuery.Builder mpq = new MultiPhraseQuery.Builder(); |
| PhraseQuery.Builder pq = new PhraseQuery.Builder(); |
| |
| Transition t = new Transition(); |
| int state = 0; |
| int pos = 0; |
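    // Walk the chain one position at a time, and fall back to the generic path
    // the moment the automaton stops looking like a phrase: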
| query: |
| while (true) { |
| int count = det.initTransition(state, t); |
| if (count == 0) { |
| if (det.isAccept(state) == false) { |
| mpq = null; |
| pq = null; |
| } |
| break; |
| } else if (det.isAccept(state)) { |
| mpq = null; |
| pq = null; |
| break; |
| } |
| int dest = -1; |
| List<Term> terms = new ArrayList<>(); |
| boolean matchesAny = false; |
| for(int i=0;i<count;i++) { |
| det.getNextTransition(t); |
| if (i == 0) { |
| dest = t.dest; |
| } else if (dest != t.dest) { |
| mpq = null; |
| pq = null; |
| break query; |
| } |
| |
| matchesAny |= anyTermID >= t.min && anyTermID <= t.max; |
| |
| if (matchesAny == false) { |
| for(int termID=t.min;termID<=t.max;termID++) { |
| terms.add(new Term(field, idToTerm.get(termID))); |
| } |
| } |
| } |
| if (matchesAny == false) { |
| mpq.add(terms.toArray(new Term[terms.size()]), pos); |
| if (pq != null) { |
| if (terms.size() == 1) { |
| pq.add(terms.get(0), pos); |
| } else { |
| pq = null; |
| } |
| } |
| } |
| state = dest; |
| pos++; |
| } |
| |
| if (pq != null) { |
| return pq.build(); |
| } else if (mpq != null) { |
| return mpq.build(); |
| } |
| |
| // TODO: we could maybe also rewrite to union of PhraseQuery (pull all finite strings) if it's "worth it"? |
| return this; |
| } |
| |
| @Override |
| public void visit(QueryVisitor visitor) { |
| if (visitor.acceptField(field) == false) { |
| return; |
| } |
| QueryVisitor v = visitor.getSubVisitor(BooleanClause.Occur.SHOULD, this); |
    for (BytesRef term : termToID.keySet()) {
      if (term != null) { // skip the null ANY sentinel; it is not a real term
        v.consumeTerms(this, new Term(field, term));
      }
    }
| } |
| } |