/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
| package org.apache.lucene.search.highlight; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.CachingTokenFilter; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; |
| import org.apache.lucene.index.IndexReader; |
| import org.apache.lucene.index.memory.MemoryIndex; |
| import org.apache.lucene.search.Query; |
| import org.apache.lucene.search.spans.SpanQuery; |
| |
| /** |
| * {@link Scorer} implementation which scores text fragments by the number of |
| * unique query terms found. This class converts appropriate {@link Query}s to |
| * {@link SpanQuery}s and attempts to score only those terms that participated in |
| * generating the 'hit' on the document. |
| */ |
| public class QueryScorer implements Scorer { |
| private float totalScore; |
| private Set<String> foundTerms; |
| private Map<String,WeightedSpanTerm> fieldWeightedSpanTerms; |
| private float maxTermWeight; |
| private int position = -1; |
| private String defaultField; |
| private CharTermAttribute termAtt; |
| private PositionIncrementAttribute posIncAtt; |
| private boolean expandMultiTermQuery = true; |
| private Query query; |
| private String field; |
| private IndexReader reader; |
| private boolean skipInitExtractor; |
| private boolean wrapToCaching = true; |
| private int maxCharsToAnalyze; |
| private boolean usePayloads = false; |
| |
| /** |
| * @param query Query to use for highlighting |
| */ |
| public QueryScorer(Query query) { |
| init(query, null, null, true); |
| } |
| |
| /** |
| * @param query Query to use for highlighting |
| * @param field Field to highlight - pass null to ignore fields |
| */ |
| public QueryScorer(Query query, String field) { |
| init(query, field, null, true); |
| } |
| |
| /** |
| * @param query Query to use for highlighting |
| * @param field Field to highlight - pass null to ignore fields |
| * @param reader {@link IndexReader} to use for quasi tf/idf scoring |
| */ |
| public QueryScorer(Query query, IndexReader reader, String field) { |
| init(query, field, reader, true); |
| } |
| |
| |
| /** |
| * @param query to use for highlighting |
| * @param reader {@link IndexReader} to use for quasi tf/idf scoring |
| * @param field to highlight - pass null to ignore fields |
| */ |
| public QueryScorer(Query query, IndexReader reader, String field, String defaultField) { |
| this.defaultField = defaultField; |
| init(query, field, reader, true); |
| } |
| |
| /** |
| * @param defaultField - The default field for queries with the field name unspecified |
| */ |
| public QueryScorer(Query query, String field, String defaultField) { |
| this.defaultField = defaultField; |
| init(query, field, null, true); |
| } |
| |
| /** |
| * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s |
| */ |
| public QueryScorer(WeightedSpanTerm[] weightedTerms) { |
| this.fieldWeightedSpanTerms = new HashMap<>(weightedTerms.length); |
| |
| for (int i = 0; i < weightedTerms.length; i++) { |
| WeightedSpanTerm existingTerm = fieldWeightedSpanTerms.get(weightedTerms[i].term); |
| |
| if ((existingTerm == null) || |
| (existingTerm.weight < weightedTerms[i].weight)) { |
| // if a term is defined more than once, always use the highest |
| // scoring weight |
| fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]); |
| maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); |
| } |
| } |
| skipInitExtractor = true; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() |
| */ |
| @Override |
| public float getFragmentScore() { |
| return totalScore; |
| } |
| |
| /** |
| * |
| * @return The highest weighted term (useful for passing to |
| * GradientFormatter to set top end of coloring scale). |
| */ |
| public float getMaxTermWeight() { |
| return maxTermWeight; |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token, |
| * int) |
| */ |
| @Override |
| public float getTokenScore() { |
| position += posIncAtt.getPositionIncrement(); |
| String termText = termAtt.toString(); |
| |
| WeightedSpanTerm weightedSpanTerm; |
| |
| if ((weightedSpanTerm = fieldWeightedSpanTerms.get( |
| termText)) == null) { |
| return 0; |
| } |
| |
| if (weightedSpanTerm.positionSensitive && |
| !weightedSpanTerm.checkPosition(position)) { |
| return 0; |
| } |
| |
| float score = weightedSpanTerm.getWeight(); |
| |
| // found a query term - is it unique in this doc? |
| if (!foundTerms.contains(termText)) { |
| totalScore += score; |
| foundTerms.add(termText); |
| } |
| |
| return score; |
| } |
| |
| /* (non-Javadoc) |
| * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) |
| */ |
| @Override |
| public TokenStream init(TokenStream tokenStream) throws IOException { |
| position = -1; |
| termAtt = tokenStream.addAttribute(CharTermAttribute.class); |
| posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class); |
| if(!skipInitExtractor) { |
| if(fieldWeightedSpanTerms != null) { |
| fieldWeightedSpanTerms.clear(); |
| } |
| return initExtractor(tokenStream); |
| } |
| return null; |
| } |
| |
| /** |
| * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing |
| * Span information to a {@link Fragmenter}. |
| * |
| * @param token to get {@link WeightedSpanTerm} for |
| * @return WeightedSpanTerm for token |
| */ |
| public WeightedSpanTerm getWeightedSpanTerm(String token) { |
| return fieldWeightedSpanTerms.get(token); |
| } |
| |
| /** |
| */ |
| private void init(Query query, String field, IndexReader reader, boolean expandMultiTermQuery) { |
| this.reader = reader; |
| this.expandMultiTermQuery = expandMultiTermQuery; |
| this.query = query; |
| this.field = field; |
| } |
| |
| private TokenStream initExtractor(TokenStream tokenStream) throws IOException { |
| WeightedSpanTermExtractor qse = newTermExtractor(defaultField); |
| qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze); |
| qse.setExpandMultiTermQuery(expandMultiTermQuery); |
| qse.setWrapIfNotCachingTokenFilter(wrapToCaching); |
| qse.setUsePayloads(usePayloads); |
| if (reader == null) { |
| this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query, 1f, |
| tokenStream, field); |
| } else { |
| this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query, 1f, |
| tokenStream, field, reader); |
| } |
| if(qse.isCachedTokenStream()) { |
| return qse.getTokenStream(); |
| } |
| |
| return null; |
| } |
| |
| protected WeightedSpanTermExtractor newTermExtractor(String defaultField) { |
| return new WeightedSpanTermExtractor(defaultField); |
| } |
| |
| /* |
| * (non-Javadoc) |
| * |
| * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment) |
| */ |
| @Override |
| public void startFragment(TextFragment newFragment) { |
| foundTerms = new HashSet<>(); |
| totalScore = 0; |
| } |
| |
| /** |
| * @return true if multi-term queries should be expanded |
| */ |
| public boolean isExpandMultiTermQuery() { |
| return expandMultiTermQuery; |
| } |
| |
| /** |
| * Controls whether or not multi-term queries are expanded |
| * against a {@link MemoryIndex} {@link IndexReader}. |
| * |
| * @param expandMultiTermQuery true if multi-term queries should be expanded |
| */ |
| public void setExpandMultiTermQuery(boolean expandMultiTermQuery) { |
| this.expandMultiTermQuery = expandMultiTermQuery; |
| } |
| |
| /** |
| * Whether or not we should capture payloads in {@link MemoryIndex} at each position so that queries can access them. |
| * This does not apply to term vector based TokenStreams, which support payloads only when the term vector has them. |
| */ |
| public boolean isUsePayloads() { |
| return usePayloads; |
| } |
| |
| public void setUsePayloads(boolean usePayloads) { |
| this.usePayloads = usePayloads; |
| } |
| |
| /** |
| * By default, {@link TokenStream}s that are not of the type |
| * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to |
| * ensure an efficient reset - if you are already using a different caching |
| * {@link TokenStream} impl and you don't want it to be wrapped, set this to |
| * false. Note that term-vector based tokenstreams are detected and won't be |
| * wrapped either. |
| */ |
| public void setWrapIfNotCachingTokenFilter(boolean wrap) { |
| this.wrapToCaching = wrap; |
| } |
| |
| public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { |
| this.maxCharsToAnalyze = maxDocCharsToAnalyze; |
| } |
| } |