| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.search.highlight; |
| |
| import java.io.IOException; |
| import java.util.ArrayList; |
| import java.util.Iterator; |
| import java.util.Objects; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.TokenStream; |
| import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; |
| import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; |
| import org.apache.lucene.util.PriorityQueue; |
| |
/**
 * Marks up highlighted terms found in the best sections of text, using a configurable
 * {@link Fragmenter}, {@link Scorer}, {@link Formatter}, {@link Encoder} and tokenizers.
 *
 * <p>This is Lucene's original Highlighter; other highlighter implementations also exist.
 */
| public class Highlighter |
| { |
| public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024; |
| |
| private Formatter formatter; |
| private Encoder encoder; |
| private Scorer fragmentScorer; |
| private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE; |
| private Fragmenter textFragmenter = new SimpleFragmenter(); |
| |
/**
 * Creates a highlighter that uses a {@link SimpleHTMLFormatter} for markup.
 *
 * @param fragmentScorer scores tokens and fragments; must not be null
 */
public Highlighter(Scorer fragmentScorer) {
  this(new SimpleHTMLFormatter(), fragmentScorer);
}
| |
/**
 * Creates a highlighter that uses a {@link DefaultEncoder} to encode the output text.
 *
 * @param formatter produces the markup around highlighted terms; must not be null
 * @param fragmentScorer scores tokens and fragments; must not be null
 */
public Highlighter(Formatter formatter, Scorer fragmentScorer) {
  this(formatter, new DefaultEncoder(), fragmentScorer);
}
| |
/**
 * Creates a highlighter from explicitly supplied collaborators.
 *
 * @param formatter produces the markup around highlighted terms
 * @param encoder encodes original text before it is written to the output
 * @param fragmentScorer scores tokens and fragments
 * @throws IllegalArgumentException if any argument is null
 */
public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) {
  // All arguments are validated before any field is assigned.
  ensureArgumentNotNull(formatter, "'formatter' must not be null");
  ensureArgumentNotNull(encoder, "'encoder' must not be null");
  ensureArgumentNotNull(fragmentScorer, "'fragmentScorer' must not be null");

  this.formatter = formatter;
  this.encoder = encoder;
  this.fragmentScorer = fragmentScorer;
}
| |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant section. |
| * This is a convenience method that calls |
| * {@link #getBestFragment(TokenStream, String)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| * into chunks |
| * @param text text to highlight terms in |
| * @param fieldName Name of field used to influence analyzer's tokenization policy |
| * |
| * @return highlighted text fragment or null if no terms found |
| * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length |
| */ |
| public final String getBestFragment(Analyzer analyzer, String fieldName,String text) |
| throws IOException, InvalidTokenOffsetsException |
| { |
| TokenStream tokenStream = analyzer.tokenStream(fieldName, text); |
| return getBestFragment(tokenStream, text); |
| } |
| |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant section. |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragment with the highest score |
| * is returned |
| * |
| * @param tokenStream a stream of tokens identified in the text parameter, including offset information. |
| * This is typically produced by an analyzer re-parsing a document's |
| * text. Some work may be done on retrieving TokenStreams more efficiently |
| * by adding support for storing original text position data in the Lucene |
| * index but this support is not currently available (as of Lucene 1.4 rc2). |
| * @param text text to highlight terms in |
| * |
| * @return highlighted text fragment or null if no terms found |
| * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length |
| */ |
| public final String getBestFragment(TokenStream tokenStream, String text) |
| throws IOException, InvalidTokenOffsetsException |
| { |
| String[] results = getBestFragments(tokenStream,text, 1); |
| if (results.length > 0) |
| { |
| return results[0]; |
| } |
| return null; |
| } |
| |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant sections. |
| * This is a convenience method that calls |
| * {@link #getBestFragments(TokenStream, String, int)} |
| * |
| * @param analyzer the analyzer that will be used to split <code>text</code> |
| * into chunks |
| * @param fieldName the name of the field being highlighted (used by analyzer) |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| * |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length |
| */ |
| public final String[] getBestFragments( |
| Analyzer analyzer, |
| String fieldName, |
| String text, |
| int maxNumFragments) |
| throws IOException, InvalidTokenOffsetsException |
| { |
| TokenStream tokenStream = analyzer.tokenStream(fieldName, text); |
| return getBestFragments(tokenStream, text, maxNumFragments); |
| } |
| |
| /** |
| * Highlights chosen terms in a text, extracting the most relevant sections. |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragments with the highest scores |
| * are returned as an array of strings in order of score (contiguous fragments are merged into |
| * one in their original order to improve readability) |
| * |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| * |
| * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) |
| * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length |
| */ |
| public final String[] getBestFragments( |
| TokenStream tokenStream, |
| String text, |
| int maxNumFragments) |
| throws IOException, InvalidTokenOffsetsException |
| { |
| maxNumFragments = Math.max(1, maxNumFragments); //sanity check |
| |
| TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); |
| |
| //Get text |
| ArrayList<String> fragTexts = new ArrayList<>(); |
| for (int i = 0; i < frag.length; i++) |
| { |
| if ((frag[i] != null) && (frag[i].getScore() > 0)) |
| { |
| fragTexts.add(frag[i].toString()); |
| } |
| } |
| return fragTexts.toArray(new String[0]); |
| } |
| |
| |
| /** |
| * Low level api to get the most relevant (formatted) sections of the document. |
| * This method has been made public to allow visibility of score information held in TextFragment objects. |
| * Thanks to Jason Calabrese for help in redefining the interface. |
| * @throws IOException If there is a low-level I/O error |
| * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length |
| */ |
| public final TextFragment[] getBestTextFragments( |
| TokenStream tokenStream, |
| String text, |
| boolean mergeContiguousFragments, |
| int maxNumFragments) |
| throws IOException, InvalidTokenOffsetsException |
| { |
| ArrayList<TextFragment> docFrags = new ArrayList<>(); |
| StringBuilder newText=new StringBuilder(); |
| |
| CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); |
| OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); |
| TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); |
| |
| if (fragmentScorer instanceof QueryScorer) { |
| ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); |
| } |
| |
| TokenStream newStream = fragmentScorer.init(tokenStream); |
| if(newStream != null) { |
| tokenStream = newStream; |
| } |
| fragmentScorer.startFragment(currentFrag); |
| docFrags.add(currentFrag); |
| |
| FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); |
| |
| try |
| { |
| |
| String tokenText; |
| int startOffset; |
| int endOffset; |
| int lastEndOffset = 0; |
| textFragmenter.start(text, tokenStream); |
| |
| TokenGroup tokenGroup=new TokenGroup(tokenStream); |
| |
| tokenStream.reset(); |
| for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); |
| next = tokenStream.incrementToken()) |
| { |
| if( (offsetAtt.endOffset()>text.length()) |
| || |
| (offsetAtt.startOffset()>text.length()) |
| ) |
| { |
| throw new InvalidTokenOffsetsException("Token "+ termAtt.toString() |
| +" exceeds length of provided text sized "+text.length()); |
| } |
| if((tokenGroup.getNumTokens() >0)&&(tokenGroup.isDistinct())) |
| { |
| //the current token is distinct from previous tokens - |
| // markup the cached token group info |
| startOffset = tokenGroup.getStartOffset(); |
| endOffset = tokenGroup.getEndOffset(); |
| tokenText = text.substring(startOffset, endOffset); |
| String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); |
| //store any whitespace etc from between this and last group |
| if (startOffset > lastEndOffset) |
| newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); |
| newText.append(markedUpText); |
| lastEndOffset=Math.max(endOffset, lastEndOffset); |
| tokenGroup.clear(); |
| |
| //check if current token marks the start of a new fragment |
| if(textFragmenter.isNewFragment()) |
| { |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| //record stats for a new fragment |
| currentFrag.textEndPos = newText.length(); |
| currentFrag =new TextFragment(newText, newText.length(), docFrags.size()); |
| fragmentScorer.startFragment(currentFrag); |
| docFrags.add(currentFrag); |
| } |
| } |
| |
| tokenGroup.addToken(fragmentScorer.getTokenScore()); |
| |
| // if(lastEndOffset>maxDocBytesToAnalyze) |
| // { |
| // break; |
| // } |
| } |
| currentFrag.setScore(fragmentScorer.getFragmentScore()); |
| |
| if(tokenGroup.getNumTokens() >0) |
| { |
| //flush the accumulated text (same code as in above loop) |
| startOffset = tokenGroup.getStartOffset(); |
| endOffset = tokenGroup.getEndOffset(); |
| tokenText = text.substring(startOffset, endOffset); |
| String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); |
| //store any whitespace etc from between this and last group |
| if (startOffset > lastEndOffset) |
| newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); |
| newText.append(markedUpText); |
| lastEndOffset=Math.max(lastEndOffset,endOffset); |
| } |
| |
| //Test what remains of the original text beyond the point where we stopped analyzing |
| if ( |
| // if there is text beyond the last token considered.. |
| (lastEndOffset < text.length()) |
| && |
| // and that text is not too large... |
| (text.length()<= maxDocCharsToAnalyze) |
| ) |
| { |
| //append it to the last fragment |
| newText.append(encoder.encodeText(text.substring(lastEndOffset))); |
| } |
| |
| currentFrag.textEndPos = newText.length(); |
| |
| //sort the most relevant sections of the text |
| for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();) |
| { |
| currentFrag = i.next(); |
| |
| //If you are running with a version of Lucene before 11th Sept 03 |
| // you do not have PriorityQueue.insert() - so uncomment the code below |
| /* |
| if (currentFrag.getScore() >= minScore) |
| { |
| fragQueue.put(currentFrag); |
| if (fragQueue.size() > maxNumFragments) |
| { // if hit queue overfull |
| fragQueue.pop(); // remove lowest in hit queue |
| minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore |
| } |
| |
| |
| } |
| */ |
| //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 |
| //fix to PriorityQueue. The correct method to use here is the new "insert" method |
| // USE ABOVE CODE IF THIS DOES NOT COMPILE! |
| fragQueue.insertWithOverflow(currentFrag); |
| } |
| |
| //return the most relevant fragments |
| TextFragment frag[] = new TextFragment[fragQueue.size()]; |
| for (int i = frag.length - 1; i >= 0; i--) |
| { |
| frag[i] = fragQueue.pop(); |
| } |
| |
| //merge any contiguous fragments to improve readability |
| if(mergeContiguousFragments) |
| { |
| mergeContiguousFragments(frag); |
| ArrayList<TextFragment> fragTexts = new ArrayList<>(); |
| for (int i = 0; i < frag.length; i++) |
| { |
| if ((frag[i] != null) && (frag[i].getScore() > 0)) |
| { |
| fragTexts.add(frag[i]); |
| } |
| } |
| frag= fragTexts.toArray(new TextFragment[0]); |
| } |
| |
| return frag; |
| |
| } |
| finally |
| { |
| if (tokenStream != null) |
| { |
| try |
| { |
| tokenStream.end(); |
| tokenStream.close(); |
| } |
| catch (Exception e) |
| { |
| } |
| } |
| } |
| } |
| |
| |
| /** Improves readability of a score-sorted list of TextFragments by merging any fragments |
| * that were contiguous in the original text into one larger fragment with the correct order. |
| * This will leave a "null" in the array entry for the lesser scored fragment. |
| * |
| * @param frag An array of document fragments in descending score |
| */ |
| private void mergeContiguousFragments(TextFragment[] frag) |
| { |
| boolean mergingStillBeingDone; |
| if (frag.length > 1) |
| do |
| { |
| mergingStillBeingDone = false; //initialise loop control flag |
| //for each fragment, scan other frags looking for contiguous blocks |
| for (int i = 0; i < frag.length; i++) |
| { |
| if (frag[i] == null) |
| { |
| continue; |
| } |
| //merge any contiguous blocks |
| for (int x = 0; x < frag.length; x++) |
| { |
| if (frag[x] == null) |
| { |
| continue; |
| } |
| if (frag[i] == null) |
| { |
| break; |
| } |
| TextFragment frag1 = null; |
| TextFragment frag2 = null; |
| int frag1Num = 0; |
| int frag2Num = 0; |
| int bestScoringFragNum; |
| int worstScoringFragNum; |
| //if blocks are contiguous.... |
| if (frag[i].follows(frag[x])) |
| { |
| frag1 = frag[x]; |
| frag1Num = x; |
| frag2 = frag[i]; |
| frag2Num = i; |
| } |
| else |
| if (frag[x].follows(frag[i])) |
| { |
| frag1 = frag[i]; |
| frag1Num = i; |
| frag2 = frag[x]; |
| frag2Num = x; |
| } |
| //merging required.. |
| if (frag1 != null) |
| { |
| if (frag1.getScore() > frag2.getScore()) |
| { |
| bestScoringFragNum = frag1Num; |
| worstScoringFragNum = frag2Num; |
| } |
| else |
| { |
| bestScoringFragNum = frag2Num; |
| worstScoringFragNum = frag1Num; |
| } |
| frag1.merge(frag2); |
| frag[worstScoringFragNum] = null; |
| mergingStillBeingDone = true; |
| frag[bestScoringFragNum] = frag1; |
| } |
| } |
| } |
| } |
| while (mergingStillBeingDone); |
| } |
| |
| |
| /** |
| * Highlights terms in the text , extracting the most relevant sections |
| * and concatenating the chosen fragments with a separator (typically "..."). |
| * The document text is analysed in chunks to record hit statistics |
| * across the document. After accumulating stats, the fragments with the highest scores |
| * are returned in order as "separator" delimited strings. |
| * |
| * @param text text to highlight terms in |
| * @param maxNumFragments the maximum number of fragments. |
| * @param separator the separator used to intersperse the document fragments (typically "...") |
| * |
| * @return highlighted text |
| * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length |
| */ |
| public final String getBestFragments( |
| TokenStream tokenStream, |
| String text, |
| int maxNumFragments, |
| String separator) |
| throws IOException, InvalidTokenOffsetsException |
| { |
| String sections[] = getBestFragments(tokenStream,text, maxNumFragments); |
| StringBuilder result = new StringBuilder(); |
| for (int i = 0; i < sections.length; i++) |
| { |
| if (i > 0) |
| { |
| result.append(separator); |
| } |
| result.append(sections[i]); |
| } |
| return result.toString(); |
| } |
| |
| public int getMaxDocCharsToAnalyze() { |
| return maxDocCharsToAnalyze; |
| } |
| |
| public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { |
| this.maxDocCharsToAnalyze = maxDocCharsToAnalyze; |
| } |
| |
| public Fragmenter getTextFragmenter() |
| { |
| return textFragmenter; |
| } |
| |
| public void setTextFragmenter(Fragmenter fragmenter) |
| { |
| textFragmenter = Objects.requireNonNull(fragmenter); |
| } |
| |
| /** |
| * @return Object used to score each text fragment |
| */ |
| public Scorer getFragmentScorer() |
| { |
| return fragmentScorer; |
| } |
| |
| public void setFragmentScorer(Scorer scorer) |
| { |
| fragmentScorer = Objects.requireNonNull(scorer); |
| } |
| |
| public Encoder getEncoder() { |
| return encoder; |
| } |
| |
| public void setEncoder(Encoder encoder) { |
| this.encoder = Objects.requireNonNull(encoder); |
| } |
| |
| /** |
| * Throws an IllegalArgumentException with the provided message if 'argument' is null. |
| * |
| * @param argument the argument to be null-checked |
| * @param message the message of the exception thrown if argument == null |
| */ |
| private static void ensureArgumentNotNull(Object argument, String message) { |
| if (argument == null) { |
| throw new IllegalArgumentException(message); |
| } |
| } |
| |
| static class FragmentQueue extends PriorityQueue<TextFragment> |
| { |
| FragmentQueue(int size) |
| { |
| super(size); |
| } |
| |
| @Override |
| public final boolean lessThan(TextFragment fragA, TextFragment fragB) |
| { |
| if (fragA.getScore() == fragB.getScore()) |
| return fragA.fragNum > fragB.fragNum; |
| else |
| return fragA.getScore() < fragB.getScore(); |
| } |
| } |
| } |