/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.solr.highlight; | |
import java.util.ArrayList; | |
import java.util.Arrays; | |
import java.util.regex.Matcher; | |
import java.util.regex.Pattern; | |
import org.apache.lucene.analysis.TokenStream; | |
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; | |
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; | |
import org.apache.lucene.search.highlight.Fragmenter; | |
/** | |
* Fragmenter that tries to produce snippets that "look" like a regular | |
* expression. | |
* | |
* NOTE: the default for <code>maxAnalyzedChars</code> is much lower for this | |
* fragmenter. After this limit is exhausted, fragments are produced in the | |
* same way as <code>GapFragmenter</code> | |
*/ | |
class LuceneRegexFragmenter implements Fragmenter | |
{ | |
// ** defaults | |
public static final int DEFAULT_FRAGMENT_SIZE = 70; | |
public static final int DEFAULT_INCREMENT_GAP = 50; | |
public static final float DEFAULT_SLOP = 0.6f; | |
public static final int DEFAULT_MAX_ANALYZED_CHARS = 10000; | |
// ** settings | |
// desired length of fragments, in characters | |
protected int targetFragChars; | |
// increment gap which indicates a new fragment should occur | |
// (often due to multi-valued fields) | |
protected int incrementGapThreshold; | |
// factor by which we are allowed to bend the frag size (larger or smaller) | |
protected float slop; | |
// analysis limit (ensures we don't waste too much time on long fields) | |
protected int maxAnalyzedChars; | |
// default desirable pattern for text fragments. | |
protected Pattern textRE; | |
// ** state | |
protected int currentNumFrags; | |
protected int currentOffset; | |
protected int targetOffset; | |
protected int[] hotspots; | |
private PositionIncrementAttribute posIncAtt; | |
private OffsetAttribute offsetAtt; | |
// ** other | |
// note: could dynamically change size of sentences extracted to match | |
// target frag size | |
public static final String | |
DEFAULT_PATTERN_RAW = "[-\\w ,\\n\"']{20,200}"; | |
public static final Pattern | |
DEFAULT_PATTERN = Pattern.compile(DEFAULT_PATTERN_RAW); | |
public LuceneRegexFragmenter() { | |
this(DEFAULT_FRAGMENT_SIZE, | |
DEFAULT_INCREMENT_GAP, | |
DEFAULT_SLOP, | |
DEFAULT_MAX_ANALYZED_CHARS); | |
} | |
public LuceneRegexFragmenter(int targetFragChars) { | |
this(targetFragChars, | |
DEFAULT_INCREMENT_GAP, | |
DEFAULT_SLOP, | |
DEFAULT_MAX_ANALYZED_CHARS); | |
} | |
public LuceneRegexFragmenter(int targetFragChars, | |
int incrementGapThreshold, | |
float slop, | |
int maxAnalyzedChars ) { | |
this(targetFragChars, incrementGapThreshold, slop, maxAnalyzedChars, | |
DEFAULT_PATTERN); | |
} | |
public LuceneRegexFragmenter(int targetFragChars, | |
int incrementGapThreshold, | |
float slop, | |
int maxAnalyzedChars, | |
Pattern targetPattern) { | |
this.targetFragChars = targetFragChars; | |
this.incrementGapThreshold = incrementGapThreshold; | |
this.slop = slop; | |
this.maxAnalyzedChars = maxAnalyzedChars; | |
this.textRE = targetPattern; | |
} | |
/* (non-Javadoc) | |
* @see org.apache.lucene.search.highlight.TextFragmenter#start(java.lang.String) | |
*/ | |
@Override | |
public void start(String originalText, TokenStream tokenStream) { | |
currentNumFrags = 1; | |
currentOffset = 0; | |
addHotSpots(originalText); | |
posIncAtt = tokenStream.getAttribute(PositionIncrementAttribute.class); | |
offsetAtt = tokenStream.getAttribute(OffsetAttribute.class); | |
} | |
//////////////////////////////////// | |
// pre-analysis | |
//////////////////////////////////// | |
protected void addHotSpots(String text) { | |
//System.out.println("hot spotting"); | |
ArrayList<Integer> temphs = new ArrayList<>( | |
text.length() / targetFragChars); | |
Matcher match = textRE.matcher(text); | |
int cur = 0; | |
while(match.find() && cur < maxAnalyzedChars) { | |
int start=match.start(), end=match.end(); | |
temphs.add(start); | |
temphs.add(end); | |
cur = end; | |
//System.out.println("Matched " + match.group()); | |
} | |
hotspots = new int[temphs.size()]; | |
for(int i = 0; i < temphs.size(); i++) { | |
hotspots[i] = temphs.get(i); | |
} | |
// perhaps not necessary--I don't know if re matches are non-overlapping | |
Arrays.sort(hotspots); | |
} | |
//////////////////////////////////// | |
// fragmenting | |
//////////////////////////////////// | |
/* (non-Javadoc) | |
* @see org.apache.lucene.search.highlight.TextFragmenter#isNewFragment(org.apache.lucene.analysis.Token) | |
*/ | |
@Override | |
public boolean isNewFragment() | |
{ | |
boolean isNewFrag = false; | |
int minFragLen = (int)((1.0f - slop)*targetFragChars); | |
int endOffset = offsetAtt.endOffset(); | |
// ** determin isNewFrag | |
if(posIncAtt.getPositionIncrement() > incrementGapThreshold) { | |
// large position gaps always imply new fragments | |
isNewFrag = true; | |
} else if(endOffset - currentOffset < minFragLen) { | |
// we're not in our range of flexibility | |
isNewFrag = false; | |
} else if(targetOffset > 0) { | |
// we've already decided on a target | |
isNewFrag = endOffset > targetOffset; | |
} else { | |
// we might be able to do something | |
int minOffset = currentOffset + minFragLen; | |
int maxOffset = (int)(currentOffset + (1.0f + slop)*targetFragChars); | |
int hotIndex; | |
// look for a close hotspot | |
hotIndex = Arrays.binarySearch(hotspots, endOffset); | |
if(hotIndex < 0) hotIndex = -hotIndex; | |
if(hotIndex >= hotspots.length) { | |
// no more hotspots in this input stream | |
targetOffset = currentOffset + targetFragChars; | |
} else if(hotspots[hotIndex] > maxOffset) { | |
// no hotspots within slop | |
targetOffset = currentOffset + targetFragChars; | |
} else { | |
// try to find hotspot in slop | |
int goal = hotspots[hotIndex]; | |
while(goal < minOffset && hotIndex < hotspots.length) { | |
hotIndex++; | |
goal = hotspots[hotIndex]; | |
} | |
targetOffset = goal <= maxOffset ? goal : currentOffset + targetFragChars; | |
} | |
isNewFrag = endOffset > targetOffset; | |
} | |
// ** operate on isNewFrag | |
if(isNewFrag) { | |
currentNumFrags++; | |
currentOffset = endOffset; | |
targetOffset = -1; | |
} | |
return isNewFrag; | |
} | |
} |