| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.pattern; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.StringReader; |
| import java.util.regex.Matcher; |
| import java.util.regex.Pattern; |
| |
| import org.apache.lucene.analysis.charfilter.BaseCharFilter; |
| |
| /** |
| * CharFilter that uses a regular expression for the target of replace string. |
| * The pattern match will be done in each "block" in char stream. |
| * |
| * <p> |
| * ex1) source="aa bb aa bb", pattern="(aa)\\s+(bb)" replacement="$1#$2"<br> |
| * output="aa#bb aa#bb" |
| * </p> |
| * |
| * NOTE: If you produce a phrase that has different length to source string |
| * and the field is used for highlighting for a term of the phrase, you will |
| * face a trouble. |
| * |
| * <p> |
| * ex2) source="aa123bb", pattern="(aa)\\d+(bb)" replacement="$1 $2"<br> |
| * output="aa bb"<br> |
| * and you want to search bb and highlight it, you will get<br> |
| * highlight snippet="aa1<em>23bb</em>" |
| * </p> |
| * |
| * @since Solr 1.5 |
| */ |
| public class PatternReplaceCharFilter extends BaseCharFilter { |
| |
| private final Pattern pattern; |
| private final String replacement; |
| private Reader transformedInput; |
| |
| public PatternReplaceCharFilter(Pattern pattern, String replacement, Reader in) { |
| super(in); |
| this.pattern = pattern; |
| this.replacement = replacement; |
| } |
| |
| @Override |
| public int read(char[] cbuf, int off, int len) throws IOException { |
| // Buffer all input on the first call. |
| if (transformedInput == null) { |
| fill(); |
| } |
| |
| return transformedInput.read(cbuf, off, len); |
| } |
| |
| private void fill() throws IOException { |
| StringBuilder buffered = new StringBuilder(); |
| char [] temp = new char [1024]; |
| for (int cnt = input.read(temp); cnt > 0; cnt = input.read(temp)) { |
| buffered.append(temp, 0, cnt); |
| } |
| transformedInput = new StringReader(processPattern(buffered).toString()); |
| } |
| |
| @Override |
| public int read() throws IOException { |
| if (transformedInput == null) { |
| fill(); |
| } |
| |
| return transformedInput.read(); |
| } |
| |
| @Override |
| protected int correct(int currentOff) { |
| return Math.max(0, super.correct(currentOff)); |
| } |
| |
| /** |
| * Replace pattern in input and mark correction offsets. |
| */ |
| CharSequence processPattern(CharSequence input) { |
| final Matcher m = pattern.matcher(input); |
| |
| final StringBuffer cumulativeOutput = new StringBuffer(); |
| int cumulative = 0; |
| int lastMatchEnd = 0; |
| while (m.find()) { |
| final int groupSize = m.end() - m.start(); |
| final int skippedSize = m.start() - lastMatchEnd; |
| lastMatchEnd = m.end(); |
| |
| final int lengthBeforeReplacement = cumulativeOutput.length() + skippedSize; |
| m.appendReplacement(cumulativeOutput, replacement); |
| // Matcher doesn't tell us how many characters have been appended before the replacement. |
| // So we need to calculate it. Skipped characters have been added as part of appendReplacement. |
| final int replacementSize = cumulativeOutput.length() - lengthBeforeReplacement; |
| |
| if (groupSize != replacementSize) { |
| if (replacementSize < groupSize) { |
| // The replacement is smaller. |
| // Add the 'backskip' to the next index after the replacement (this is possibly |
| // after the end of string, but it's fine -- it just means the last character |
| // of the replaced block doesn't reach the end of the original string. |
| cumulative += groupSize - replacementSize; |
| int atIndex = lengthBeforeReplacement + replacementSize; |
| // System.err.println(atIndex + "!" + cumulative); |
| addOffCorrectMap(atIndex, cumulative); |
| } else { |
| // The replacement is larger. Every new index needs to point to the last |
| // element of the original group (if any). |
| for (int i = groupSize; i < replacementSize; i++) { |
| addOffCorrectMap(lengthBeforeReplacement + i, --cumulative); |
| // System.err.println((lengthBeforeReplacement + i) + " " + cumulative); |
| } |
| } |
| } |
| } |
| |
| // Append the remaining output, no further changes to indices. |
| m.appendTail(cumulativeOutput); |
| return cumulativeOutput; |
| } |
| } |