| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.charfilter; |
| |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.util.Map; |
| import org.apache.lucene.analysis.CharFilter; // javadocs |
| import org.apache.lucene.analysis.util.RollingCharBuffer; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.fst.CharSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.Outputs; |
| |
/**
 * Simplistic {@link CharFilter} that applies the mappings contained in a {@link NormalizeCharMap}
 * to the character stream, correcting the resulting changes to the offsets. Matching is greedy
 * (the longest pattern matching at a given point wins). The replacement is allowed to be the
 * empty string.
 */
public class MappingCharFilter extends BaseCharFilter {

  /** Output "arithmetic" (concatenation) for the {@link CharsRef} outputs stored in the FST. */
  private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();

  /** FST mapping input character sequences to replacements; null when there are no mappings. */
  private final FST<CharsRef> map;

  /** Reader over the FST bytes, reused across findTargetArc calls; null when {@code map} is null. */
  private final FST.BytesReader fstReader;

  /** Rolling window over the input, giving lookahead access by absolute input offset. */
  private final RollingCharBuffer buffer = new RollingCharBuffer();

  /** Scratch arc reused by findTargetArc to avoid per-call allocation. */
  private final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();

  /** First-character -> root-arc cache, precomputed by {@link NormalizeCharMap}. */
  private final Map<Character, FST.Arc<CharsRef>> cachedRootArcs;

  // Replacement currently being emitted (null when none is pending), and the index of the
  // next char to return from it:
  private CharsRef replacement;
  private int replacementPointer;
  // Absolute offset (into the original input) of the next character to try matching at:
  private int inputOff;

  /**
   * Default constructor that takes a {@link Reader}.
   *
   * @param normMap the compiled mappings to apply
   * @param in the underlying character stream to filter
   */
  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
    super(in);
    buffer.reset(in);

    // map is null when the NormalizeCharMap contains no mappings; read() guards on the
    // cachedRootArcs lookup, so fstReader is only needed when map is non-null.
    map = normMap.map;
    cachedRootArcs = normMap.cachedRootArcs;

    if (map != null) {
      fstReader = map.getBytesReader();
    } else {
      fstReader = null;
    }
  }

  /**
   * Resets this filter to the beginning of the stream and clears any pending replacement.
   *
   * <p>NOTE(review): this delegates to {@code input.reset()}, so it only works if the wrapped
   * reader supports {@link Reader#reset()} — confirm with callers.
   */
  @Override
  public void reset() throws IOException {
    input.reset();
    buffer.reset(input);
    replacement = null;
    inputOff = 0;
  }

  /**
   * Returns the next output character, or {@code -1} at end of input.
   *
   * <p>Emits any pending replacement chars first; otherwise attempts a greedy (longest) FST match
   * starting at the current input offset, recording offset corrections when the replacement length
   * differs from the matched input length. If no mapping matches, the input char passes through
   * unchanged.
   */
  @Override
  public int read() throws IOException {

    // System.out.println("\nread");
    while (true) {

      // First drain any replacement produced by a previous match:
      if (replacement != null && replacementPointer < replacement.length) {
        // System.out.println("  return repl[" + replacementPointer + "]=" +
        // replacement.chars[replacement.offset + replacementPointer]);
        return replacement.chars[replacement.offset + replacementPointer++];
      }

      // TODO: a more efficient approach would be Aho/Corasick's
      // algorithm
      // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
      // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
      //
      // I think this would be (almost?) equivalent to 1) adding
      // epsilon arcs from all final nodes back to the init
      // node in the FST, 2) adding a .* (skip any char)
      // loop on the initial node, and 3) determinizing
      // that. Then we would not have to restart matching
      // at each position.

      // Longest match found so far starting at inputOff (greedy wins):
      int lastMatchLen = -1;
      CharsRef lastMatch = null;

      final int firstCH = buffer.get(inputOff); // -1 signals end of input
      if (firstCH != -1) {
        // Only chars that begin some mapping have a cached root arc; anything else
        // falls through to the pass-through path below:
        FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
        if (arc != null) {
          if (!FST.targetHasArcs(arc)) {
            // Fast pass for single character match:
            assert arc.isFinal();
            lastMatchLen = 1;
            lastMatch = arc.output();
          } else {
            // Walk the FST one lookahead char at a time, accumulating the output along
            // the path; keep the longest match seen before hitting a dead end or EOF:
            int lookahead = 0;
            CharsRef output = arc.output();
            while (true) {
              lookahead++;

              if (arc.isFinal()) {
                // Match! (to node is final)
                lastMatchLen = lookahead;
                lastMatch = outputs.add(output, arc.nextFinalOutput());
                // Greedy: keep searching to see if there's a
                // longer match...
              }

              if (!FST.targetHasArcs(arc)) {
                break;
              }

              int ch = buffer.get(inputOff + lookahead);
              if (ch == -1) {
                break;
              }
              if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) {
                // Dead end
                break;
              }
              output = outputs.add(output, arc.output());
            }
          }
        }
      }

      if (lastMatch != null) {
        inputOff += lastMatchLen;
        // System.out.println("  match!  len=" + lastMatchLen + " repl=" + lastMatch);

        // diff > 0: replacement shorter than matched input; diff < 0: longer. Either way
        // the output->input offset map must be corrected so offsets point back into the
        // original (pre-mapping) input:
        final int diff = lastMatchLen - lastMatch.length;

        if (diff != 0) {
          final int prevCumulativeDiff = getLastCumulativeDiff();
          if (diff > 0) {
            // Replacement is shorter than matched input:
            addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
          } else {
            // Replacement is longer than matched input: remap
            // the "extra" chars all back to the same input
            // offset:
            final int outputStart = inputOff - prevCumulativeDiff;
            for (int extraIDX = 0; extraIDX < -diff; extraIDX++) {
              addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
            }
          }
        }

        // Emit the replacement chars one at a time on subsequent loop iterations:
        replacement = lastMatch;
        replacementPointer = 0;

      } else {
        // No mapping starts here: pass the input char through unchanged, and let the
        // buffer discard chars before the new inputOff (they can no longer be re-read):
        final int ret = buffer.get(inputOff);
        if (ret != -1) {
          inputOff++;
          buffer.freeBefore(inputOff);
        }
        return ret;
      }
    }
  }

  /**
   * Bulk read implemented on top of {@link #read()}: fills {@code cbuf[off..off+len)} until EOF.
   *
   * @return the number of chars read, or {@code -1} if the stream was already at EOF
   */
  @Override
  public int read(char[] cbuf, int off, int len) throws IOException {
    int numRead = 0;
    for (int i = off; i < off + len; i++) {
      int c = read();
      if (c == -1) break;
      cbuf[i] = (char) c;
      numRead++;
    }

    return numRead == 0 ? -1 : numRead;
  }
}