lucene/analysis/common/src/java/org/apache/lucene/analysis/charfilter/MappingCharFilter.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.analysis.charfilter;

 import java.io.IOException;
 import java.io.Reader;
 import java.util.Map;

 import org.apache.lucene.analysis.CharFilter; // javadocs
 import org.apache.lucene.analysis.util.RollingCharBuffer;
 import org.apache.lucene.util.CharsRef;
 import org.apache.lucene.util.fst.CharSequenceOutputs;
 import org.apache.lucene.util.fst.FST;
 import org.apache.lucene.util.fst.Outputs;

 /**
  * Simplistic {@link CharFilter} that applies the mappings
  * contained in a {@link NormalizeCharMap} to the character
  * stream, and corrects the resulting changes to the
  * offsets.  Matching is greedy (longest pattern matching at
  * a given point wins).  Replacement is allowed to be the
  * empty string.
  */

 public class MappingCharFilter extends BaseCharFilter {

   /** Used to concatenate the partial outputs collected while walking the FST. */
   private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();

   /** Mapping automaton taken from the {@link NormalizeCharMap}; the constructor tolerates null. */
   private final FST<CharsRef> map;

   /** Reader over {@link #map}, or null when {@link #map} is null. */
   private final FST.BytesReader fstReader;

   /** Look-ahead window over the wrapped reader, addressed by absolute input offset. */
   private final RollingCharBuffer buffer = new RollingCharBuffer();

   /** Scratch arc handed to {@code findTargetArc} while following a match. */
   private final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();

   /** Arcs leaving the FST root, keyed by their first character. */
   private final Map<Character,FST.Arc<CharsRef>> cachedRootArcs;

   /** Replacement currently being emitted, or null when none is pending. */
   private CharsRef replacement;

   /** Index of the next character of {@link #replacement} to return. */
   private int replacementPointer;

   /** Offset into the input of the next character that has not been consumed yet. */
   private int inputOff;

   /**
    * Creates a filter that applies the mappings of {@code normMap} to {@code in}.
    *
    * @param normMap source of the mapping FST and its cached root arcs
    * @param in the reader whose characters are mapped
    */
   public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
     super(in);
     buffer.reset(in);

     map = normMap.map;
     cachedRootArcs = normMap.cachedRootArcs;

     // A null map is accepted; in that case there is nothing to read from.
     fstReader = (map != null) ? map.getBytesReader() : null;
   }

   /**
    * Resets the wrapped reader and rewinds this filter's own state: the look-ahead
    * buffer is re-attached to the input, any pending replacement is dropped and the
    * input offset goes back to zero.
    *
    * @throws IOException if resetting the wrapped reader fails
    */
   @Override
   public void reset() throws IOException {
     input.reset();
     buffer.reset(input);
     replacement = null;
     replacementPointer = 0;
     inputOff = 0;
   }

   /**
    * Returns the next character of the mapped stream.
    *
    * <p>Characters of a pending replacement are returned first. Otherwise the longest
    * mapping that matches at the current input offset is looked up; when one is found
    * its replacement becomes pending and the offset corrections are recorded, and when
    * none is found the input character is passed through unchanged.
    *
    * @return the next character, or -1 when the buffer reports the end of the input
    * @throws IOException if reading from the wrapped reader fails
    */
   @Override
   public int read() throws IOException {

     while (true) {

       // Drain the pending replacement before looking at more input.
       if (replacement != null && replacementPointer < replacement.length) {
         return replacement.chars[replacement.offset + replacementPointer++];
       }

       // TODO: a more efficient approach would be Aho/Corasick's
       // algorithm
       // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
       // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
       //
       // I think this would be (almost?) equivalent to 1) adding
       // epsilon arcs from all final nodes back to the init
       // node in the FST, 2) adding a .* (skip any char)
       // loop on the initial node, and 3) determinizing
       // that.  Then we would not have to restart matching
       // at each position.

       int lastMatchLen = -1;
       CharsRef lastMatch = null;

       final int firstCH = buffer.get(inputOff);
       if (firstCH != -1) {
         FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
         if (arc != null) {
           if (!FST.targetHasArcs(arc)) {
             // Fast pass for single character match:
             assert arc.isFinal();
             lastMatchLen = 1;
             lastMatch = arc.output();
           } else {
             int lookahead = 0;
             CharsRef output = arc.output();
             while (true) {
               lookahead++;

               if (arc.isFinal()) {
                 // Match! (to node is final)
                 lastMatchLen = lookahead;
                 lastMatch = outputs.add(output, arc.nextFinalOutput());
                 // Greedy: keep searching to see if there's a
                 // longer match...
               }

               if (!FST.targetHasArcs(arc)) {
                 break;
               }

               int ch = buffer.get(inputOff + lookahead);
               if (ch == -1) {
                 // Input ended in the middle of a candidate; keep the best match so far.
                 break;
               }
               if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) {
                 // Dead end
                 break;
               }
               output = outputs.add(output, arc.output());
             }
           }
         }
       }

       if (lastMatch != null) {
         // Consume the matched input; its replacement is emitted on the next iteration.
         inputOff += lastMatchLen;

         final int diff = lastMatchLen - lastMatch.length;

         if (diff != 0) {
           final int prevCumulativeDiff = getLastCumulativeDiff();
           if (diff > 0) {
             // Replacement is shorter than matched input:
             addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
           } else {
             // Replacement is longer than matched input: remap
             // the "extra" chars all back to the same input
             // offset:
             final int outputStart = inputOff - prevCumulativeDiff;
             for (int extraIDX = 0; extraIDX < -diff; extraIDX++) {
               addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
             }
           }
         }

         replacement = lastMatch;
         replacementPointer = 0;

       } else {
         // No mapping starts here: pass the input character (or -1) straight through.
         final int ret = buffer.get(inputOff);
         if (ret != -1) {
           inputOff++;
           buffer.freeBefore(inputOff);
         }
         return ret;
       }
     }
   }

   /**
    * Fills {@code cbuf[off .. off+len)} with mapped characters by calling {@link #read()}
    * repeatedly.
    *
    * @param cbuf destination array
    * @param off index in {@code cbuf} of the first character to write
    * @param len maximum number of characters to read
    * @return the number of characters stored, 0 when {@code len} is 0, or -1 when no
    *     character could be read because the end of the stream was reached
    * @throws IOException if reading from the wrapped reader fails
    */
   @Override
   public int read(char[] cbuf, int off, int len) throws IOException {
     if (len == 0) {
       // A zero-length request reads nothing; it must not be reported as end of stream.
       return 0;
     }

     int numRead = 0;
     for (int i = off; i < off + len; i++) {
       int c = read();
       if (c == -1) {
         break;
       }
       cbuf[i] = (char) c;
       numRead++;
     }

     return numRead == 0 ? -1 : numRead;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.analysis.charfilter;

	import java.io.IOException;
	import java.io.Reader;
	import java.util.Map;

	import org.apache.lucene.analysis.CharFilter; // javadocs
	import org.apache.lucene.analysis.util.RollingCharBuffer;
	import org.apache.lucene.util.CharsRef;
	import org.apache.lucene.util.fst.CharSequenceOutputs;
	import org.apache.lucene.util.fst.FST;
	import org.apache.lucene.util.fst.Outputs;

	/**
	* Simplistic {@link CharFilter} that applies the mappings
	* contained in a {@link NormalizeCharMap} to the character
	* stream, and corrects the resulting changes to the
	* offsets. Matching is greedy (longest pattern matching at
	* a given point wins). Replacement is allowed to be the
	* empty string.
	*/

	public class MappingCharFilter extends BaseCharFilter {

	  /** Used to concatenate the partial outputs collected while walking the FST. */
	  private final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();

	  /** Mapping automaton taken from the {@link NormalizeCharMap}; the constructor tolerates null. */
	  private final FST<CharsRef> map;

	  /** Reader over {@link #map}, or null when {@link #map} is null. */
	  private final FST.BytesReader fstReader;

	  /** Look-ahead window over the wrapped reader, addressed by absolute input offset. */
	  private final RollingCharBuffer buffer = new RollingCharBuffer();

	  /** Scratch arc handed to {@code findTargetArc} while following a match. */
	  private final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();

	  /** Arcs leaving the FST root, keyed by their first character. */
	  private final Map<Character,FST.Arc<CharsRef>> cachedRootArcs;

	  /** Replacement currently being emitted, or null when none is pending. */
	  private CharsRef replacement;

	  /** Index of the next character of {@link #replacement} to return. */
	  private int replacementPointer;

	  /** Offset into the input of the next character that has not been consumed yet. */
	  private int inputOff;

	  /**
	   * Creates a filter that applies the mappings of {@code normMap} to {@code in}.
	   *
	   * @param normMap source of the mapping FST and its cached root arcs
	   * @param in the reader whose characters are mapped
	   */
	  public MappingCharFilter(NormalizeCharMap normMap, Reader in) {
	    super(in);
	    buffer.reset(in);

	    map = normMap.map;
	    cachedRootArcs = normMap.cachedRootArcs;

	    // A null map is accepted; in that case there is nothing to read from.
	    fstReader = (map != null) ? map.getBytesReader() : null;
	  }

	  /**
	   * Resets the wrapped reader and rewinds this filter's own state: the look-ahead
	   * buffer is re-attached to the input, any pending replacement is dropped and the
	   * input offset goes back to zero.
	   *
	   * @throws IOException if resetting the wrapped reader fails
	   */
	  @Override
	  public void reset() throws IOException {
	    input.reset();
	    buffer.reset(input);
	    replacement = null;
	    replacementPointer = 0;
	    inputOff = 0;
	  }

	  /**
	   * Returns the next character of the mapped stream.
	   *
	   * <p>Characters of a pending replacement are returned first. Otherwise the longest
	   * mapping that matches at the current input offset is looked up; when one is found
	   * its replacement becomes pending and the offset corrections are recorded, and when
	   * none is found the input character is passed through unchanged.
	   *
	   * @return the next character, or -1 when the buffer reports the end of the input
	   * @throws IOException if reading from the wrapped reader fails
	   */
	  @Override
	  public int read() throws IOException {

	    while (true) {

	      // Drain the pending replacement before looking at more input.
	      if (replacement != null && replacementPointer < replacement.length) {
	        return replacement.chars[replacement.offset + replacementPointer++];
	      }

	      // TODO: a more efficient approach would be Aho/Corasick's
	      // algorithm
	      // (http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm)
	      // or this generalization: www.cis.uni-muenchen.de/people/Schulz/Pub/dictle5.ps
	      //
	      // I think this would be (almost?) equivalent to 1) adding
	      // epsilon arcs from all final nodes back to the init
	      // node in the FST, 2) adding a .* (skip any char)
	      // loop on the initial node, and 3) determinizing
	      // that. Then we would not have to restart matching
	      // at each position.

	      int lastMatchLen = -1;
	      CharsRef lastMatch = null;

	      final int firstCH = buffer.get(inputOff);
	      if (firstCH != -1) {
	        FST.Arc<CharsRef> arc = cachedRootArcs.get(Character.valueOf((char) firstCH));
	        if (arc != null) {
	          if (!FST.targetHasArcs(arc)) {
	            // Fast pass for single character match:
	            assert arc.isFinal();
	            lastMatchLen = 1;
	            lastMatch = arc.output();
	          } else {
	            int lookahead = 0;
	            CharsRef output = arc.output();
	            while (true) {
	              lookahead++;

	              if (arc.isFinal()) {
	                // Match! (to node is final)
	                lastMatchLen = lookahead;
	                lastMatch = outputs.add(output, arc.nextFinalOutput());
	                // Greedy: keep searching to see if there's a
	                // longer match...
	              }

	              if (!FST.targetHasArcs(arc)) {
	                break;
	              }

	              int ch = buffer.get(inputOff + lookahead);
	              if (ch == -1) {
	                // Input ended in the middle of a candidate; keep the best match so far.
	                break;
	              }
	              if ((arc = map.findTargetArc(ch, arc, scratchArc, fstReader)) == null) {
	                // Dead end
	                break;
	              }
	              output = outputs.add(output, arc.output());
	            }
	          }
	        }
	      }

	      if (lastMatch != null) {
	        // Consume the matched input; its replacement is emitted on the next iteration.
	        inputOff += lastMatchLen;

	        final int diff = lastMatchLen - lastMatch.length;

	        if (diff != 0) {
	          final int prevCumulativeDiff = getLastCumulativeDiff();
	          if (diff > 0) {
	            // Replacement is shorter than matched input:
	            addOffCorrectMap(inputOff - diff - prevCumulativeDiff, prevCumulativeDiff + diff);
	          } else {
	            // Replacement is longer than matched input: remap
	            // the "extra" chars all back to the same input
	            // offset:
	            final int outputStart = inputOff - prevCumulativeDiff;
	            for (int extraIDX = 0; extraIDX < -diff; extraIDX++) {
	              addOffCorrectMap(outputStart + extraIDX, prevCumulativeDiff - extraIDX - 1);
	            }
	          }
	        }

	        replacement = lastMatch;
	        replacementPointer = 0;

	      } else {
	        // No mapping starts here: pass the input character (or -1) straight through.
	        final int ret = buffer.get(inputOff);
	        if (ret != -1) {
	          inputOff++;
	          buffer.freeBefore(inputOff);
	        }
	        return ret;
	      }
	    }
	  }

	  /**
	   * Fills {@code cbuf[off .. off+len)} with mapped characters by calling {@link #read()}
	   * repeatedly.
	   *
	   * @param cbuf destination array
	   * @param off index in {@code cbuf} of the first character to write
	   * @param len maximum number of characters to read
	   * @return the number of characters stored, 0 when {@code len} is 0, or -1 when no
	   *     character could be read because the end of the stream was reached
	   * @throws IOException if reading from the wrapped reader fails
	   */
	  @Override
	  public int read(char[] cbuf, int off, int len) throws IOException {
	    if (len == 0) {
	      // A zero-length request reads nothing; it must not be reported as end of stream.
	      return 0;
	    }

	    int numRead = 0;
	    for (int i = off; i < off + len; i++) {
	      int c = read();
	      if (c == -1) {
	        break;
	      }
	      cbuf[i] = (char) c;
	      numRead++;
	    }

	    return numRead == 0 ? -1 : numRead;
	  }
	}