| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.charfilter; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.Map; |
| import java.util.TreeMap; |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.CharSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.FSTCompiler; |
| import org.apache.lucene.util.fst.Outputs; |
| import org.apache.lucene.util.fst.Util; |
| |
| // TODO: save/load? |
| |
| /** |
| * Holds a map of String input to String output, to be used with {@link MappingCharFilter}. Use the |
| * {@link Builder} to create this. |
| */ |
| public class NormalizeCharMap { |
| |
| final FST<CharsRef> map; |
| final Map<Character, FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>(); |
| |
| // Use the builder to create: |
| private NormalizeCharMap(FST<CharsRef> map) { |
| this.map = map; |
| if (map != null) { |
| try { |
| // Pre-cache root arcs: |
| final FST.Arc<CharsRef> scratchArc = new FST.Arc<>(); |
| final FST.BytesReader fstReader = map.getBytesReader(); |
| map.getFirstArc(scratchArc); |
| if (FST.targetHasArcs(scratchArc)) { |
| map.readFirstRealTargetArc(scratchArc.target(), scratchArc, fstReader); |
| while (true) { |
| assert scratchArc.label() != FST.END_LABEL; |
| cachedRootArcs.put( |
| Character.valueOf((char) scratchArc.label()), |
| new FST.Arc<CharsRef>().copyFrom(scratchArc)); |
| if (scratchArc.isLast()) { |
| break; |
| } |
| map.readNextRealArc(scratchArc, fstReader); |
| } |
| } |
| // System.out.println("cached " + cachedRootArcs.size() + " root arcs"); |
| } catch (IOException ioe) { |
| // Bogus FST IOExceptions!! (will never happen) |
| throw new RuntimeException(ioe); |
| } |
| } |
| } |
| |
| /** |
| * Builds an NormalizeCharMap. |
| * |
| * <p>Call add() until you have added all the mappings, then call build() to get a |
| * NormalizeCharMap |
| * |
| * @lucene.experimental |
| */ |
| public static class Builder { |
| |
| private final Map<String, String> pendingPairs = new TreeMap<>(); |
| |
| /** |
| * Records a replacement to be applied to the input stream. Whenever <code>singleMatch</code> |
| * occurs in the input, it will be replaced with <code>replacement</code>. |
| * |
| * @param match input String to be replaced |
| * @param replacement output String |
| * @throws IllegalArgumentException if <code>match</code> is the empty string, or was already |
| * previously added |
| */ |
| public void add(String match, String replacement) { |
| if (match.length() == 0) { |
| throw new IllegalArgumentException("cannot match the empty string"); |
| } |
| if (pendingPairs.containsKey(match)) { |
| throw new IllegalArgumentException("match \"" + match + "\" was already added"); |
| } |
| pendingPairs.put(match, replacement); |
| } |
| |
| /** Builds the NormalizeCharMap; call this once you are done calling {@link #add}. */ |
| public NormalizeCharMap build() { |
| |
| final FST<CharsRef> map; |
| try { |
| final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); |
| final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs); |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| for (Map.Entry<String, String> ent : pendingPairs.entrySet()) { |
| fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue())); |
| } |
| map = fstCompiler.compile(); |
| pendingPairs.clear(); |
| } catch (IOException ioe) { |
| // Bogus FST IOExceptions!! (will never happen) |
| throw new RuntimeException(ioe); |
| } |
| |
| return new NormalizeCharMap(map); |
| } |
| } |
| } |