| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.charfilter; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.Map; |
| import java.util.TreeMap; |
| |
| import org.apache.lucene.util.CharsRef; |
| import org.apache.lucene.util.IntsRefBuilder; |
| import org.apache.lucene.util.fst.CharSequenceOutputs; |
| import org.apache.lucene.util.fst.FST; |
| import org.apache.lucene.util.fst.Outputs; |
| import org.apache.lucene.util.fst.Util; |
| |
| // TODO: save/load? |
| |
| /** |
| * Holds a map of String input to String output, to be used |
| * with {@link MappingCharFilter}. Use the {@link Builder} |
| * to create this. |
| */ |
| public class NormalizeCharMap { |
| |
| final FST<CharsRef> map; |
| final Map<Character,FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>(); |
| |
| // Use the builder to create: |
| private NormalizeCharMap(FST<CharsRef> map) { |
| this.map = map; |
| if (map != null) { |
| try { |
| // Pre-cache root arcs: |
| final FST.Arc<CharsRef> scratchArc = new FST.Arc<>(); |
| final FST.BytesReader fstReader = map.getBytesReader(); |
| map.getFirstArc(scratchArc); |
| if (FST.targetHasArcs(scratchArc)) { |
| map.readFirstRealTargetArc(scratchArc.target(), scratchArc, fstReader); |
| while(true) { |
| assert scratchArc.label() != FST.END_LABEL; |
| cachedRootArcs.put(Character.valueOf((char) scratchArc.label()), new FST.Arc<CharsRef>().copyFrom(scratchArc)); |
| if (scratchArc.isLast()) { |
| break; |
| } |
| map.readNextRealArc(scratchArc, fstReader); |
| } |
| } |
| //System.out.println("cached " + cachedRootArcs.size() + " root arcs"); |
| } catch (IOException ioe) { |
| // Bogus FST IOExceptions!! (will never happen) |
| throw new RuntimeException(ioe); |
| } |
| } |
| } |
| |
| /** |
| * Builds an NormalizeCharMap. |
| * <p> |
| * Call add() until you have added all the mappings, then call build() to get a NormalizeCharMap |
| * @lucene.experimental |
| */ |
| public static class Builder { |
| |
| private final Map<String,String> pendingPairs = new TreeMap<>(); |
| |
| /** Records a replacement to be applied to the input |
| * stream. Whenever <code>singleMatch</code> occurs in |
| * the input, it will be replaced with |
| * <code>replacement</code>. |
| * |
| * @param match input String to be replaced |
| * @param replacement output String |
| * @throws IllegalArgumentException if |
| * <code>match</code> is the empty string, or was |
| * already previously added |
| */ |
| public void add(String match, String replacement) { |
| if (match.length() == 0 ){ |
| throw new IllegalArgumentException("cannot match the empty string"); |
| } |
| if (pendingPairs.containsKey(match)) { |
| throw new IllegalArgumentException("match \"" + match + "\" was already added"); |
| } |
| pendingPairs.put(match, replacement); |
| } |
| |
| /** Builds the NormalizeCharMap; call this once you |
| * are done calling {@link #add}. */ |
| public NormalizeCharMap build() { |
| |
| final FST<CharsRef> map; |
| try { |
| final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton(); |
| final org.apache.lucene.util.fst.Builder<CharsRef> builder = new org.apache.lucene.util.fst.Builder<>(FST.INPUT_TYPE.BYTE2, outputs); |
| final IntsRefBuilder scratch = new IntsRefBuilder(); |
| for(Map.Entry<String,String> ent : pendingPairs.entrySet()) { |
| builder.add(Util.toUTF16(ent.getKey(), scratch), |
| new CharsRef(ent.getValue())); |
| } |
| map = builder.finish(); |
| pendingPairs.clear(); |
| } catch (IOException ioe) { |
| // Bogus FST IOExceptions!! (will never happen) |
| throw new RuntimeException(ioe); |
| } |
| |
| return new NormalizeCharMap(map); |
| } |
| } |
| } |