/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package org.apache.lucene.analysis.charfilter;

import java.io.IOException;
import java.util.HashMap;
import java.util.Map;
import java.util.TreeMap;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.IntsRefBuilder;
import org.apache.lucene.util.fst.CharSequenceOutputs;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FSTCompiler;
import org.apache.lucene.util.fst.Outputs;
import org.apache.lucene.util.fst.Util;
// TODO: save/load?
/**
* Holds a map of String input to String output, to be used with {@link MappingCharFilter}. Use the
* {@link Builder} to create this.
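*
* <p>A minimal usage sketch (the mappings below are illustrative only, not built into this
* class):
*
* <pre>{@code
* NormalizeCharMap.Builder builder = new NormalizeCharMap.Builder();
* builder.add("\u00DF", "ss");   // replace the single character ß with "ss"
* builder.add("ae", "\u00E6");   // replace the pair "ae" with the single character æ
* NormalizeCharMap norm = builder.build();
* Reader reader = new MappingCharFilter(norm, new StringReader("Stra\u00DFe"));
* }</pre>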
*/
public class NormalizeCharMap {
final FST<CharsRef> map;
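// First-level arcs of the FST, keyed by their UTF-16 label (the first character of a mapping);
// MappingCharFilter consults this map so a non-matching input character is rejected with a
// hash lookup instead of an FST traversal.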
final Map<Character, FST.Arc<CharsRef>> cachedRootArcs = new HashMap<>();
// Use the builder to create:
private NormalizeCharMap(FST<CharsRef> map) {
this.map = map;
if (map != null) {
try {
// Pre-cache root arcs:
final FST.Arc<CharsRef> scratchArc = new FST.Arc<>();
final FST.BytesReader fstReader = map.getBytesReader();
map.getFirstArc(scratchArc);
if (FST.targetHasArcs(scratchArc)) {
map.readFirstRealTargetArc(scratchArc.target(), scratchArc, fstReader);
while (true) {
assert scratchArc.label() != FST.END_LABEL;
cachedRootArcs.put(
Character.valueOf((char) scratchArc.label()),
new FST.Arc<CharsRef>().copyFrom(scratchArc));
if (scratchArc.isLast()) {
break;
}
map.readNextRealArc(scratchArc, fstReader);
}
}
// System.out.println("cached " + cachedRootArcs.size() + " root arcs");
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)
throw new RuntimeException(ioe);
}
}
}
/**
* Builds a NormalizeCharMap.
*
* <p>Call {@link #add} until you have added all the mappings, then call {@link #build} to get a
* NormalizeCharMap.
*
* @lucene.experimental
*/
public static class Builder {
private final Map<String, String> pendingPairs = new TreeMap<>();
/**
* Records a replacement to be applied to the input stream. Whenever <code>match</code>
* occurs in the input, it will be replaced with <code>replacement</code>.
*
* @param match input String to be replaced
* @param replacement output String
* @throws IllegalArgumentException if <code>match</code> is the empty string, or was
* previously added
*/
public void add(String match, String replacement) {
if (match.length() == 0) {
throw new IllegalArgumentException("cannot match the empty string");
}
if (pendingPairs.containsKey(match)) {
throw new IllegalArgumentException("match \"" + match + "\" was already added");
}
pendingPairs.put(match, replacement);
}
/** Builds the NormalizeCharMap; call this once you are done calling {@link #add}. */
public NormalizeCharMap build() {
final FST<CharsRef> map;
try {
final Outputs<CharsRef> outputs = CharSequenceOutputs.getSingleton();
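// INPUT_TYPE.BYTE2: arc labels are UTF-16 code units, matching Util.toUTF16 below and the
// (char) cast applied when the root arcs are cached in the constructor.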
final FSTCompiler<CharsRef> fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE2, outputs);
final IntsRefBuilder scratch = new IntsRefBuilder();
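// pendingPairs is a TreeMap, so the mappings are added in the sorted key order the FST
// compiler requires.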
for (Map.Entry<String, String> ent : pendingPairs.entrySet()) {
fstCompiler.add(Util.toUTF16(ent.getKey(), scratch), new CharsRef(ent.getValue()));
}
map = fstCompiler.compile();
pendingPairs.clear();
} catch (IOException ioe) {
// Bogus FST IOExceptions!! (will never happen)
throw new RuntimeException(ioe);
}
return new NormalizeCharMap(map);
}
}
}