blob: e3963d21e56c9b7aac346fc5261944564e51c02c [file] [log] [blame]
using Lucene.Net.Diagnostics;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
using System.IO;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.CharFilters
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// TODO: save/load?
/// <summary>
/// Holds a map of <see cref="string"/> input to <see cref="string"/> output, to be used
/// with <see cref="MappingCharFilter"/>. Use the <see cref="Builder"/>
/// to create this.
/// </summary>
public class NormalizeCharMap
{
    /// <summary>
    /// The FST holding every mapping, exactly as handed to the constructor.
    /// The constructor tolerates a <c>null</c> value here.
    /// </summary>
    internal readonly FST<CharsRef> map;

    /// <summary>
    /// Copies of the arcs leaving the FST root, keyed by the arc label cast to <see cref="char"/>.
    /// Stays empty when <see cref="map"/> is <c>null</c> or its root has no outgoing arcs.
    /// </summary>
    internal readonly IDictionary<char?, FST.Arc<CharsRef>> cachedRootArcs = new Dictionary<char?, FST.Arc<CharsRef>>();

    /// <summary>
    /// Wraps <paramref name="map"/> and pre-caches its root arcs.
    /// Private: instances are created through <see cref="Builder.Build()"/>.
    /// </summary>
    /// <param name="map"> the FST to wrap; when <c>null</c> no arcs are cached </param>
    private NormalizeCharMap(FST<CharsRef> map)
    {
        this.map = map;
        if (map != null)
        {
            try
            {
                // Pre-cache root arcs:
                var scratchArc = new FST.Arc<CharsRef>();
                FST.BytesReader fstReader = map.GetBytesReader();
                map.GetFirstArc(scratchArc);
                if (FST<CharsRef>.TargetHasArcs(scratchArc))
                {
                    map.ReadFirstRealTargetArc(scratchArc.Target, scratchArc, fstReader);
                    while (true)
                    {
                        if (Debugging.AssertsEnabled) Debugging.Assert(scratchArc.Label != FST.END_LABEL);

                        // scratchArc is overwritten on every iteration, so cache a copy of it.
                        cachedRootArcs[(char)scratchArc.Label] = new FST.Arc<CharsRef>().CopyFrom(scratchArc);
                        if (scratchArc.IsLast)
                        {
                            break;
                        }
                        map.ReadNextRealArc(scratchArc, fstReader);
                    }
                }
                //System.out.println("cached " + cachedRootArcs.size() + " root arcs");
            }
            catch (IOException ioe)
            {
                // Bogus FST IOExceptions!! (will never happen)
                throw new InvalidOperationException("Should never happen", ioe);
            }
        }
    }

    /// <summary>
    /// Builds a <see cref="NormalizeCharMap"/>.
    /// <para>
    /// Call <see cref="Add(string, string)"/> until you have added all the mappings,
    /// then call <see cref="Build()"/> to get a <see cref="NormalizeCharMap"/>.
    /// @lucene.experimental
    /// </para>
    /// </summary>
    public class Builder
    {
        // LUCENENET specific - we need to use StringComparer.Ordinal for the
        // sort order to correctly match Lucene, otherwise FST.Builder will throw Debug.Assert failures
        private readonly IDictionary<string, string> pendingPairs = new JCG.SortedDictionary<string, string>(StringComparer.Ordinal);

        /// <summary>
        /// Records a replacement to be applied to the input
        /// stream. Whenever <paramref name="match"/> occurs in
        /// the input, it will be replaced with
        /// <paramref name="replacement"/>.
        /// </summary>
        /// <param name="match"> input String to be replaced </param>
        /// <param name="replacement"> output String </param>
        /// <exception cref="ArgumentNullException"> if
        /// <paramref name="match"/> or <paramref name="replacement"/>
        /// is <c>null</c> </exception>
        /// <exception cref="ArgumentException"> if
        /// <paramref name="match"/> is the empty string, or was
        /// already previously added </exception>
        public virtual void Add(string match, string replacement)
        {
            // Reject nulls here, at the call site that supplied them, instead of
            // failing later with a NullReferenceException (here or inside Build()).
            if (match == null)
            {
                throw new ArgumentNullException(nameof(match));
            }
            if (replacement == null)
            {
                throw new ArgumentNullException(nameof(replacement));
            }
            if (match.Length == 0)
            {
                throw new ArgumentException("cannot match the empty string");
            }
            if (pendingPairs.ContainsKey(match))
            {
                throw new ArgumentException("match \"" + match + "\" was already added");
            }
            pendingPairs[match] = replacement;
        }

        /// <summary>
        /// Builds the <see cref="NormalizeCharMap"/>; call this once you
        /// are done calling <see cref="Add(string, string)"/>.
        /// <para>
        /// The pending mappings are cleared once the FST has been built.
        /// </para>
        /// </summary>
        /// <returns> a new <see cref="NormalizeCharMap"/> wrapping the built FST </returns>
        public virtual NormalizeCharMap Build()
        {
            FST<CharsRef> map;
            try
            {
                Outputs<CharsRef> outputs = CharSequenceOutputs.Singleton;
                Builder<CharsRef> builder = new Builder<CharsRef>(FST.INPUT_TYPE.BYTE2, outputs);
                Int32sRef scratch = new Int32sRef();

                // pendingPairs is a sorted dictionary, so entries are fed to the FST builder in ordinal key order.
                foreach (var ent in pendingPairs)
                {
                    builder.Add(Lucene.Net.Util.Fst.Util.ToUTF16(ent.Key, scratch), new CharsRef(ent.Value));
                }
                map = builder.Finish();
                pendingPairs.Clear();
            }
            catch (IOException ioe)
            {
                // Bogus FST IOExceptions!! (will never happen)
                throw new InvalidOperationException("Should never happen", ioe);
            }
            return new NormalizeCharMap(map);
        }
    }
}
}