// blob: 6127ca03469ef142a310813cfd888bdf33dc7f9e
using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using Console = Lucene.Net.Support.SystemConsole;
namespace Lucene.Net.Analysis.Ja.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
public class TokenInfoDictionaryBuilder
{
/// <summary>Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file</summary>
private int offset = 0;
/// <summary>Name of the character encoding used to decode the CSV files. Always set by the constructor.</summary>
private readonly string encoding = "euc-jp";
/// <summary>
/// When <c>true</c>, entries whose surface form is not NFKC-normalized get an additional,
/// NFKC-normalized copy added to the dictionary. Always set by the constructor.
/// </summary>
private readonly bool normalizeEntries = false;
//private Normalizer2 normalizer;
/// <summary>Layout of the source CSV columns; decides how <see cref="FormatEntry(string[])"/> maps them. Always set by the constructor.</summary>
private readonly DictionaryBuilder.DictionaryFormat format = DictionaryBuilder.DictionaryFormat.IPADIC;

/// <summary>
/// Creates a builder for the given source dictionary format.
/// </summary>
/// <param name="format">Column layout of the source CSV files.</param>
/// <param name="encoding">Name of the encoding used to read the CSV files; it is resolved with <c>Encoding.GetEncoding</c> when files are parsed.</param>
/// <param name="normalizeEntries">Whether to also add NFKC-normalized variants of entries that are not already normalized.</param>
public TokenInfoDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, string encoding, bool normalizeEntries)
{
    this.format = format;
    this.encoding = encoding;
    this.normalizeEntries = normalizeEntries;
    //this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null;
}
/// <summary>
/// Collects every <c>*.csv</c> file found in <paramref name="dirname"/>, orders the files by
/// full path, and builds the dictionary from them via <see cref="BuildDictionary(IList{string})"/>.
/// </summary>
/// <param name="dirname">Directory that contains the source CSV files.</param>
/// <returns>The dictionary writer produced by <see cref="BuildDictionary(IList{string})"/>.</returns>
public virtual TokenInfoDictionaryWriter Build(string dirname)
{
    List<string> csvFiles = new List<string>();
    foreach (FileInfo file in new DirectoryInfo(dirname).EnumerateFiles("*.csv"))
    {
        csvFiles.Add(file.FullName);
    }

    // Sort ordinally: the parameterless Sort() compares strings with the current culture,
    // which would make the file order (and therefore the built dictionary) machine-dependent.
    csvFiles.Sort(StringComparer.Ordinal);
    return BuildDictionary(csvFiles);
}
/// <summary>
/// Parses the given CSV files, sorts all entries by surface form, writes them into a
/// <see cref="TokenInfoDictionaryWriter"/> and attaches an FST that maps each distinct
/// surface form to its ordinal.
/// </summary>
/// <param name="csvFiles">Paths of the CSV files to read. Entries with the same surface form keep the order in which they were read.</param>
/// <returns>The populated dictionary writer.</returns>
public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles)
{
    TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024);
    // all lines in the file
    Console.WriteLine(" parse...");
    List<string[]> lines = new List<string[]>(400000);
    foreach (string file in csvFiles)
    {
        Encoding decoder = Encoding.GetEncoding(encoding);
        using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, decoder))
        {
            string line = null;
            while ((line = reader.ReadLine()) != null)
            {
                string[] entry = CSVUtil.Parse(line);
                if (entry.Length < 13)
                {
                    Console.WriteLine("Entry in CSV is not valid: " + line);
                    continue;
                }
                string[] formatted = FormatEntry(entry);
                lines.Add(formatted);
                // NFKC normalize dictionary entry
                if (normalizeEntries)
                {
                    //if (normalizer.isNormalized(entry[0])){
                    if (entry[0].IsNormalized(NormalizationForm.FormKC))
                    {
                        continue;
                    }
                    string[] normalizedEntry = new string[entry.Length];
                    for (int i = 0; i < entry.Length; i++)
                    {
                        //normalizedEntry[i] = normalizer.normalize(entry[i]);
                        normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC);
                    }
                    formatted = FormatEntry(normalizedEntry);
                    lines.Add(formatted);
                }
            }
        }
    }
    Console.WriteLine(" sort...");
    // sort by term: the entries were read in file order and must keep that order when
    // surface forms are equal. List<T>.Sort is NOT stable, so a stable helper is used.
    StableSortByTerm(lines);
    Console.WriteLine(" encode...");
    PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
    Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15);
    Int32sRef scratch = new Int32sRef();
    long ord = -1; // first ord will be 0
    string lastValue = null;
    // build tokeninfo dictionary
    foreach (string[] entry in lines)
    {
        int next = dictionary.Put(entry);
        // Put returning the unchanged offset is treated as "nothing was written" for this entry.
        if (next == offset)
        {
            Console.WriteLine("Failed to process line: " + Collections.ToString(entry));
            continue;
        }
        string token = entry[0];
        if (!token.Equals(lastValue, StringComparison.Ordinal))
        {
            // new word to add to fst
            ord++;
            lastValue = token;
            scratch.Grow(token.Length);
            scratch.Length = token.Length;
            for (int i = 0; i < token.Length; i++)
            {
                scratch.Int32s[i] = (int)token[i];
            }
            fstBuilder.Add(scratch, ord);
        }
        // Every entry sharing this surface form maps the same ord to its own offset.
        dictionary.AddMapping((int)ord, offset);
        offset = next;
    }
    FST<long?> fst = fstBuilder.Finish();
    Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... ");
    dictionary.SetFST(fst);
    Console.WriteLine(" done");
    return dictionary;
}

/// <summary>
/// Sorts <paramref name="lines"/> by surface form while keeping entries with equal surface
/// forms in their original relative order.
/// </summary>
/// <param name="lines">The entries to sort in place.</param>
private static void StableSortByTerm(List<string[]> lines)
{
    string[][] snapshot = lines.ToArray();
    int[] order = new int[snapshot.Length];
    for (int i = 0; i < order.Length; i++)
    {
        order[i] = i;
    }

    IComparer<string[]> byTerm = new ComparerAnonymousHelper();
    // Breaking ties on the original position makes the ordering total, so the result is
    // the same regardless of the (unstable) algorithm Array.Sort uses internally.
    Array.Sort(order, (x, y) =>
    {
        int cmp = byTerm.Compare(snapshot[x], snapshot[y]);
        return cmp != 0 ? cmp : x.CompareTo(y);
    });

    for (int i = 0; i < order.Length; i++)
    {
        lines[i] = snapshot[order[i]];
    }
}
/// <summary>
/// Compares two dictionary entries by their first element (the surface form) using
/// <c>CompareToOrdinal</c>.
/// </summary>
private class ComparerAnonymousHelper : IComparer<string[]>
{
    public int Compare(string[] left, string[] right)
    {
        string leftSurface = left[0];
        string rightSurface = right[0];
        int result = leftSurface.CompareToOrdinal(rightSurface);
        return result;
    }
}
/// <summary>
/// Converts a parsed CSV row into the 13-column IPADIC layout. IPADIC rows are returned
/// unchanged (same array instance); rows in any other format are treated as UniDic and
/// remapped into a new array.
/// <para/>
/// IPADIC features
///
/// 0 - surface
/// 1 - left cost
/// 2 - right cost
/// 3 - word cost
/// 4-9 - pos
/// 10 - base form
/// 11 - reading
/// 12 - pronunciation
///
/// UniDic features
///
/// 0 - surface
/// 1 - left cost
/// 2 - right cost
/// 3 - word cost
/// 4-9 - pos
/// 10 - base form reading
/// 11 - base form
/// 12 - surface form
/// 13 - surface reading
/// </summary>
/// <param name="features">Columns of one CSV row.</param>
/// <returns>The row in IPADIC column order.</returns>
public virtual string[] FormatEntry(string[] features)
{
    if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC)
    {
        return features;
    }

    string[] features2 = new string[13];
    // surface, costs and part-of-speech columns (0-9) are laid out identically in both formats
    for (int i = 0; i < 10; i++)
    {
        features2[i] = features[i];
    }
    features2[10] = features[11];

    // The caller only guarantees 13 columns (indices 0-12), so column 13 may be absent.
    // A missing column is handled like an empty one instead of throwing IndexOutOfRangeException.
    string surfaceReading = features.Length > 13 ? features[13] : string.Empty;

    // If the surface reading is non-existent, use surface form for reading and pronunciation.
    // This happens with punctuation in UniDic and there are possibly other cases as well
    string reading = surfaceReading.Length == 0 ? features[0] : surfaceReading;
    features2[11] = reading;
    features2[12] = reading;
    return features2;
}
}
}