// blob: 61e30f2187ee9016caf86082127b6268d517703b [file] [log] [blame]
using J2N.Text;
using Lucene.Net.Analysis.Ja.Dict;
using System;
using System.Collections.Generic;
using System.Globalization;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
namespace Lucene.Net.Analysis.Ja.Util
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Builds an <see cref="UnknownDictionaryWriter"/> from the <c>unk.def</c> and
/// <c>char.def</c> definition files of a dictionary source directory.
/// </summary>
public class UnknownDictionaryBuilder
{
    /// <summary>
    /// Entry that is always put into the dictionary before the entries read from <c>unk.def</c>.
    /// </summary>
    private const string NGRAM_DICTIONARY_ENTRY = "NGRAM,5,5,-32768,記号,一般,*,*,*,*,*,*,*";

    /// <summary>Separator used to split a category mapping line at its first space.</summary>
    private static readonly char[] SpaceSeparator = { ' ' };

    // Name of the text encoding used to decode the definition files.
    // The initializer is always replaced by the constructor argument.
    private readonly string encoding = "euc-jp";

    /// <summary>
    /// Creates a builder that decodes the definition files with the given encoding.
    /// </summary>
    /// <param name="encoding">Encoding name, resolved through <see cref="Encoding.GetEncoding(string)"/> when a file is read.</param>
    public UnknownDictionaryBuilder(string encoding)
    {
        this.encoding = encoding;
    }

    /// <summary>
    /// Reads <c>unk.def</c> and <c>char.def</c> from <paramref name="dirname"/> and returns the populated writer.
    /// </summary>
    /// <param name="dirname">Directory that contains both definition files.</param>
    /// <returns>The writer holding the dictionary entries and the character definitions.</returns>
    public virtual UnknownDictionaryWriter Build(string dirname)
    {
        // Path.Combine avoids a doubled separator and does not turn an empty directory name into a root path.
        UnknownDictionaryWriter unkDictionary = ReadDictionaryFile(System.IO.Path.Combine(dirname, "unk.def")); // Should be only one file
        ReadCharacterDefinition(System.IO.Path.Combine(dirname, "char.def"), unkDictionary);
        return unkDictionary;
    }

    /// <summary>
    /// Reads the dictionary file using the encoding given to the constructor.
    /// </summary>
    /// <param name="filename">Path of the <c>unk.def</c> file.</param>
    /// <returns>A new writer containing the entries of the file.</returns>
    public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename)
    {
        return ReadDictionaryFile(filename, encoding);
    }

    /// <summary>
    /// Reads the dictionary file and puts its entries into a new writer, ordered by the
    /// character class id of their first field. Entries that share a character class keep
    /// the order in which they appear in the file.
    /// </summary>
    /// <param name="filename">Path of the <c>unk.def</c> file.</param>
    /// <param name="encoding">Name of the encoding used to decode the file.</param>
    /// <returns>A new writer containing the n-gram entry followed by the sorted file entries.</returns>
    public virtual UnknownDictionaryWriter ReadDictionaryFile(string filename, string encoding)
    {
        // NOTE(review): 5 * 1024 * 1024 is presumably the writer's initial buffer size - confirm against UnknownDictionaryWriter.
        UnknownDictionaryWriter dictionary = new UnknownDictionaryWriter(5 * 1024 * 1024);

        List<string[]> lines = new List<string[]>();
        Encoding decoder = Encoding.GetEncoding(encoding);
        using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, decoder))
        {
            dictionary.Put(CSVUtil.Parse(NGRAM_DICTIONARY_ENTRY));

            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // note: unk.def only has 10 fields, it simplifies the writer to just append empty reading and pronunciation,
                // even though the unknown dictionary returns hardcoded null here.
                string[] parsed = CSVUtil.Parse(line + ",*,*"); // Probably we don't need to validate entry
                lines.Add(parsed);
            }
        }

        // List<T>.Sort and Array.Sort are unstable, so sort the entry indices and break ties on the
        // original index. This keeps entries of the same character class in file order.
        int[] order = new int[lines.Count];
        for (int i = 0; i < order.Length; i++)
        {
            order[i] = i;
        }

        Array.Sort(order, Comparer<int>.Create((left, right) =>
        {
            int leftId = CharacterDefinition.LookupCharacterClass(lines[left][0]);
            int rightId = CharacterDefinition.LookupCharacterClass(lines[right][0]);
            int byClass = leftId.CompareTo(rightId);
            return byClass != 0 ? byClass : left.CompareTo(right);
        }));

        foreach (int index in order)
        {
            dictionary.Put(lines[index]);
        }

        return dictionary;
    }

    /// <summary>
    /// Reads a character definition file and registers its contents with <paramref name="dictionary"/>.
    /// <para/>
    /// Lines starting with <c>0x</c> map a single code point (<c>0xXXXX</c>) or an inclusive range
    /// (<c>0xXXXX..0xYYYY</c>) to the category text that follows the first space. Every other
    /// non-empty line is an invoke definition of the form <c>NAME invoke group length</c>.
    /// Text starting at <c>#</c> is treated as a comment and ignored.
    /// </summary>
    /// <param name="filename">Path of the <c>char.def</c> file; it is decoded with the encoding given to the constructor.</param>
    /// <param name="dictionary">Writer that receives the category mappings and invoke definitions.</param>
    public virtual void ReadCharacterDefinition(string filename, UnknownDictionaryWriter dictionary)
    {
        using (Stream inputStream = new FileStream(filename, FileMode.Open, FileAccess.Read))
        using (TextReader reader = new StreamReader(inputStream, Encoding.GetEncoding(encoding)))
        {
            string line;
            while ((line = reader.ReadLine()) != null)
            {
                // Remove ALL leading whitespace. Removing only one character would leave indented or
                // whitespace-only lines starting with a space, and they would then be misparsed below.
                line = Regex.Replace(line, "^\\s+", "");
                line = Regex.Replace(line, "\\s*#.*", "");
                line = Regex.Replace(line, "\\s+", " ");

                // Skip empty line or comment line
                if (line.Length == 0)
                {
                    continue;
                }

                if (line.StartsWith("0x", StringComparison.Ordinal))
                {
                    // Category mapping
                    string[] values = line.Split(SpaceSeparator, 2); // Split only first space

                    if (!values[0].Contains(".."))
                    {
                        int cp = Convert.ToInt32(values[0], 16);
                        dictionary.PutCharacterCategory(cp, values[1]);
                    }
                    else
                    {
                        string[] codePoints = Regex.Split(values[0], "\\.\\.").TrimEnd();
                        int cpFrom = Convert.ToInt32(codePoints[0], 16);
                        int cpTo = Convert.ToInt32(codePoints[1], 16);

                        // The range is inclusive on both ends.
                        for (int i = cpFrom; i <= cpTo; i++)
                        {
                            dictionary.PutCharacterCategory(i, values[1]);
                        }
                    }
                }
                else
                {
                    // Invoke definition
                    string[] values = line.Split(' ').TrimEnd(); // Consecutive space is merged above
                    string characterClassName = values[0];
                    int invoke = int.Parse(values[1], CultureInfo.InvariantCulture);
                    int group = int.Parse(values[2], CultureInfo.InvariantCulture);
                    int length = int.Parse(values[3], CultureInfo.InvariantCulture);
                    dictionary.PutInvokeDefinition(characterClassName, invoke, group, length);
                }
            }
        }
    }
}
}