using J2N.Text;
using Lucene.Net.Support;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using Lucene.Net.Util.Packed;
using System;
using System.Collections.Generic;
using System.IO;
using System.Linq;
using System.Text;
using Console = Lucene.Net.Support.SystemConsole;
| |
| namespace Lucene.Net.Analysis.Ja.Util |
| { |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| public class TokenInfoDictionaryBuilder |
| { |
| /// <summary>Internal word id - incrementally assigned as entries are read and added. This will be byte offset of dictionary file</summary> |
| private int offset = 0; |
| |
| private string encoding = "euc-jp"; |
| |
| private bool normalizeEntries = false; |
| //private Normalizer2 normalizer; |
| |
| private DictionaryBuilder.DictionaryFormat format = DictionaryBuilder.DictionaryFormat.IPADIC; |
| |
| public TokenInfoDictionaryBuilder(DictionaryBuilder.DictionaryFormat format, string encoding, bool normalizeEntries) |
| { |
| this.format = format; |
| this.encoding = encoding; |
| this.normalizeEntries = normalizeEntries; |
| //this.normalizer = normalizeEntries ? Normalizer2.getInstance(null, "nfkc", Normalizer2.Mode.COMPOSE) : null; |
| } |
| |
| public virtual TokenInfoDictionaryWriter Build(string dirname) |
| { |
| List<string> csvFiles = new List<string>(); |
| foreach (FileInfo file in new DirectoryInfo(dirname).EnumerateFiles("*.csv")) |
| { |
| csvFiles.Add(file.FullName); |
| } |
| csvFiles.Sort(); |
| return BuildDictionary(csvFiles); |
| } |
| |
| public virtual TokenInfoDictionaryWriter BuildDictionary(IList<string> csvFiles) |
| { |
| TokenInfoDictionaryWriter dictionary = new TokenInfoDictionaryWriter(10 * 1024 * 1024); |
| |
| // all lines in the file |
| Console.WriteLine(" parse..."); |
| List<string[]> lines = new List<string[]>(400000); |
| foreach (string file in csvFiles) |
| { |
| using (Stream inputStream = new FileStream(file, FileMode.Open, FileAccess.Read)) |
| { |
| Encoding decoder = Encoding.GetEncoding(encoding); |
| TextReader reader = new StreamReader(inputStream, decoder); |
| |
| string line = null; |
| while ((line = reader.ReadLine()) != null) |
| { |
| string[] entry = CSVUtil.Parse(line); |
| |
| if (entry.Length < 13) |
| { |
| Console.WriteLine("Entry in CSV is not valid: " + line); |
| continue; |
| } |
| |
| string[] formatted = FormatEntry(entry); |
| lines.Add(formatted); |
| |
| // NFKC normalize dictionary entry |
| if (normalizeEntries) |
| { |
| //if (normalizer.isNormalized(entry[0])){ |
| if (entry[0].IsNormalized(NormalizationForm.FormKC)) |
| { |
| continue; |
| } |
| string[] normalizedEntry = new string[entry.Length]; |
| for (int i = 0; i < entry.Length; i++) |
| { |
| //normalizedEntry[i] = normalizer.normalize(entry[i]); |
| normalizedEntry[i] = entry[i].Normalize(NormalizationForm.FormKC); |
| } |
| |
| formatted = FormatEntry(normalizedEntry); |
| lines.Add(formatted); |
| } |
| } |
| } |
| } |
| |
| Console.WriteLine(" sort..."); |
| |
| // sort by term: we sorted the files already and use a stable sort. |
| lines.Sort(new ComparerAnonymousHelper()); |
| |
| Console.WriteLine(" encode..."); |
| |
| PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton; |
| Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, 0, 0, true, true, int.MaxValue, fstOutput, null, true, PackedInt32s.DEFAULT, true, 15); |
| Int32sRef scratch = new Int32sRef(); |
| long ord = -1; // first ord will be 0 |
| string lastValue = null; |
| |
| // build tokeninfo dictionary |
| foreach (string[] entry in lines) |
| { |
| int next = dictionary.Put(entry); |
| |
| if (next == offset) |
| { |
| Console.WriteLine("Failed to process line: " + Collections.ToString(entry)); |
| continue; |
| } |
| |
| string token = entry[0]; |
| if (!token.Equals(lastValue, StringComparison.Ordinal)) |
| { |
| // new word to add to fst |
| ord++; |
| lastValue = token; |
| scratch.Grow(token.Length); |
| scratch.Length = token.Length; |
| for (int i = 0; i < token.Length; i++) |
| { |
| scratch.Int32s[i] = (int)token[i]; |
| } |
| fstBuilder.Add(scratch, ord); |
| } |
| dictionary.AddMapping((int)ord, offset); |
| offset = next; |
| } |
| |
| FST<long?> fst = fstBuilder.Finish(); |
| |
| Console.WriteLine(" " + fst.NodeCount + " nodes, " + fst.ArcCount + " arcs, " + fst.GetSizeInBytes() + " bytes... "); |
| dictionary.SetFST(fst); |
| Console.WriteLine(" done"); |
| |
| return dictionary; |
| } |
| |
| private class ComparerAnonymousHelper : IComparer<string[]> |
| { |
| public int Compare(string[] left, string[] right) |
| { |
| return left[0].CompareToOrdinal(right[0]); |
| } |
| } |
| |
| /// <summary> |
| /// IPADIC features |
| /// |
| /// 0 - surface |
| /// 1 - left cost |
| /// 2 - right cost |
| /// 3 - word cost |
| /// 4-9 - pos |
| /// 10 - base form |
| /// 11 - reading |
| /// 12 - pronounciation |
| /// |
| /// UniDic features |
| /// |
| /// 0 - surface |
| /// 1 - left cost |
| /// 2 - right cost |
| /// 3 - word cost |
| /// 4-9 - pos |
| /// 10 - base form reading |
| /// 11 - base form |
| /// 12 - surface form |
| /// 13 - surface reading |
| /// </summary> |
| public virtual string[] FormatEntry(string[] features) |
| { |
| if (this.format == DictionaryBuilder.DictionaryFormat.IPADIC) |
| { |
| return features; |
| } |
| else |
| { |
| string[] features2 = new string[13]; |
| features2[0] = features[0]; |
| features2[1] = features[1]; |
| features2[2] = features[2]; |
| features2[3] = features[3]; |
| features2[4] = features[4]; |
| features2[5] = features[5]; |
| features2[6] = features[6]; |
| features2[7] = features[7]; |
| features2[8] = features[8]; |
| features2[9] = features[9]; |
| features2[10] = features[11]; |
| |
| // If the surface reading is non-existent, use surface form for reading and pronunciation. |
| // This happens with punctuation in UniDic and there are possibly other cases as well |
| if (features[13].Length == 0) |
| { |
| features2[11] = features[0]; |
| features2[12] = features[0]; |
| } |
| else |
| { |
| features2[11] = features[13]; |
| features2[12] = features[13]; |
| } |
| return features2; |
| } |
| } |
| } |
| } |