using J2N.Text;
using Lucene.Net.Analysis.Ja.Util;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Ja.Dict
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Class for building a User Dictionary.
/// This class allows for custom segmentation of phrases.
/// </summary>
public sealed class UserDictionary : IDictionary
{
// phrase text -> phrase ID
private readonly TokenInfoFST fst;
// holds wordid, length, length... indexed by phrase ID
private readonly int[][] segmentations;
// holds readings and POS, indexed by wordid
private readonly string[] data;
private static readonly int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
public static readonly int WORD_COST = -100000; // large negative cost so user-dictionary entries are strongly preferred
public static readonly int LEFT_ID = 5; // fixed left context ID used for all user entries
public static readonly int RIGHT_ID = 5; // fixed right context ID used for all user entries
private static readonly Regex specialChars = new Regex(@"#.*$", RegexOptions.Compiled); // strips trailing comments
private static readonly Regex commentLine = new Regex(@"  *", RegexOptions.Compiled); // collapses runs of spaces (not comment-related, despite the name)
public UserDictionary(TextReader reader)
{
string line = null;
int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
List<string[]> featureEntries = new List<string[]>();
// text, segmentation, readings, POS
while ((line = reader.ReadLine()) != null)
{
// Remove comments
line = specialChars.Replace(line, "");
// Skip empty lines or comment lines
if (line.Trim().Length == 0)
{
continue;
}
string[] values = CSVUtil.Parse(line);
featureEntries.Add(values);
}
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and I'm not sure if it's needed/useful?
featureEntries.Sort(Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0])));
List<string> data = new List<string>(featureEntries.Count);
List<int[]> segmentations = new List<int[]>(featureEntries.Count);
PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
Int32sRef scratch = new Int32sRef();
long ord = 0;
foreach (string[] values in featureEntries)
{
string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
string[] readings = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
string pos = values[3];
if (segmentation.Length != readings.Length)
{
throw new Exception("Illegal user dictionary entry " + values[0] +
" - the number of segmentations (" + segmentation.Length + ")" +
" does not match the number of readings (" + readings.Length + ")");
}
int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;
for (int i = 0; i < segmentation.Length; i++)
{
wordIdAndLength[i + 1] = segmentation[i].Length;
data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
wordId++;
}
// add mapping to FST
string token = values[0];
scratch.Grow(token.Length);
scratch.Length = token.Length;
for (int i = 0; i < token.Length; i++)
{
scratch.Int32s[i] = (int)token[i];
}
fstBuilder.Add(scratch, ord);
segmentations.Add(wordIdAndLength);
ord++;
}
this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
this.data = data.ToArray(/*new string[data.Count]*/);
this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
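// Illustrative state for the single-entry example in the class remarks:
// segmentations[0] == { 100000000, 2, 2, 2 } (first wordId, then each segment's
// length in chars) and data[0] == "ニホン" + INTERNAL_SEPARATOR + "カスタム名詞".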
}
/// <summary>
/// Lookup words in text.
/// </summary>
/// <param name="chars">Text.</param>
/// <param name="off">Offset into text.</param>
/// <param name="len">Length of text.</param>
/// <returns>Array of {wordId, position, length}.</returns>
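/// <remarks>
/// Illustrative sketch, assuming the <c>userDict</c> built in the class remarks;
/// positions in the returned triples are relative to <paramref name="off"/>:
/// <code>
/// char[] text = "日本経済新聞を読む".ToCharArray();
/// int[][] matches = userDict.Lookup(text, 0, text.Length);
/// // matches[0] == { 100000000, 0, 2 }  // "日本"
/// // matches[1] == { 100000001, 2, 2 }  // "経済"
/// // matches[2] == { 100000002, 4, 2 }  // "新聞"
/// </code>
/// </remarks>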
public int[][] Lookup(char[] chars, int off, int len)
{
// TODO: can we avoid this treemap/toIndexArray?
IDictionary<int, int[]> result = new JCG.SortedDictionary<int, int[]>(); // index -> [wordId, length, length...]
bool found = false; // true if we found any results
FST.BytesReader fstReader = fst.GetBytesReader();
FST.Arc<long?> arc = new FST.Arc<long?>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++)
{
arc = fst.GetFirstArc(arc);
int output = 0;
int remaining = end - startOffset;
for (int i = 0; i < remaining; i++)
{
int ch = chars[startOffset + i];
if (fst.FindTargetArc(ch, arc, arc, i == 0, fstReader) == null)
{
break; // continue to next position
}
output += (int)arc.Output;
if (arc.IsFinal)
{
int finalOutput = output + (int)arc.NextFinalOutput;
result[startOffset - off] = segmentations[finalOutput];
found = true;
}
}
}
return found ? ToIndexArray(result) : EMPTY_RESULT;
}
public TokenInfoFST FST => fst;
private static readonly int[][] EMPTY_RESULT = new int[0][];
/// <summary>
/// Convert a map of index to wordIdAndLength to an array of {wordId, index, length}.
/// </summary>
/// <param name="input">Map from phrase start index to {wordId, length, length...}.</param>
/// <returns>Array of {wordId, index, length}.</returns>
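/// <remarks>
/// For example, an input of { 0 -> { 100000000, 2, 2, 2 } } (one phrase starting at
/// index 0 whose first wordId is 100000000, with three segments of length 2) becomes
/// { { 100000000, 0, 2 }, { 100000001, 2, 2 }, { 100000002, 4, 2 } }.
/// </remarks>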
private int[][] ToIndexArray(IDictionary<int, int[]> input)
{
List<int[]> result = new List<int[]>();
foreach (int i in input.Keys)
{
int[] wordIdAndLength = input[i];
int wordId = wordIdAndLength[0];
// convert length to index
int current = i;
for (int j = 1; j < wordIdAndLength.Length; j++)
{ // first entry is wordId offset
int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
result.Add(token);
current += wordIdAndLength[j];
}
}
return result.ToArray(/*new int[result.size()][]*/);
}
public int[] LookupSegmentation(int phraseID)
{
return segmentations[phraseID];
}
public int GetLeftId(int wordId)
{
return LEFT_ID;
}
public int GetRightId(int wordId)
{
return RIGHT_ID;
}
public int GetWordCost(int wordId)
{
return WORD_COST;
}
public string GetReading(int wordId, char[] surface, int off, int len)
{
return GetFeature(wordId, 0);
}
public string GetPartOfSpeech(int wordId)
{
return GetFeature(wordId, 1);
}
public string GetBaseForm(int wordId, char[] surface, int off, int len)
{
return null; // TODO: add support?
}
public string GetPronunciation(int wordId, char[] surface, int off, int len)
{
return null; // TODO: add support?
}
public string GetInflectionType(int wordId)
{
return null; // TODO: add support?
}
public string GetInflectionForm(int wordId)
{
return null; // TODO: add support?
}
private string[] GetAllFeaturesArray(int wordId)
{
string allFeatures = data[wordId - CUSTOM_DICTIONARY_WORD_ID_OFFSET];
if (allFeatures == null)
{
return null;
}
return allFeatures.Split(new string[] { Dictionary.INTERNAL_SEPARATOR }, StringSplitOptions.None).TrimEnd();
}
private string GetFeature(int wordId, params int[] fields)
{
string[] allFeatures = GetAllFeaturesArray(wordId);
if (allFeatures == null)
{
return null;
}
StringBuilder sb = new StringBuilder();
if (fields.Length == 0)
{ // All features
foreach (string feature in allFeatures)
{
sb.Append(CSVUtil.QuoteEscape(feature)).Append(",");
}
}
else if (fields.Length == 1)
{ // a single feature value doesn't need escaping
sb.Append(allFeatures[fields[0]]).Append(",");
}
else
{
foreach (int field in fields)
{
sb.Append(CSVUtil.QuoteEscape(allFeatures[field])).Append(",");
}
}
return sb.Remove(sb.Length - 1, 1).ToString();
}
}
}