using J2N.Text;
using Lucene.Net.Analysis.Ja.Util;
using Lucene.Net.Util;
using Lucene.Net.Util.Fst;
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using JCG = J2N.Collections.Generic;
namespace Lucene.Net.Analysis.Ja.Dict
{
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/// <summary>
/// Class for building a User Dictionary.
/// This class allows for custom segmentation of phrases.
/// </summary>
public sealed class UserDictionary : IDictionary
{
// phrase text -> phrase ID
private readonly TokenInfoFST fst;
// holds wordid, length, length... indexed by phrase ID
private readonly int[][] segmentations;
// holds readings and POS, indexed by wordid
private readonly string[] data;
private static readonly int CUSTOM_DICTIONARY_WORD_ID_OFFSET = 100000000;
public static readonly int WORD_COST = -100000; // large negative cost so user-dictionary entries are strongly preferred
public static readonly int LEFT_ID = 5; // fixed left context ID used for all user entries
public static readonly int RIGHT_ID = 5; // fixed right context ID used for all user entries
private static readonly Regex specialChars = new Regex(@"#.*$", RegexOptions.Compiled); // strips trailing comments
private static readonly Regex commentLine = new Regex(@"  *", RegexOptions.Compiled); // collapses runs of spaces (not comment-related, despite the name)
public UserDictionary(TextReader reader)
{
string line = null;
int wordId = CUSTOM_DICTIONARY_WORD_ID_OFFSET;
List<string[]> featureEntries = new List<string[]>();
// text, segmentation, readings, POS
while ((line = reader.ReadLine()) != null)
{
// Remove comments
line = specialChars.Replace(line, "");
// Skip empty lines or comment lines
if (line.Trim().Length == 0)
{
continue;
}
string[] values = CSVUtil.Parse(line);
featureEntries.Add(values);
}
// TODO: should we allow multiple segmentations per input 'phrase'?
// the old treemap didn't support this either, and I'm not sure if it's needed/useful?
featureEntries.Sort(Comparer<string[]>.Create((left, right) => left[0].CompareToOrdinal(right[0])));
List<string> data = new List<string>(featureEntries.Count);
List<int[]> segmentations = new List<int[]>(featureEntries.Count);
PositiveInt32Outputs fstOutput = PositiveInt32Outputs.Singleton;
Builder<long?> fstBuilder = new Builder<long?>(Lucene.Net.Util.Fst.FST.INPUT_TYPE.BYTE2, fstOutput);
Int32sRef scratch = new Int32sRef();
long ord = 0;
foreach (string[] values in featureEntries)
{
string[] segmentation = commentLine.Replace(values[1], " ").Split(' ').TrimEnd();
string[] readings = commentLine.Replace(values[2], " ").Split(' ').TrimEnd();
string pos = values[3];
if (segmentation.Length != readings.Length)
{
throw new Exception("Illegal user dictionary entry " + values[0] +
" - the number of segmentations (" + segmentation.Length + ")" +
" does not match the number of readings (" + readings.Length + ")");
}
int[] wordIdAndLength = new int[segmentation.Length + 1]; // wordId offset, length, length....
wordIdAndLength[0] = wordId;
for (int i = 0; i < segmentation.Length; i++)
{
wordIdAndLength[i + 1] = segmentation[i].Length;
data.Add(readings[i] + Dictionary.INTERNAL_SEPARATOR + pos);
wordId++;
}
// add mapping to FST
string token = values[0];
scratch.Grow(token.Length);
scratch.Length = token.Length;
for (int i = 0; i < token.Length; i++)
{
scratch.Int32s[i] = (int)token[i];
}
fstBuilder.Add(scratch, ord);
segmentations.Add(wordIdAndLength);
ord++;
}
this.fst = new TokenInfoFST(fstBuilder.Finish(), false);
this.data = data.ToArray(/*new string[data.Count]*/);
this.segmentations = segmentations.ToArray(/*new int[segmentations.Count][]*/);
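// Illustrative state for the single-entry example in the class remarks:
// segmentations[0] == { 100000000, 2, 2, 2 } (first wordId, then each segment's
// length in chars) and data[0] == "ニホン" + INTERNAL_SEPARATOR + "カスタム名詞".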
}
/// <summary>
/// Lookup words in text.
/// </summary>
/// <param name="chars">Text.</param>
/// <param name="off">Offset into text.</param>
/// <param name="len">Length of text.</param>
/// <returns>Array of {wordId, position, length}.</returns>
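/// <remarks>
/// Illustrative sketch, assuming the <c>userDict</c> built in the class remarks;
/// positions in the returned triples are relative to <paramref name="off"/>:
/// <code>
/// char[] text = "日本経済新聞を読む".ToCharArray();
/// int[][] matches = userDict.Lookup(text, 0, text.Length);
/// // matches[0] == { 100000000, 0, 2 }  // "日本"
/// // matches[1] == { 100000001, 2, 2 }  // "経済"
/// // matches[2] == { 100000002, 4, 2 }  // "新聞"
/// </code>
/// </remarks>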
public int[][] Lookup(char[] chars, int off, int len)
{
// TODO: can we avoid this treemap/toIndexArray?
IDictionary<int, int[]> result = new JCG.SortedDictionary<int, int[]>(); // index -> [wordId, length, length...]
bool found = false; // true if we found any results
FST.BytesReader fstReader = fst.GetBytesReader();
FST.Arc<long?> arc = new FST.Arc<long?>();
int end = off + len;
for (int startOffset = off; startOffset < end; startOffset++)
{
arc = fst.GetFirstArc(arc);
int output = 0;
int remaining = end - startOffset;
for (int i = 0; i < remaining; i++)
{
int ch = chars[startOffset + i];
if (fst.FindTargetArc(ch, arc, arc, i == 0, fstReader) == null)
{
break; // continue to next position
}
output += (int)arc.Output;
if (arc.IsFinal)
{
int finalOutput = output + (int)arc.NextFinalOutput;
result[startOffset - off] = segmentations[finalOutput];
found = true;
}
}
}
return found ? ToIndexArray(result) : EMPTY_RESULT;
}
public TokenInfoFST FST => fst;
private static readonly int[][] EMPTY_RESULT = new int[0][];
/// <summary>
/// Convert a map of index to wordIdAndLength to an array of {wordId, index, length}.
/// </summary>
/// <param name="input">Map from phrase start index to {wordId, length, length...}.</param>
/// <returns>Array of {wordId, index, length}.</returns>
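/// <remarks>
/// For example, an input of { 0 -> { 100000000, 2, 2, 2 } } (one phrase starting at
/// index 0 whose first wordId is 100000000, with three segments of length 2) becomes
/// { { 100000000, 0, 2 }, { 100000001, 2, 2 }, { 100000002, 4, 2 } }.
/// </remarks>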
private int[][] ToIndexArray(IDictionary<int, int[]> input)
{
List<int[]> result = new List<int[]>();
foreach (int i in input.Keys)
{
int[] wordIdAndLength = input[i];
int wordId = wordIdAndLength[0];
// convert length to index
int current = i;
for (int j = 1; j < wordIdAndLength.Length; j++)
{ // first entry is wordId offset
int[] token = { wordId + j - 1, current, wordIdAndLength[j] };
result.Add(token);
current += wordIdAndLength[j];
}
}
return result.ToArray(/*new int[result.size()][]*/);
}
public int[] LookupSegmentation(int phraseID)
{
return segmentations[phraseID];
}
public int GetLeftId(int wordId)
{
return LEFT_ID;
}
public int GetRightId(int wordId)
{
return RIGHT_ID;
}
public int GetWordCost(int wordId)
{
return WORD_COST;
}
public string GetReading(int wordId, char[] surface, int off, int len)
{
return GetFeature(wordId, 0);
}
public string GetPartOfSpeech(int wordId)
{
return GetFeature(wordId, 1);
}
public string GetBaseForm(int wordId, char[] surface, int off, int len)
{
return null; // TODO: add support?
}
public string GetPronunciation(int wordId, char[] surface, int off, int len)
{
return null; // TODO: add support?
}
public string GetInflectionType(int wordId)
{
return null; // TODO: add support?
}
public string GetInflectionForm(int wordId)
{
return null; // TODO: add support?
}
private string[] GetAllFeaturesArray(int wordId)
{
string allFeatures = data[wordId - CUSTOM_DICTIONARY_WORD_ID_OFFSET];
if (allFeatures == null)
{
return null;
}
return allFeatures.Split(new string[] { Dictionary.INTERNAL_SEPARATOR }, StringSplitOptions.None).TrimEnd();
}
private string GetFeature(int wordId, params int[] fields)
{
string[] allFeatures = GetAllFeaturesArray(wordId);
if (allFeatures == null)
{
return null;
}
StringBuilder sb = new StringBuilder();
if (fields.Length == 0)
{ // All features
foreach (string feature in allFeatures)
{
sb.Append(CSVUtil.QuoteEscape(feature)).Append(",");
}
}
else if (fields.Length == 1)
{ // a single feature value doesn't need escaping
sb.Append(allFeatures[fields[0]]).Append(",");
}
else
{
foreach (int field in fields)
{
sb.Append(CSVUtil.QuoteEscape(allFeatures[field])).Append(",");
}
}
return sb.Remove(sb.Length - 1, 1).ToString();
}
}
}