src/Lucene.Net.Analysis.SmartCn/Hhmm/BiSegGraph.cs - lucenenet - Git at Google

 // lucene version compatibility level: 4.8.1
 using Lucene.Net.Support;
 using System;
 using System.Collections.Generic;
 using System.Text;

 namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
 {
     /*
      * Licensed to the Apache Software Foundation (ASF) under one or more
      * contributor license agreements.  See the NOTICE file distributed with
      * this work for additional information regarding copyright ownership.
      * The ASF licenses this file to You under the Apache License, Version 2.0
      * (the "License"); you may not use this file except in compliance with
      * the License.  You may obtain a copy of the License at
      *
      *     http://www.apache.org/licenses/LICENSE-2.0
      *
      * Unless required by applicable law or agreed to in writing, software
      * distributed under the License is distributed on an "AS IS" BASIS,
      * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      * See the License for the specific language governing permissions and
      * limitations under the License.
      */

     /// <summary>
     /// Graph representing possible token pairs (bigrams) at each start offset in the sentence.
     /// <para>
     /// For each start offset, a list of possible token pairs is stored.
     /// </para>
     /// @lucene.experimental
     /// </summary>
     internal class BiSegGraph
     {
         private readonly IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>(); // LUCENENET: marked readonly

         private IList<SegToken> segTokenList;

         private static readonly BigramDictionary bigramDict = BigramDictionary.GetInstance(); // LUCENENET: marked readonly

         public BiSegGraph(SegGraph segGraph)
         {
             segTokenList = segGraph.MakeIndex();
             GenerateBiSegGraph(segGraph);
         }

         /// <summary>
         /// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>
         /// </summary>
         private void GenerateBiSegGraph(SegGraph segGraph)
         {
             double smooth = 0.1;
             int wordPairFreq; // LUCENENET: IDE0059: Remove unnecessary value assignment
             int maxStart = segGraph.MaxStart;
             double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;

             int next;
             char[] idBuffer;
             // get the list of tokens ordered and indexed
             segTokenList = segGraph.MakeIndex();
             // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
             int key = -1;
             IList<SegToken> nextTokens; // LUCENENET: IDE0059: Remove unnecessary value assignment
             while (key < maxStart)
             {
                 if (segGraph.IsStartExist(key))
                 {

                     IList<SegToken> tokenList = segGraph.GetStartList(key);

                     // Calculate all tokens for a given key.
                     foreach (SegToken t1 in tokenList)
                     {
                         oneWordFreq = t1.Weight;
                         next = t1.EndOffset;
                         nextTokens = null;
                         // Find the next corresponding Token.
                         // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
                         // If we cannot find the next Token, then go to the end and repeat the same cycle.
                         while (next <= maxStart)
                         {
                             // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
                             if (segGraph.IsStartExist(next))
                             {
                                 nextTokens = segGraph.GetStartList(next);
                                 break;
                             }
                             next++;
                         }
                         if (nextTokens == null)
                         {
                             break;
                         }
                         foreach (SegToken t2 in nextTokens)
                         {
                             idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1];
                             System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length);
                             idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
                             System.Array.Copy(t2.CharArray, 0, idBuffer,
                                 t1.CharArray.Length + 1, t2.CharArray.Length);

                             // Two linked Words frequency
                             wordPairFreq = bigramDict.GetFrequency(idBuffer);

                             // Smoothing

                             // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
                             weight = -Math
                                 .Log(smooth
                                     * (1.0 + oneWordFreq)
                                     / (Utility.MAX_FREQUENCE + 0.0)
                                     + (1.0 - smooth)
                                     * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));

                             SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index,
                                 t2.Index, weight);
                             this.AddSegTokenPair(tokenPair);
                         }
                     }
                 }
                 key++;
             }

         }

         /// <summary>
         /// Returns <c>true</c> if their is a list of token pairs at this offset (index of the second token)
         /// </summary>
         /// <param name="to">index of the second token in the token pair</param>
         /// <returns><c>true</c> if a token pair exists</returns>
         public virtual bool IsToExist(int to)
         {
             return tokenPairListTable.TryGetValue(to, out IList<SegTokenPair> result) && result != null;
         }

         /// <summary>
         /// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token)
         /// </summary>
         /// <param name="to">index of the second token in the token pair</param>
         /// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns>
         public virtual IList<SegTokenPair> GetToList(int to)
         {
             tokenPairListTable.TryGetValue(to, out IList<SegTokenPair> result);
             return result;
         }

         /// <summary>
         /// Add a <see cref="SegTokenPair"/>
         /// </summary>
         /// <param name="tokenPair"><see cref="SegTokenPair"/></param>
         public virtual void AddSegTokenPair(SegTokenPair tokenPair)
         {
             int to = tokenPair.To;
             if (!IsToExist(to))
             {
                 List<SegTokenPair> newlist = new List<SegTokenPair>
                 {
                     tokenPair
                 };
                 tokenPairListTable[to] = newlist;
             }
             else
             {
                 IList<SegTokenPair> tokenPairList = tokenPairListTable[to];
                 tokenPairList.Add(tokenPair);
             }
         }

         /// <summary>
         /// Get the number of <see cref="SegTokenPair"/> entries in the table.
         /// </summary>
         /// <returns>number of <see cref="SegTokenPair"/> entries</returns>
         public virtual int ToCount => tokenPairListTable.Count;

         /// <summary>
         /// Find the shortest path with the Viterbi algorithm.
         /// </summary>
         /// <returns><see cref="T:IList{SegToken}"/></returns>
         [ExceptionToNetNumericConvention]
         public virtual IList<SegToken> GetShortPath()
         {
             int current;
             int nodeCount = ToCount;
             IList<PathNode> path = new List<PathNode>();
             PathNode zeroPath = new PathNode();
             zeroPath.Weight = 0;
             zeroPath.PreNode = 0;
             path.Add(zeroPath);
             for (current = 1; current <= nodeCount; current++)
             {
                 double weight;
                 IList<SegTokenPair> edges = GetToList(current);

                 double minWeight = double.MaxValue;
                 SegTokenPair minEdge = null;
                 foreach (SegTokenPair edge in edges)
                 {
                     weight = edge.Weight;
                     PathNode preNode2 = path[edge.From];
                     if (preNode2.Weight + weight < minWeight)
                     {
                         minWeight = preNode2.Weight + weight;
                         minEdge = edge;
                     }
                 }
                 PathNode newNode = new PathNode();
                 newNode.Weight = minWeight;
                 newNode.PreNode = minEdge.From;
                 path.Add(newNode);
             }

             // Calculate PathNodes
             int preNode, lastNode;
             lastNode = path.Count - 1;
             current = lastNode;
             IList<int> rpath = new List<int>();
             IList<SegToken> resultPath = new List<SegToken>();

             rpath.Add(current);
             while (current != 0)
             {
                 PathNode currentPathNode = path[current];
                 preNode = currentPathNode.PreNode;
                 rpath.Add(preNode);
                 current = preNode;
             }
             for (int j = rpath.Count - 1; j >= 0; j--)
             {
                 //int idInteger = rpath.get(j);
                 //int id = idInteger.intValue();
                 int id = rpath[j];
                 SegToken t = segTokenList[id];
                 resultPath.Add(t);
             }
             return resultPath;
         }

         public override string ToString()
         {
             StringBuilder sb = new StringBuilder();
             ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values;
             foreach (IList<SegTokenPair> segList in values)
             {
                 foreach (SegTokenPair pair in segList)
                 {
                     sb.Append(pair + "\n");
                 }
             }
             return sb.ToString();
         }
     }
 }
	// lucene version compatibility level: 4.8.1
	using Lucene.Net.Support;
	using System;
	using System.Collections.Generic;
	using System.Text;

	namespace Lucene.Net.Analysis.Cn.Smart.Hhmm
	{
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/// <summary>
	/// Graph representing possible token pairs (bigrams) at each start offset in the sentence.
	/// <para>
	/// For each start offset, a list of possible token pairs is stored.
	/// </para>
	/// @lucene.experimental
	/// </summary>
	internal class BiSegGraph
	{
	private readonly IDictionary<int, IList<SegTokenPair>> tokenPairListTable = new Dictionary<int, IList<SegTokenPair>>(); // LUCENENET: marked readonly

	private IList<SegToken> segTokenList;

	private static readonly BigramDictionary bigramDict = BigramDictionary.GetInstance(); // LUCENENET: marked readonly

	public BiSegGraph(SegGraph segGraph)
	{
	segTokenList = segGraph.MakeIndex();
	GenerateBiSegGraph(segGraph);
	}

	/// <summary>
	/// Generate a <see cref="BiSegGraph"/> based upon a <see cref="SegGraph"/>
	/// </summary>
	private void GenerateBiSegGraph(SegGraph segGraph)
	{
	double smooth = 0.1;
	int wordPairFreq; // LUCENENET: IDE0059: Remove unnecessary value assignment
	int maxStart = segGraph.MaxStart;
	double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;

	int next;
	char[] idBuffer;
	// get the list of tokens ordered and indexed
	segTokenList = segGraph.MakeIndex();
	// Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1
	int key = -1;
	IList<SegToken> nextTokens; // LUCENENET: IDE0059: Remove unnecessary value assignment
	while (key < maxStart)
	{
	if (segGraph.IsStartExist(key))
	{

	IList<SegToken> tokenList = segGraph.GetStartList(key);

	// Calculate all tokens for a given key.
	foreach (SegToken t1 in tokenList)
	{
	oneWordFreq = t1.Weight;
	next = t1.EndOffset;
	nextTokens = null;
	// Find the next corresponding Token.
	// For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore".
	// If we cannot find the next Token, then go to the end and repeat the same cycle.
	while (next <= maxStart)
	{
	// Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken.
	if (segGraph.IsStartExist(next))
	{
	nextTokens = segGraph.GetStartList(next);
	break;
	}
	next++;
	}
	if (nextTokens == null)
	{
	break;
	}
	foreach (SegToken t2 in nextTokens)
	{
	idBuffer = new char[t1.CharArray.Length + t2.CharArray.Length + 1];
	System.Array.Copy(t1.CharArray, 0, idBuffer, 0, t1.CharArray.Length);
	idBuffer[t1.CharArray.Length] = BigramDictionary.WORD_SEGMENT_CHAR;
	System.Array.Copy(t2.CharArray, 0, idBuffer,
	t1.CharArray.Length + 1, t2.CharArray.Length);

	// Two linked Words frequency
	wordPairFreq = bigramDict.GetFrequency(idBuffer);

	// Smoothing

	// -log{a*P(Ci-1)+(1-a)P(Ci\|Ci-1)} Note 0<a<1
	weight = -Math
	.Log(smooth
	* (1.0 + oneWordFreq)
	/ (Utility.MAX_FREQUENCE + 0.0)
	+ (1.0 - smooth)
	* ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));

	SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.Index,
	t2.Index, weight);
	this.AddSegTokenPair(tokenPair);
	}
	}
	}
	key++;
	}

	}

	/// <summary>
	/// Returns <c>true</c> if their is a list of token pairs at this offset (index of the second token)
	/// </summary>
	/// <param name="to">index of the second token in the token pair</param>
	/// <returns><c>true</c> if a token pair exists</returns>
	public virtual bool IsToExist(int to)
	{
	return tokenPairListTable.TryGetValue(to, out IList<SegTokenPair> result) && result != null;
	}

	/// <summary>
	/// Return a <see cref="T:IList{SegTokenPair}"/> of all token pairs at this offset (index of the second token)
	/// </summary>
	/// <param name="to">index of the second token in the token pair</param>
	/// <returns><see cref="T:IList{SegTokenPair}"/> of token pairs. </returns>
	public virtual IList<SegTokenPair> GetToList(int to)
	{
	tokenPairListTable.TryGetValue(to, out IList<SegTokenPair> result);
	return result;
	}

	/// <summary>
	/// Add a <see cref="SegTokenPair"/>
	/// </summary>
	/// <param name="tokenPair"><see cref="SegTokenPair"/></param>
	public virtual void AddSegTokenPair(SegTokenPair tokenPair)
	{
	int to = tokenPair.To;
	if (!IsToExist(to))
	{
	List<SegTokenPair> newlist = new List<SegTokenPair>
	{
	tokenPair
	};
	tokenPairListTable[to] = newlist;
	}
	else
	{
	IList<SegTokenPair> tokenPairList = tokenPairListTable[to];
	tokenPairList.Add(tokenPair);
	}
	}

	/// <summary>
	/// Get the number of <see cref="SegTokenPair"/> entries in the table.
	/// </summary>
	/// <returns>number of <see cref="SegTokenPair"/> entries</returns>
	public virtual int ToCount => tokenPairListTable.Count;

	/// <summary>
	/// Find the shortest path with the Viterbi algorithm.
	/// </summary>
	/// <returns><see cref="T:IList{SegToken}"/></returns>
	[ExceptionToNetNumericConvention]
	public virtual IList<SegToken> GetShortPath()
	{
	int current;
	int nodeCount = ToCount;
	IList<PathNode> path = new List<PathNode>();
	PathNode zeroPath = new PathNode();
	zeroPath.Weight = 0;
	zeroPath.PreNode = 0;
	path.Add(zeroPath);
	for (current = 1; current <= nodeCount; current++)
	{
	double weight;
	IList<SegTokenPair> edges = GetToList(current);

	double minWeight = double.MaxValue;
	SegTokenPair minEdge = null;
	foreach (SegTokenPair edge in edges)
	{
	weight = edge.Weight;
	PathNode preNode2 = path[edge.From];
	if (preNode2.Weight + weight < minWeight)
	{
	minWeight = preNode2.Weight + weight;
	minEdge = edge;
	}
	}
	PathNode newNode = new PathNode();
	newNode.Weight = minWeight;
	newNode.PreNode = minEdge.From;
	path.Add(newNode);
	}

	// Calculate PathNodes
	int preNode, lastNode;
	lastNode = path.Count - 1;
	current = lastNode;
	IList<int> rpath = new List<int>();
	IList<SegToken> resultPath = new List<SegToken>();

	rpath.Add(current);
	while (current != 0)
	{
	PathNode currentPathNode = path[current];
	preNode = currentPathNode.PreNode;
	rpath.Add(preNode);
	current = preNode;
	}
	for (int j = rpath.Count - 1; j >= 0; j--)
	{
	//int idInteger = rpath.get(j);
	//int id = idInteger.intValue();
	int id = rpath[j];
	SegToken t = segTokenList[id];
	resultPath.Add(t);
	}
	return resultPath;
	}

	public override string ToString()
	{
	StringBuilder sb = new StringBuilder();
	ICollection<IList<SegTokenPair>> values = tokenPairListTable.Values;
	foreach (IList<SegTokenPair> segList in values)
	{
	foreach (SegTokenPair pair in segList)
	{
	sb.Append(pair + "\n");
	}
	}
	return sb.ToString();
	}
	}
	}