/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.util.ArrayList;
import java.util.Collection;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import org.apache.lucene.analysis.cn.smart.Utility;
/**
* Graph representing the possible token pairs (bigrams) in a sentence.
* <p>
* For each token, the list of {@link SegTokenPair}s ending at that token is stored, keyed by the
* index of the second token in the pair.
* </p>
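* <p>
* A typical usage, as driven by the HHMM segmenter (a sketch; {@code segGraph} is assumed to be an
* already-built {@link SegGraph} for the sentence):
* </p>
* <pre>
* BiSegGraph biSegGraph = new BiSegGraph(segGraph);         // build the bigram graph
* List&lt;SegToken&gt; bestPath = biSegGraph.getShortPath(); // tokens on the lowest-weight path
* </pre>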
* @lucene.experimental
*/
class BiSegGraph {
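// Token pairs keyed by the index of the second token ("to") in each pair.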
private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<>();
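// All tokens of the sentence, ordered and indexed by SegGraph.makeIndex().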
private List<SegToken> segTokenList;
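// Shared bigram frequency dictionary used to weight token pairs.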
private static BigramDictionary bigramDict = BigramDictionary.getInstance();
public BiSegGraph(SegGraph segGraph) {
segTokenList = segGraph.makeIndex();
generateBiSegGraph(segGraph);
}
/*
* Generate a BiSegGraph based upon a SegGraph
*/
private void generateBiSegGraph(SegGraph segGraph) {
double smooth = 0.1;
int wordPairFreq = 0;
int maxStart = segGraph.getMaxStart();
double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE;
int next;
char[] idBuffer;
// get the list of tokens ordered and indexed
segTokenList = segGraph.makeIndex();
// The start token's start offset is -1, so it can be retrieved with key = -1.
int key = -1;
List<SegToken> nextTokens = null;
while (key < maxStart) {
if (segGraph.isStartExist(key)) {
List<SegToken> tokenList = segGraph.getStartList(key);
// Process every token that starts at this offset, pairing it with its possible successors.
for (SegToken t1 : tokenList) {
oneWordFreq = t1.weight;
next = t1.endOffset;
nextTokens = null;
// Find the tokens that can follow the current one.
// For example, in "sunny seashore", if the current token is "sunny", the next token is "sea" or "seashore".
// If no token starts exactly at this offset, keep advancing until one is found or the end of the sentence is reached.
while (next <= maxStart) {
// The end token's start offset is sentenceLen, so next == sentenceLen will find the end token.
if (segGraph.isStartExist(next)) {
nextTokens = segGraph.getStartList(next);
break;
}
next++;
}
if (nextTokens == null) {
break;
}
for (SegToken t2 : nextTokens) {
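// Build the bigram lookup key: word1 + WORD_SEGMENT_CHAR + word2.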
idBuffer = new char[t1.charArray.length + t2.charArray.length + 1];
System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length);
idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR;
System.arraycopy(t2.charArray, 0, idBuffer, t1.charArray.length + 1, t2.charArray.length);
// Frequency of the two words appearing together (bigram frequency)
wordPairFreq = bigramDict.getFrequency(idBuffer);
// Smoothing
// -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1
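// where a = smooth, P(Ci-1) is estimated as (1 + oneWordFreq) / MAX_FREQUENCE,
// P(Ci|Ci-1) roughly as wordPairFreq / (1 + oneWordFreq), and tinyDouble keeps the
// argument of the log strictly positive when the bigram was never seen.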
weight = -Math.log(
    smooth * (1.0 + oneWordFreq) / (Utility.MAX_FREQUENCE + 0.0)
        + (1.0 - smooth)
            * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble));
SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index, t2.index, weight);
this.addSegTokenPair(tokenPair);
}
}
}
key++;
}
}
/**
* Returns true if there is a list of token pairs at this offset (index of the second token)
*
* @param to index of the second token in the token pair
* @return true if a token pair exists
*/
public boolean isToExist(int to) {
return tokenPairListTable.get(Integer.valueOf(to)) != null;
}
/**
* Return a {@link List} of all token pairs at this offset (index of the second token)
*
* @param to index of the second token in the token pair
* @return {@link List} of token pairs.
*/
public List<SegTokenPair> getToList(int to) {
return tokenPairListTable.get(to);
}
/**
* Add a {@link SegTokenPair}
*
* @param tokenPair {@link SegTokenPair}
*/
public void addSegTokenPair(SegTokenPair tokenPair) {
int to = tokenPair.to;
if (!isToExist(to)) {
ArrayList<SegTokenPair> newlist = new ArrayList<>();
newlist.add(tokenPair);
tokenPairListTable.put(to, newlist);
} else {
List<SegTokenPair> tokenPairList = tokenPairListTable.get(to);
tokenPairList.add(tokenPair);
}
}
/**
* Get the number of distinct "to" token indices (keys) in the table, i.e. the number of
* tokens that have at least one incoming {@link SegTokenPair}.
* @return number of distinct "to" indices
*/
public int getToCount() {
return tokenPairListTable.size();
}
/**
* Find the shortest path with the Viterbi algorithm.
* @return {@link List} of {@link SegToken} on the shortest path
*/
public List<SegToken> getShortPath() {
int current;
int nodeCount = getToCount();
List<PathNode> path = new ArrayList<>();
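// Node 0 is the path's origin (the sentence-start token): zero cumulative weight, predecessor itself.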
PathNode zeroPath = new PathNode();
zeroPath.weight = 0;
zeroPath.preNode = 0;
path.add(zeroPath);
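// Forward pass: for each node (token index) in order, examine all incoming edges and keep
// the predecessor that yields the minimum cumulative weight.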
for (current = 1; current <= nodeCount; current++) {
double weight;
List<SegTokenPair> edges = getToList(current);
double minWeight = Double.MAX_VALUE;
SegTokenPair minEdge = null;
for (SegTokenPair edge : edges) {
weight = edge.weight;
PathNode preNode = path.get(edge.from);
if (preNode.weight + weight < minWeight) {
minWeight = preNode.weight + weight;
minEdge = edge;
}
}
PathNode newNode = new PathNode();
newNode.weight = minWeight;
newNode.preNode = minEdge.from;
path.add(newNode);
}
// Backtrack from the last node through the preNode links to recover the best path, then reverse it into tokens.
int preNode, lastNode;
lastNode = path.size() - 1;
current = lastNode;
List<Integer> rpath = new ArrayList<>();
List<SegToken> resultPath = new ArrayList<>();
rpath.add(current);
while (current != 0) {
PathNode currentPathNode = path.get(current);
preNode = currentPathNode.preNode;
rpath.add(Integer.valueOf(preNode));
current = preNode;
}
for (int j = rpath.size() - 1; j >= 0; j--) {
Integer idInteger = rpath.get(j);
int id = idInteger.intValue();
SegToken t = segTokenList.get(id);
resultPath.add(t);
}
return resultPath;
}
@Override
public String toString() {
StringBuilder sb = new StringBuilder();
Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values();
for (ArrayList<SegTokenPair> segList : values) {
for (SegTokenPair pair : segList) {
sb.append(pair).append("\n");
}
}
return sb.toString();
}
}