| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.cn.smart.hhmm; |
| |
| import java.util.ArrayList; |
| import java.util.Collection; |
| import java.util.HashMap; |
| import java.util.List; |
| import java.util.Map; |
| |
| import org.apache.lucene.analysis.cn.smart.Utility; |
| |
| /** |
| * Graph representing possible token pairs (bigrams) at each start offset in the sentence. |
| * <p> |
| * For each start offset, a list of possible token pairs is stored. |
| * </p> |
| * @lucene.experimental |
| */ |
| class BiSegGraph { |
| |
| private Map<Integer,ArrayList<SegTokenPair>> tokenPairListTable = new HashMap<>(); |
| |
| private List<SegToken> segTokenList; |
| |
| private static BigramDictionary bigramDict = BigramDictionary.getInstance(); |
| |
| public BiSegGraph(SegGraph segGraph) { |
| segTokenList = segGraph.makeIndex(); |
| generateBiSegGraph(segGraph); |
| } |
| |
| /* |
| * Generate a BiSegGraph based upon a SegGraph |
| */ |
| private void generateBiSegGraph(SegGraph segGraph) { |
| double smooth = 0.1; |
| int wordPairFreq = 0; |
| int maxStart = segGraph.getMaxStart(); |
| double oneWordFreq, weight, tinyDouble = 1.0 / Utility.MAX_FREQUENCE; |
| |
| int next; |
| char[] idBuffer; |
| // get the list of tokens ordered and indexed |
| segTokenList = segGraph.makeIndex(); |
| // Because the beginning position of startToken is -1, therefore startToken can be obtained when key = -1 |
| int key = -1; |
| List<SegToken> nextTokens = null; |
| while (key < maxStart) { |
| if (segGraph.isStartExist(key)) { |
| |
| List<SegToken> tokenList = segGraph.getStartList(key); |
| |
| // Calculate all tokens for a given key. |
| for (SegToken t1 : tokenList) { |
| oneWordFreq = t1.weight; |
| next = t1.endOffset; |
| nextTokens = null; |
| // Find the next corresponding Token. |
| // For example: "Sunny seashore", the present Token is "sunny", next one should be "sea" or "seashore". |
| // If we cannot find the next Token, then go to the end and repeat the same cycle. |
| while (next <= maxStart) { |
| // Because the beginning position of endToken is sentenceLen, so equal to sentenceLen can find endToken. |
| if (segGraph.isStartExist(next)) { |
| nextTokens = segGraph.getStartList(next); |
| break; |
| } |
| next++; |
| } |
| if (nextTokens == null) { |
| break; |
| } |
| for (SegToken t2 : nextTokens) { |
| idBuffer = new char[t1.charArray.length + t2.charArray.length + 1]; |
| System.arraycopy(t1.charArray, 0, idBuffer, 0, t1.charArray.length); |
| idBuffer[t1.charArray.length] = BigramDictionary.WORD_SEGMENT_CHAR; |
| System.arraycopy(t2.charArray, 0, idBuffer, |
| t1.charArray.length + 1, t2.charArray.length); |
| |
| // Two linked Words frequency |
| wordPairFreq = bigramDict.getFrequency(idBuffer); |
| |
| // Smoothing |
| |
| // -log{a*P(Ci-1)+(1-a)P(Ci|Ci-1)} Note 0<a<1 |
| weight = -Math |
| .log(smooth |
| * (1.0 + oneWordFreq) |
| / (Utility.MAX_FREQUENCE + 0.0) |
| + (1.0 - smooth) |
| * ((1.0 - tinyDouble) * wordPairFreq / (1.0 + oneWordFreq) + tinyDouble)); |
| |
| SegTokenPair tokenPair = new SegTokenPair(idBuffer, t1.index, |
| t2.index, weight); |
| this.addSegTokenPair(tokenPair); |
| } |
| } |
| } |
| key++; |
| } |
| |
| } |
| |
| /** |
| * Returns true if their is a list of token pairs at this offset (index of the second token) |
| * |
| * @param to index of the second token in the token pair |
| * @return true if a token pair exists |
| */ |
| public boolean isToExist(int to) { |
| return tokenPairListTable.get(Integer.valueOf(to)) != null; |
| } |
| |
| /** |
| * Return a {@link List} of all token pairs at this offset (index of the second token) |
| * |
| * @param to index of the second token in the token pair |
| * @return {@link List} of token pairs. |
| */ |
| public List<SegTokenPair> getToList(int to) { |
| return tokenPairListTable.get(to); |
| } |
| |
| /** |
| * Add a {@link SegTokenPair} |
| * |
| * @param tokenPair {@link SegTokenPair} |
| */ |
| public void addSegTokenPair(SegTokenPair tokenPair) { |
| int to = tokenPair.to; |
| if (!isToExist(to)) { |
| ArrayList<SegTokenPair> newlist = new ArrayList<>(); |
| newlist.add(tokenPair); |
| tokenPairListTable.put(to, newlist); |
| } else { |
| List<SegTokenPair> tokenPairList = tokenPairListTable.get(to); |
| tokenPairList.add(tokenPair); |
| } |
| } |
| |
| /** |
| * Get the number of {@link SegTokenPair} entries in the table. |
| * @return number of {@link SegTokenPair} entries |
| */ |
| public int getToCount() { |
| return tokenPairListTable.size(); |
| } |
| |
| /** |
| * Find the shortest path with the Viterbi algorithm. |
| * @return {@link List} |
| */ |
| public List<SegToken> getShortPath() { |
| int current; |
| int nodeCount = getToCount(); |
| List<PathNode> path = new ArrayList<>(); |
| PathNode zeroPath = new PathNode(); |
| zeroPath.weight = 0; |
| zeroPath.preNode = 0; |
| path.add(zeroPath); |
| for (current = 1; current <= nodeCount; current++) { |
| double weight; |
| List<SegTokenPair> edges = getToList(current); |
| |
| double minWeight = Double.MAX_VALUE; |
| SegTokenPair minEdge = null; |
| for (SegTokenPair edge : edges) { |
| weight = edge.weight; |
| PathNode preNode = path.get(edge.from); |
| if (preNode.weight + weight < minWeight) { |
| minWeight = preNode.weight + weight; |
| minEdge = edge; |
| } |
| } |
| PathNode newNode = new PathNode(); |
| newNode.weight = minWeight; |
| newNode.preNode = minEdge.from; |
| path.add(newNode); |
| } |
| |
| // Calculate PathNodes |
| int preNode, lastNode; |
| lastNode = path.size() - 1; |
| current = lastNode; |
| List<Integer> rpath = new ArrayList<>(); |
| List<SegToken> resultPath = new ArrayList<>(); |
| |
| rpath.add(current); |
| while (current != 0) { |
| PathNode currentPathNode = path.get(current); |
| preNode = currentPathNode.preNode; |
| rpath.add(Integer.valueOf(preNode)); |
| current = preNode; |
| } |
| for (int j = rpath.size() - 1; j >= 0; j--) { |
| Integer idInteger = rpath.get(j); |
| int id = idInteger.intValue(); |
| SegToken t = segTokenList.get(id); |
| resultPath.add(t); |
| } |
| return resultPath; |
| |
| } |
| |
| @Override |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| Collection<ArrayList<SegTokenPair>> values = tokenPairListTable.values(); |
| for (ArrayList<SegTokenPair> segList : values) { |
| for (SegTokenPair pair : segList) { |
| sb.append(pair).append("\n"); |
| } |
| } |
| return sb.toString(); |
| } |
| |
| } |