| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.cn.smart.hhmm; |
| |
| import java.io.DataInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.ObjectInputStream; |
| import java.io.ObjectOutputStream; |
| import java.nio.ByteBuffer; |
| import java.nio.ByteOrder; |
| import java.nio.file.Files; |
| import java.nio.file.Path; |
| import java.nio.file.Paths; |
| |
| import org.apache.lucene.analysis.cn.smart.AnalyzerProfile; |
| import org.apache.lucene.analysis.cn.smart.Utility; |
| import org.apache.lucene.util.SuppressForbidden; |
| |
| /** |
| * SmartChineseAnalyzer Word Dictionary |
| * @lucene.experimental |
| */ |
| class WordDictionary extends AbstractDictionary { |
| |
| private WordDictionary() { |
| } |
| |
| private static WordDictionary singleInstance; |
| |
| /** |
| * Large prime number for hash function |
| */ |
| public static final int PRIME_INDEX_LENGTH = 12071; |
| |
| /** |
| * wordIndexTable guarantees to hash all Chinese characters in Unicode into |
| * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this |
| * program only handles the 6768 characters found in GB2312 plus some |
| * ASCII characters. Therefore in order to guarantee better precision, it is |
| * necessary to retain the original symbol in the charIndexTable. |
| */ |
| private short[] wordIndexTable; |
| |
| private char[] charIndexTable; |
| |
| /** |
| * To avoid taking too much space, the data structure needed to store the |
| * lexicon requires two multidimensional arrays to store word and frequency. |
| * Each word is placed in a char[]. Each char represents a Chinese char or |
| * other symbol. Each frequency is put into an int. These two arrays |
| * correspond to each other one-to-one. Therefore, one can use |
| * wordItem_charArrayTable[i][j] to look up word from lexicon, and |
| * wordItem_frequencyTable[i][j] to look up the corresponding frequency. |
| */ |
| private char[][][] wordItem_charArrayTable; |
| |
| private int[][] wordItem_frequencyTable; |
| |
| // static Logger log = Logger.getLogger(WordDictionary.class); |
| |
| /** |
| * Get the singleton dictionary instance. |
| * @return singleton |
| */ |
| public synchronized static WordDictionary getInstance() { |
| if (singleInstance == null) { |
| singleInstance = new WordDictionary(); |
| try { |
| singleInstance.load(); |
| } catch (IOException e) { |
| String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR; |
| singleInstance.load(wordDictRoot); |
| } catch (ClassNotFoundException e) { |
| throw new RuntimeException(e); |
| } |
| } |
| return singleInstance; |
| } |
| |
| /** |
| * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct |
| * |
| * @param dctFileRoot path to dictionary directory |
| */ |
| public void load(String dctFileRoot) { |
| String dctFilePath = dctFileRoot + "/coredict.dct"; |
| Path serialObj = Paths.get(dctFileRoot + "/coredict.mem"); |
| |
| if (Files.exists(serialObj) && loadFromObj(serialObj)) { |
| |
| } else { |
| try { |
| wordIndexTable = new short[PRIME_INDEX_LENGTH]; |
| charIndexTable = new char[PRIME_INDEX_LENGTH]; |
| for (int i = 0; i < PRIME_INDEX_LENGTH; i++) { |
| charIndexTable[i] = 0; |
| wordIndexTable[i] = -1; |
| } |
| wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][]; |
| wordItem_frequencyTable = new int[GB2312_CHAR_NUM][]; |
| // int total = |
| loadMainDataFromFile(dctFilePath); |
| expandDelimiterData(); |
| mergeSameWords(); |
| sortEachItems(); |
| // log.info("load dictionary: " + dctFilePath + " total:" + total); |
| } catch (IOException e) { |
| throw new RuntimeException(e.getMessage()); |
| } |
| |
| saveToObj(serialObj); |
| } |
| |
| } |
| |
| /** |
| * Load coredict.mem internally from the jar file. |
| * |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| public void load() throws IOException, ClassNotFoundException { |
| InputStream input = this.getClass().getResourceAsStream("coredict.mem"); |
| loadFromObjectInputStream(input); |
| } |
| |
| private boolean loadFromObj(Path serialObj) { |
| try { |
| loadFromObjectInputStream(Files.newInputStream(serialObj)); |
| return true; |
| } catch (Exception e) { |
| throw new RuntimeException(e); |
| } |
| } |
| |
| @SuppressForbidden(reason = "TODO: fix code to serialize its own dictionary vs. a binary blob in the codebase") |
| private void loadFromObjectInputStream(InputStream serialObjectInputStream) |
| throws IOException, ClassNotFoundException { |
| try (ObjectInputStream input = new ObjectInputStream(serialObjectInputStream)) { |
| wordIndexTable = (short[]) input.readObject(); |
| charIndexTable = (char[]) input.readObject(); |
| wordItem_charArrayTable = (char[][][]) input.readObject(); |
| wordItem_frequencyTable = (int[][]) input.readObject(); |
| // log.info("load core dict from serialization."); |
| } |
| } |
| |
| @SuppressForbidden(reason = "TODO: fix code to serialize its own dictionary vs. a binary blob in the codebase") |
| private void saveToObj(Path serialObj) { |
| try (ObjectOutputStream output = new ObjectOutputStream(Files.newOutputStream(serialObj))) { |
| output.writeObject(wordIndexTable); |
| output.writeObject(charIndexTable); |
| output.writeObject(wordItem_charArrayTable); |
| output.writeObject(wordItem_frequencyTable); |
| // log.info("serialize core dict."); |
| } catch (Exception e) { |
| // log.warn(e.getMessage()); |
| } |
| } |
| |
| /** |
| * Load the datafile into this WordDictionary |
| * |
| * @param dctFilePath path to word dictionary (coredict.dct) |
| * @return number of words read |
| * @throws IOException If there is a low-level I/O error. |
| */ |
| private int loadMainDataFromFile(String dctFilePath) throws IOException { |
| int i, cnt, length, total = 0; |
| // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760. |
| // The 3756th is used (as a header) to store information. |
| int[] buffer = new int[3]; |
| byte[] intBuffer = new byte[4]; |
| String tmpword; |
| DataInputStream dctFile = new DataInputStream(Files.newInputStream(Paths.get(dctFilePath))); |
| |
| // GB2312 characters 0 - 6768 |
| for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { |
| // if (i == 5231) |
| // System.out.println(i); |
| |
| dctFile.read(intBuffer); |
| // the dictionary was developed for C, and byte order must be converted to work with Java |
| cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt(); |
| if (cnt <= 0) { |
| wordItem_charArrayTable[i] = null; |
| wordItem_frequencyTable[i] = null; |
| continue; |
| } |
| wordItem_charArrayTable[i] = new char[cnt][]; |
| wordItem_frequencyTable[i] = new int[cnt]; |
| total += cnt; |
| int j = 0; |
| while (j < cnt) { |
| // wordItemTable[i][j] = new WordItem(); |
| dctFile.read(intBuffer); |
| buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) |
| .getInt();// frequency |
| dctFile.read(intBuffer); |
| buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) |
| .getInt();// length |
| dctFile.read(intBuffer); |
| buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN) |
| .getInt();// handle |
| |
| // wordItemTable[i][j].frequency = buffer[0]; |
| wordItem_frequencyTable[i][j] = buffer[0]; |
| |
| length = buffer[1]; |
| if (length > 0) { |
| byte[] lchBuffer = new byte[length]; |
| dctFile.read(lchBuffer); |
| tmpword = new String(lchBuffer, "GB2312"); |
| // indexTable[i].wordItems[j].word = tmpword; |
| // wordItemTable[i][j].charArray = tmpword.toCharArray(); |
| wordItem_charArrayTable[i][j] = tmpword.toCharArray(); |
| } else { |
| // wordItemTable[i][j].charArray = null; |
| wordItem_charArrayTable[i][j] = null; |
| } |
| // System.out.println(indexTable[i].wordItems[j]); |
| j++; |
| } |
| |
| String str = getCCByGB2312Id(i); |
| setTableIndex(str.charAt(0), i); |
| } |
| dctFile.close(); |
| return total; |
| } |
| |
| /** |
| * The original lexicon puts all information with punctuation into a |
| * chart (from 1 to 3755). Here it then gets expanded, separately being |
| * placed into the chart that has the corresponding symbol. |
| */ |
| private void expandDelimiterData() { |
| int i; |
| int cnt; |
| // Punctuation then treating index 3755 as 1, |
| // distribute the original punctuation corresponding dictionary into |
| int delimiterIndex = 3755 + GB2312_FIRST_CHAR; |
| i = 0; |
| while (i < wordItem_charArrayTable[delimiterIndex].length) { |
| char c = wordItem_charArrayTable[delimiterIndex][i][0]; |
| int j = getGB2312Id(c);// the id value of the punctuation |
| if (wordItem_charArrayTable[j] == null) { |
| |
| int k = i; |
| // Starting from i, count the number of the following worditem symbol from j |
| while (k < wordItem_charArrayTable[delimiterIndex].length |
| && wordItem_charArrayTable[delimiterIndex][k][0] == c) { |
| k++; |
| } |
| // c is the punctuation character, j is the id value of c |
| // k-1 represents the index of the last punctuation character |
| cnt = k - i; |
| if (cnt != 0) { |
| wordItem_charArrayTable[j] = new char[cnt][]; |
| wordItem_frequencyTable[j] = new int[cnt]; |
| } |
| |
| // Assign value for each wordItem. |
| for (k = 0; k < cnt; k++, i++) { |
| // wordItemTable[j][k] = new WordItem(); |
| wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i]; |
| wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1]; |
| System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1, |
| wordItem_charArrayTable[j][k], 0, |
| wordItem_charArrayTable[j][k].length); |
| } |
| setTableIndex(c, j); |
| } |
| } |
| // Delete the original corresponding symbol array. |
| wordItem_charArrayTable[delimiterIndex] = null; |
| wordItem_frequencyTable[delimiterIndex] = null; |
| } |
| |
| /* |
| * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS) |
| */ |
| private void mergeSameWords() { |
| int i; |
| for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) { |
| if (wordItem_charArrayTable[i] == null) |
| continue; |
| int len = 1; |
| for (int j = 1; j < wordItem_charArrayTable[i].length; j++) { |
| if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, |
| wordItem_charArrayTable[i][j - 1], 0) != 0) |
| len++; |
| |
| } |
| if (len < wordItem_charArrayTable[i].length) { |
| char[][] tempArray = new char[len][]; |
| int[] tempFreq = new int[len]; |
| int k = 0; |
| tempArray[0] = wordItem_charArrayTable[i][0]; |
| tempFreq[0] = wordItem_frequencyTable[i][0]; |
| for (int j = 1; j < wordItem_charArrayTable[i].length; j++) { |
| if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, |
| tempArray[k], 0) != 0) { |
| k++; |
| // temp[k] = wordItemTable[i][j]; |
| tempArray[k] = wordItem_charArrayTable[i][j]; |
| tempFreq[k] = wordItem_frequencyTable[i][j]; |
| } else { |
| // temp[k].frequency += wordItemTable[i][j].frequency; |
| tempFreq[k] += wordItem_frequencyTable[i][j]; |
| } |
| } |
| // wordItemTable[i] = temp; |
| wordItem_charArrayTable[i] = tempArray; |
| wordItem_frequencyTable[i] = tempFreq; |
| } |
| } |
| } |
| |
| private void sortEachItems() { |
| char[] tmpArray; |
| int tmpFreq; |
| for (int i = 0; i < wordItem_charArrayTable.length; i++) { |
| if (wordItem_charArrayTable[i] != null |
| && wordItem_charArrayTable[i].length > 1) { |
| for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) { |
| for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) { |
| if (Utility.compareArray(wordItem_charArrayTable[i][j], 0, |
| wordItem_charArrayTable[i][j2], 0) > 0) { |
| tmpArray = wordItem_charArrayTable[i][j]; |
| tmpFreq = wordItem_frequencyTable[i][j]; |
| wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2]; |
| wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2]; |
| wordItem_charArrayTable[i][j2] = tmpArray; |
| wordItem_frequencyTable[i][j2] = tmpFreq; |
| } |
| } |
| } |
| } |
| } |
| } |
| |
| /* |
| * Calculate character c's position in hash table, |
| * then initialize the value of that position in the address table. |
| */ |
| private boolean setTableIndex(char c, int j) { |
| int index = getAvaliableTableIndex(c); |
| if (index != -1) { |
| charIndexTable[index] = c; |
| wordIndexTable[index] = (short) j; |
| return true; |
| } else |
| return false; |
| } |
| |
| private short getAvaliableTableIndex(char c) { |
| int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH); |
| int hash2 = hash2(c) % PRIME_INDEX_LENGTH; |
| if (hash1 < 0) |
| hash1 = PRIME_INDEX_LENGTH + hash1; |
| if (hash2 < 0) |
| hash2 = PRIME_INDEX_LENGTH + hash2; |
| int index = hash1; |
| int i = 1; |
| while (charIndexTable[index] != 0 && charIndexTable[index] != c |
| && i < PRIME_INDEX_LENGTH) { |
| index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; |
| i++; |
| } |
| // System.out.println(i - 1); |
| |
| if (i < PRIME_INDEX_LENGTH |
| && (charIndexTable[index] == 0 || charIndexTable[index] == c)) { |
| return (short) index; |
| } else |
| return -1; |
| } |
| |
| private short getWordItemTableIndex(char c) { |
| int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH); |
| int hash2 = hash2(c) % PRIME_INDEX_LENGTH; |
| if (hash1 < 0) |
| hash1 = PRIME_INDEX_LENGTH + hash1; |
| if (hash2 < 0) |
| hash2 = PRIME_INDEX_LENGTH + hash2; |
| int index = hash1; |
| int i = 1; |
| while (charIndexTable[index] != 0 && charIndexTable[index] != c |
| && i < PRIME_INDEX_LENGTH) { |
| index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH; |
| i++; |
| } |
| |
| if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) { |
| return (short) index; |
| } else |
| return -1; |
| } |
| |
| /** |
| * Look up the text string corresponding with the word char array, |
| * and return the position of the word list. |
| * |
| * @param knownHashIndex already figure out position of the first word |
| * symbol charArray[0] in hash table. If not calculated yet, can be |
| * replaced with function int findInTable(char[] charArray). |
| * @param charArray look up the char array corresponding with the word. |
| * @return word location in word array. If not found, then return -1. |
| */ |
| private int findInTable(short knownHashIndex, char[] charArray) { |
| if (charArray == null || charArray.length == 0) |
| return -1; |
| |
| char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]]; |
| int start = 0, end = items.length - 1; |
| int mid = (start + end) / 2, cmpResult; |
| |
| // Binary search for the index of idArray |
| while (start <= end) { |
| cmpResult = Utility.compareArray(items[mid], 0, charArray, 1); |
| |
| if (cmpResult == 0) |
| return mid;// find it |
| else if (cmpResult < 0) |
| start = mid + 1; |
| else if (cmpResult > 0) |
| end = mid - 1; |
| |
| mid = (start + end) / 2; |
| } |
| return -1; |
| } |
| |
| /** |
| * Find the first word in the dictionary that starts with the supplied prefix |
| * |
| * @see #getPrefixMatch(char[], int) |
| * @param charArray input prefix |
| * @return index of word, or -1 if not found |
| */ |
| public int getPrefixMatch(char[] charArray) { |
| return getPrefixMatch(charArray, 0); |
| } |
| |
| /** |
| * Find the nth word in the dictionary that starts with the supplied prefix |
| * |
| * @see #getPrefixMatch(char[]) |
| * @param charArray input prefix |
| * @param knownStart relative position in the dictionary to start |
| * @return index of word, or -1 if not found |
| */ |
| public int getPrefixMatch(char[] charArray, int knownStart) { |
| short index = getWordItemTableIndex(charArray[0]); |
| if (index == -1) |
| return -1; |
| char[][] items = wordItem_charArrayTable[wordIndexTable[index]]; |
| int start = knownStart, end = items.length - 1; |
| |
| int mid = (start + end) / 2, cmpResult; |
| |
| // Binary search for the index of idArray |
| while (start <= end) { |
| cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0); |
| if (cmpResult == 0) { |
| // Get the first item which match the current word |
| while (mid >= 0 |
| && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0) |
| mid--; |
| mid++; |
| return mid;// Find the first word that uses charArray as prefix. |
| } else if (cmpResult < 0) |
| end = mid - 1; |
| else |
| start = mid + 1; |
| mid = (start + end) / 2; |
| } |
| return -1; |
| } |
| |
| /** |
| * Get the frequency of a word from the dictionary |
| * |
| * @param charArray input word |
| * @return word frequency, or zero if the word is not found |
| */ |
| public int getFrequency(char[] charArray) { |
| short hashIndex = getWordItemTableIndex(charArray[0]); |
| if (hashIndex == -1) |
| return 0; |
| int itemIndex = findInTable(hashIndex, charArray); |
| if (itemIndex != -1) |
| return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex]; |
| return 0; |
| |
| } |
| |
| /** |
| * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray |
| * |
| * @param charArray input word |
| * @param itemIndex item index for table charArray[0] |
| * @return true if the entry exists |
| */ |
| public boolean isEqual(char[] charArray, int itemIndex) { |
| short hashIndex = getWordItemTableIndex(charArray[0]); |
| return Utility.compareArray(charArray, 1, |
| wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0; |
| } |
| |
| } |