blob: deb8c81057d0d78b1559f7ce642b5de18953c02b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.analysis.cn.smart.hhmm;
import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.ObjectInputStream;
import java.io.ObjectOutputStream;
import java.nio.ByteBuffer;
import java.nio.ByteOrder;
import java.nio.file.Files;
import java.nio.file.Path;
import java.nio.file.Paths;
import org.apache.lucene.analysis.cn.smart.AnalyzerProfile;
import org.apache.lucene.analysis.cn.smart.Utility;
import org.apache.lucene.util.SuppressForbidden;
/**
* SmartChineseAnalyzer Word Dictionary
* @lucene.experimental
*/
class WordDictionary extends AbstractDictionary {

  private WordDictionary() {
  }

  // Lazily-created singleton; guarded by the class lock in getInstance().
  private static WordDictionary singleInstance;

  /**
   * Large prime number for hash function
   */
  public static final int PRIME_INDEX_LENGTH = 12071;

  /**
   * wordIndexTable guarantees to hash all Chinese characters in Unicode into
   * PRIME_INDEX_LENGTH array. There will be conflict, but in reality this
   * program only handles the 6768 characters found in GB2312 plus some
   * ASCII characters. Therefore in order to guarantee better precision, it is
   * necessary to retain the original symbol in the charIndexTable.
   */
  private short[] wordIndexTable;

  private char[] charIndexTable;

  /**
   * To avoid taking too much space, the data structure needed to store the
   * lexicon requires two multidimensional arrays to store word and frequency.
   * Each word is placed in a char[]. Each char represents a Chinese char or
   * other symbol. Each frequency is put into an int. These two arrays
   * correspond to each other one-to-one. Therefore, one can use
   * wordItem_charArrayTable[i][j] to look up word from lexicon, and
   * wordItem_frequencyTable[i][j] to look up the corresponding frequency.
   */
  private char[][][] wordItem_charArrayTable;

  private int[][] wordItem_frequencyTable;

  /**
   * Get the singleton dictionary instance. First tries the serialized
   * dictionary bundled in the jar; on I/O failure falls back to loading from
   * {@link AnalyzerProfile#ANALYSIS_DATA_DIR}.
   *
   * @return singleton
   */
  public static synchronized WordDictionary getInstance() {
    if (singleInstance == null) {
      singleInstance = new WordDictionary();
      try {
        singleInstance.load();
      } catch (IOException e) {
        // Bundled coredict.mem unavailable or unreadable: fall back to the
        // configured analysis data directory.
        String wordDictRoot = AnalyzerProfile.ANALYSIS_DATA_DIR;
        singleInstance.load(wordDictRoot);
      } catch (ClassNotFoundException e) {
        throw new RuntimeException(e);
      }
    }
    return singleInstance;
  }

  /**
   * Attempt to load dictionary from provided directory, first trying coredict.mem, failing back on coredict.dct
   *
   * @param dctFileRoot path to dictionary directory
   */
  public void load(String dctFileRoot) {
    String dctFilePath = dctFileRoot + "/coredict.dct";
    Path serialObj = Paths.get(dctFileRoot + "/coredict.mem");
    if (Files.exists(serialObj) && loadFromObj(serialObj)) {
      // Loaded the pre-serialized form; nothing more to do.
    } else {
      try {
        wordIndexTable = new short[PRIME_INDEX_LENGTH];
        charIndexTable = new char[PRIME_INDEX_LENGTH];
        for (int i = 0; i < PRIME_INDEX_LENGTH; i++) {
          charIndexTable[i] = 0;
          wordIndexTable[i] = -1;
        }
        wordItem_charArrayTable = new char[GB2312_CHAR_NUM][][];
        wordItem_frequencyTable = new int[GB2312_CHAR_NUM][];
        loadMainDataFromFile(dctFilePath);
        expandDelimiterData();
        mergeSameWords();
        sortEachItems();
      } catch (IOException e) {
        // Preserve the cause so callers see the underlying I/O failure,
        // not just its message.
        throw new RuntimeException(e);
      }
      // Cache the parsed dictionary so subsequent loads take the fast path.
      saveToObj(serialObj);
    }
  }

  /**
   * Load coredict.mem internally from the jar file.
   *
   * @throws IOException If there is a low-level I/O error, or if the
   *         resource is missing from the classpath.
   */
  public void load() throws IOException, ClassNotFoundException {
    InputStream input = this.getClass().getResourceAsStream("coredict.mem");
    if (input == null) {
      // Surface a missing resource as the declared IOException (instead of a
      // NullPointerException) so getInstance() can fall back to the data dir.
      throw new IOException("coredict.mem resource not found on classpath");
    }
    loadFromObjectInputStream(input);
  }

  /**
   * Load the dictionary from a previously serialized file.
   *
   * @param serialObj path to coredict.mem
   * @return true on success (any failure is rethrown as RuntimeException)
   */
  private boolean loadFromObj(Path serialObj) {
    try {
      loadFromObjectInputStream(Files.newInputStream(serialObj));
      return true;
    } catch (Exception e) {
      throw new RuntimeException(e);
    }
  }

  @SuppressForbidden(reason = "TODO: fix code to serialize its own dictionary vs. a binary blob in the codebase")
  private void loadFromObjectInputStream(InputStream serialObjectInputStream)
      throws IOException, ClassNotFoundException {
    // The four arrays must be read in exactly the order saveToObj wrote them.
    try (ObjectInputStream input = new ObjectInputStream(serialObjectInputStream)) {
      wordIndexTable = (short[]) input.readObject();
      charIndexTable = (char[]) input.readObject();
      wordItem_charArrayTable = (char[][][]) input.readObject();
      wordItem_frequencyTable = (int[][]) input.readObject();
    }
  }

  @SuppressForbidden(reason = "TODO: fix code to serialize its own dictionary vs. a binary blob in the codebase")
  private void saveToObj(Path serialObj) {
    try (ObjectOutputStream output = new ObjectOutputStream(Files.newOutputStream(serialObj))) {
      output.writeObject(wordIndexTable);
      output.writeObject(charIndexTable);
      output.writeObject(wordItem_charArrayTable);
      output.writeObject(wordItem_frequencyTable);
    } catch (Exception ignored) {
      // Best-effort cache write: the dictionary was already loaded from the
      // .dct file, so a failure to persist the .mem cache is non-fatal.
    }
  }

  /**
   * Load the datafile into this WordDictionary
   *
   * @param dctFilePath path to word dictionary (coredict.dct)
   * @return number of words read
   * @throws IOException If there is a low-level I/O error, or the file is
   *         truncated (EOFException from readFully).
   */
  private int loadMainDataFromFile(String dctFilePath) throws IOException {
    int i, cnt, length, total = 0;
    // The file only counted 6763 Chinese characters plus 5 reserved slots 3756~3760.
    // The 3756th is used (as a header) to store information.
    int[] buffer = new int[3];
    byte[] intBuffer = new byte[4];
    String tmpword;
    // try-with-resources: the original closed the stream only on the success
    // path, leaking the file handle if parsing threw.
    try (DataInputStream dctFile =
        new DataInputStream(Files.newInputStream(Paths.get(dctFilePath)))) {
      // GB2312 characters 0 - 6768
      for (i = GB2312_FIRST_CHAR; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
        // readFully (not read) guarantees all 4 bytes are consumed even if
        // the underlying stream returns a short read.
        dctFile.readFully(intBuffer);
        // the dictionary was developed for C, and byte order must be converted to work with Java
        cnt = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN).getInt();
        if (cnt <= 0) {
          wordItem_charArrayTable[i] = null;
          wordItem_frequencyTable[i] = null;
          continue;
        }
        wordItem_charArrayTable[i] = new char[cnt][];
        wordItem_frequencyTable[i] = new int[cnt];
        total += cnt;
        int j = 0;
        while (j < cnt) {
          dctFile.readFully(intBuffer);
          buffer[0] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
              .getInt();// frequency
          dctFile.readFully(intBuffer);
          buffer[1] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
              .getInt();// length
          dctFile.readFully(intBuffer);
          buffer[2] = ByteBuffer.wrap(intBuffer).order(ByteOrder.LITTLE_ENDIAN)
              .getInt();// handle
          wordItem_frequencyTable[i][j] = buffer[0];
          length = buffer[1];
          if (length > 0) {
            byte[] lchBuffer = new byte[length];
            dctFile.readFully(lchBuffer);
            tmpword = new String(lchBuffer, "GB2312");
            wordItem_charArrayTable[i][j] = tmpword.toCharArray();
          } else {
            wordItem_charArrayTable[i][j] = null;
          }
          j++;
        }
        String str = getCCByGB2312Id(i);
        setTableIndex(str.charAt(0), i);
      }
    }
    return total;
  }

  /**
   * The original lexicon puts all information with punctuation into a
   * chart (from 1 to 3755). Here it then gets expanded, separately being
   * placed into the chart that has the corresponding symbol.
   */
  private void expandDelimiterData() {
    int i;
    int cnt;
    // Punctuation then treating index 3755 as 1,
    // distribute the original punctuation corresponding dictionary into
    int delimiterIndex = 3755 + GB2312_FIRST_CHAR;
    i = 0;
    while (i < wordItem_charArrayTable[delimiterIndex].length) {
      char c = wordItem_charArrayTable[delimiterIndex][i][0];
      int j = getGB2312Id(c);// the id value of the punctuation
      // NOTE(review): if wordItem_charArrayTable[j] were already non-null, i
      // would never advance and this loop would not terminate. Entries in the
      // delimiter chart are grouped by leading character, so each target slot
      // j is seen at most once in well-formed data — TODO confirm invariant.
      if (wordItem_charArrayTable[j] == null) {
        int k = i;
        // Starting from i, count the number of the following worditem symbol from j
        while (k < wordItem_charArrayTable[delimiterIndex].length
            && wordItem_charArrayTable[delimiterIndex][k][0] == c) {
          k++;
        }
        // c is the punctuation character, j is the id value of c
        // k-1 represents the index of the last punctuation character
        cnt = k - i;
        if (cnt != 0) {
          wordItem_charArrayTable[j] = new char[cnt][];
          wordItem_frequencyTable[j] = new int[cnt];
        }
        // Assign value for each wordItem, stripping the leading punctuation
        // character (copy starts at offset 1).
        for (k = 0; k < cnt; k++, i++) {
          wordItem_frequencyTable[j][k] = wordItem_frequencyTable[delimiterIndex][i];
          wordItem_charArrayTable[j][k] = new char[wordItem_charArrayTable[delimiterIndex][i].length - 1];
          System.arraycopy(wordItem_charArrayTable[delimiterIndex][i], 1,
              wordItem_charArrayTable[j][k], 0,
              wordItem_charArrayTable[j][k].length);
        }
        setTableIndex(c, j);
      }
    }
    // Delete the original corresponding symbol array.
    wordItem_charArrayTable[delimiterIndex] = null;
    wordItem_frequencyTable[delimiterIndex] = null;
  }

  /*
   * since we aren't doing POS-tagging, merge the frequencies for entries of the same word (with different POS)
   */
  private void mergeSameWords() {
    int i;
    for (i = 0; i < GB2312_FIRST_CHAR + CHAR_NUM_IN_FILE; i++) {
      if (wordItem_charArrayTable[i] == null)
        continue;
      // First pass: count distinct adjacent words (input is grouped so equal
      // words are adjacent).
      int len = 1;
      for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
        if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
            wordItem_charArrayTable[i][j - 1], 0) != 0)
          len++;
      }
      if (len < wordItem_charArrayTable[i].length) {
        // Second pass: compact duplicates, summing their frequencies.
        char[][] tempArray = new char[len][];
        int[] tempFreq = new int[len];
        int k = 0;
        tempArray[0] = wordItem_charArrayTable[i][0];
        tempFreq[0] = wordItem_frequencyTable[i][0];
        for (int j = 1; j < wordItem_charArrayTable[i].length; j++) {
          if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
              tempArray[k], 0) != 0) {
            k++;
            tempArray[k] = wordItem_charArrayTable[i][j];
            tempFreq[k] = wordItem_frequencyTable[i][j];
          } else {
            tempFreq[k] += wordItem_frequencyTable[i][j];
          }
        }
        wordItem_charArrayTable[i] = tempArray;
        wordItem_frequencyTable[i] = tempFreq;
      }
    }
  }

  /**
   * Sort each per-character word list (with its parallel frequency list) so
   * later lookups can binary-search. Simple in-place selection-style sort;
   * lists are small so O(n^2) is acceptable here.
   */
  private void sortEachItems() {
    char[] tmpArray;
    int tmpFreq;
    for (int i = 0; i < wordItem_charArrayTable.length; i++) {
      if (wordItem_charArrayTable[i] != null
          && wordItem_charArrayTable[i].length > 1) {
        for (int j = 0; j < wordItem_charArrayTable[i].length - 1; j++) {
          for (int j2 = j + 1; j2 < wordItem_charArrayTable[i].length; j2++) {
            if (Utility.compareArray(wordItem_charArrayTable[i][j], 0,
                wordItem_charArrayTable[i][j2], 0) > 0) {
              // Swap both parallel arrays to keep word/frequency aligned.
              tmpArray = wordItem_charArrayTable[i][j];
              tmpFreq = wordItem_frequencyTable[i][j];
              wordItem_charArrayTable[i][j] = wordItem_charArrayTable[i][j2];
              wordItem_frequencyTable[i][j] = wordItem_frequencyTable[i][j2];
              wordItem_charArrayTable[i][j2] = tmpArray;
              wordItem_frequencyTable[i][j2] = tmpFreq;
            }
          }
        }
      }
    }
  }

  /*
   * Calculate character c's position in hash table,
   * then initialize the value of that position in the address table.
   */
  private boolean setTableIndex(char c, int j) {
    int index = getAvaliableTableIndex(c);
    if (index != -1) {
      charIndexTable[index] = c;
      wordIndexTable[index] = (short) j;
      return true;
    } else
      return false;
  }

  /**
   * Find a slot for character c using double hashing; a slot is usable if it
   * is empty (0) or already holds c.
   *
   * @return slot index, or -1 if the table is full
   */
  private short getAvaliableTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    // Probe until an empty slot, the character itself, or table exhaustion.
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    if (i < PRIME_INDEX_LENGTH
        && (charIndexTable[index] == 0 || charIndexTable[index] == c)) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Locate the hash-table slot already assigned to character c.
   *
   * @return slot index, or -1 if c is not in the table
   */
  private short getWordItemTableIndex(char c) {
    int hash1 = (int) (hash1(c) % PRIME_INDEX_LENGTH);
    int hash2 = hash2(c) % PRIME_INDEX_LENGTH;
    if (hash1 < 0)
      hash1 = PRIME_INDEX_LENGTH + hash1;
    if (hash2 < 0)
      hash2 = PRIME_INDEX_LENGTH + hash2;
    int index = hash1;
    int i = 1;
    while (charIndexTable[index] != 0 && charIndexTable[index] != c
        && i < PRIME_INDEX_LENGTH) {
      index = (hash1 + i * hash2) % PRIME_INDEX_LENGTH;
      i++;
    }
    // Unlike getAvaliableTableIndex, an empty slot means "not found" here.
    if (i < PRIME_INDEX_LENGTH && charIndexTable[index] == c) {
      return (short) index;
    } else
      return -1;
  }

  /**
   * Look up the text string corresponding with the word char array,
   * and return the position of the word list.
   *
   * @param knownHashIndex already figure out position of the first word
   *        symbol charArray[0] in hash table. If not calculated yet, can be
   *        replaced with function int findInTable(char[] charArray).
   * @param charArray look up the char array corresponding with the word.
   * @return word location in word array. If not found, then return -1.
   */
  private int findInTable(short knownHashIndex, char[] charArray) {
    if (charArray == null || charArray.length == 0)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[knownHashIndex]];
    int start = 0, end = items.length - 1;
    // >>> 1 avoids int overflow in the midpoint computation.
    int mid = (start + end) >>> 1, cmpResult;
    // Binary search; stored entries omit the leading character, hence the
    // offset 1 into charArray.
    while (start <= end) {
      cmpResult = Utility.compareArray(items[mid], 0, charArray, 1);
      if (cmpResult == 0)
        return mid;// find it
      else if (cmpResult < 0)
        start = mid + 1;
      else if (cmpResult > 0)
        end = mid - 1;
      mid = (start + end) >>> 1;
    }
    return -1;
  }

  /**
   * Find the first word in the dictionary that starts with the supplied prefix
   *
   * @see #getPrefixMatch(char[], int)
   * @param charArray input prefix
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray) {
    return getPrefixMatch(charArray, 0);
  }

  /**
   * Find the nth word in the dictionary that starts with the supplied prefix
   *
   * @see #getPrefixMatch(char[])
   * @param charArray input prefix
   * @param knownStart relative position in the dictionary to start
   * @return index of word, or -1 if not found
   */
  public int getPrefixMatch(char[] charArray, int knownStart) {
    short index = getWordItemTableIndex(charArray[0]);
    if (index == -1)
      return -1;
    char[][] items = wordItem_charArrayTable[wordIndexTable[index]];
    int start = knownStart, end = items.length - 1;
    int mid = (start + end) >>> 1, cmpResult;
    // Binary search for any prefix match, then scan backwards to the first.
    while (start <= end) {
      cmpResult = Utility.compareArrayByPrefix(charArray, 1, items[mid], 0);
      if (cmpResult == 0) {
        // Get the first item which match the current word
        while (mid >= 0
            && Utility.compareArrayByPrefix(charArray, 1, items[mid], 0) == 0)
          mid--;
        mid++;
        return mid;// Find the first word that uses charArray as prefix.
      } else if (cmpResult < 0)
        end = mid - 1;
      else
        start = mid + 1;
      mid = (start + end) >>> 1;
    }
    return -1;
  }

  /**
   * Get the frequency of a word from the dictionary
   *
   * @param charArray input word
   * @return word frequency, or zero if the word is not found
   */
  public int getFrequency(char[] charArray) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    if (hashIndex == -1)
      return 0;
    int itemIndex = findInTable(hashIndex, charArray);
    if (itemIndex != -1)
      return wordItem_frequencyTable[wordIndexTable[hashIndex]][itemIndex];
    return 0;
  }

  /**
   * Return true if the dictionary entry at itemIndex for table charArray[0] is charArray
   *
   * @param charArray input word
   * @param itemIndex item index for table charArray[0]
   * @return true if the entry exists
   */
  public boolean isEqual(char[] charArray, int itemIndex) {
    short hashIndex = getWordItemTableIndex(charArray[0]);
    return Utility.compareArray(charArray, 1,
        wordItem_charArrayTable[wordIndexTable[hashIndex]][itemIndex], 0) == 0;
  }
}