src/joshua/decoder/ff/lm/buildin_lm/LMGrammarJAVA.java - joshua - Git at Google

 /*
  * This file is part of the Joshua Machine Translation System.
  *
  * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU
  * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of
  * the License, or (at your option) any later version.
  *
  * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without
  * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  * Lesser General Public License for more details.
  *
  * You should have received a copy of the GNU Lesser General Public License along with this library;
  * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA
  * 02111-1307 USA
  */
 package joshua.decoder.ff.lm.buildin_lm;

 import java.io.IOException;
 import java.util.HashMap;
 import java.util.logging.Level;
 import java.util.logging.Logger;

 import joshua.corpus.Vocabulary;
 import joshua.decoder.JoshuaConfiguration;
 import joshua.decoder.Support;
 import joshua.decoder.ff.lm.AbstractLM;
 import joshua.decoder.ff.lm.LanguageModelFF;
 import joshua.util.Regex;
 import joshua.util.io.LineReader;

 /**
  * this class implement (1) read the LM file into a Trie data structure (2) get LM probablity for a
  * given n-grm (3) get equivilent state
  *
  * @author Zhifei Li, <zhifei.work@gmail.com>
  * @version $LastChangedDate:2008-07-28 18:44:45 -0400 (Mon, 28 Jul 2008) $
  */
 public class LMGrammarJAVA extends AbstractLM {

   // BUG: Why are the IDs not static? Why are the strings not final?
   static String BACKOFF_WGHT_SYM = "<bow>";
   int BACKOFF_WGHT_SYM_ID; // used by LMModel

   // to indicate that lm trie node has children
   static String LM_HAVE_PREFIX_SYM = "<havelzfprefix>";
   int LM_HAVE_PREFIX_SYM_ID;

   static String UNK_SYM = "<unk>"; // unknown lm word
   int UNK_SYM_ID;

   /** Used for logging the time cost for things */
   private long start_loading_time;

   /*
    * A backoff node is a hashtable, it may include: (1) probabilities for next words (2) pointers to
    * a next-layer backoff node (hashtable); the key lookup the value is: sym_id + highestID (3)
    * backoff weight for this node (4) suffix/prefix flag to indicate that there is ngrams start from
    * this suffix
    */
   private LMHash root = null;
   private int g_n_bow_nodes = 0;
   private int g_n_suffix_nodes = 0;
   // ngram prob must be smaller than this number
   static private float MIN_LOG_P = -9999.0f;
   // ngram prob must be smaller than this number
   static private double SUFFIX_ONLY = MIN_LOG_P * 3;

   private double NON_EXIST_WEIGHT = 0; // the history has not appeared at all
   private int num_rule_read = 0;
   boolean g_is_add_prefix_infor = false;
   boolean g_is_add_suffix_infor = false;

   // cmd with result
   HashMap<String, int[]> request_cache_prob = new HashMap<String, int[]>();
   HashMap<String, int[]> request_cache_backoff = new HashMap<String, int[]>();
   HashMap<String, int[]> request_cache_left_equiv = new HashMap<String, int[]>();
   HashMap<String, int[]> request_cache_right_equiv = new HashMap<String, int[]>();
   int cache_size_limit = 250000;

   private static final Logger logger = Logger.getLogger(LMGrammarJAVA.class.getName());

   public LMGrammarJAVA(int order, String lm_file, boolean is_add_suffix_infor,
       boolean is_add_prefix_infor) throws IOException {
     super(order);
     logger.info("use java lm");

     this.BACKOFF_WGHT_SYM_ID = Vocabulary.id(BACKOFF_WGHT_SYM);
     this.LM_HAVE_PREFIX_SYM_ID = Vocabulary.id(LM_HAVE_PREFIX_SYM);
     this.UNK_SYM_ID = Vocabulary.id(UNK_SYM);

     g_is_add_prefix_infor = is_add_prefix_infor;
     g_is_add_suffix_infor = is_add_suffix_infor;

     read_lm_grammar_from_file(lm_file);// TODO: what about sentence-specific?
   }

   // signature of this item: i, j, lhs, states (in fact, we do not need i, j)
   private String get_signature(int[] words) {
     StringBuffer s = new StringBuffer(words.length);
     for (int i = 0; i < words.length; i++) {
       s.append(' ').append(words[i]);
     }
     return s.toString();
   }

   /*
    * Note: the mismatch between srilm and our java implemtation is in: when unk words used as
    * context, in java it will be replaced with "<unk>", but srilm will not, therefore thelm cost by
    * srilm may be smaller than by java, this happens only when the LM file have "<unk>" in backoff
    * state
    */
   protected double ngramLogProbability_helper(int[] ngram, int order) {
     Double res;

     int[] ngram_wrds = replace_with_unk(ngram); // TODO
     // TODO: wrong implementation in hiero
     if (ngram_wrds[ngram_wrds.length - 1] == UNK_SYM_ID) {

       res = -JoshuaConfiguration.lm_ceiling_cost;
     } else {
       // TODO: untranslated words
       if (null == root) {
         throw new RuntimeException("root is null");
       }
       int last_word_id = ngram_wrds[ngram_wrds.length - 1];
       LMHash pos = root;
       Double prob = get_valid_prob(pos, last_word_id);
       double bow_sum = 0;
       // reverse search, start from the second-last word
       for (int i = ngram_wrds.length - 2; i >= 0; i--) {
         LMHash next_layer = (LMHash) pos.get(ngram_wrds[i] + Vocabulary.size());

         if (null != next_layer) { // have context/bow node
           pos = next_layer;
           Double prob2 = get_valid_prob(pos, last_word_id);
           if (null != prob2) { // reset, if backoff, will at least back off to
                                // here
             prob = prob2;
             bow_sum = 0;
           } else {
             Double bow = (Double) pos.get(BACKOFF_WGHT_SYM_ID);
             if (null != bow) {
               bow_sum += bow;
             }
           }
         } else { // do not have context/bow node
           break;
         }
       }
       res = prob + bow_sum;
     }
     return res;
   }

   private Double get_valid_prob(LMHash pos, int wrd) {
     Double res = (Double) pos.get(wrd);
     if (!g_is_add_suffix_infor) {
       return res;
     }

     if (null != res) {
       if (res == SUFFIX_ONLY) {
         return null;
       } else if (res > MIN_LOG_P) { // logP without suffix flag
         return res;
       } else { // logP with suffix flag
         return res - MIN_LOG_P;
       }
     }
     return null;
   }

   // ##################### begin right equivalent state #############
   // idea: from right to left, if a span does not have a backoff weight, which
   // means all ngram having this span will backoff, and we can safely remove
   // this state
   // the absence of backoff weight for low-order ngram implies the absence of
   // higher-order ngram
   // the absence of backoff weight for low-order ngram implies the absence of
   // backoff weight for high order ngram ????????????????
   /*
    * e.g., if we do not have bow node for A, then we can say there is no bow nodes for (1)*A:
    * implied by the trie structure (2)A*: if we have a BOW node for A* (with bow weight), due to the
    * representantion of ARPA format, then we must have a probability for A*, which implies we have a
    * BOW node for A (3)*A*
    */

   // the returned array length must be the same the len of original_state
   // the only change to the original_state is: replace with more non-null state
   // words to null state
   // O(n^2)
   public int[] rightEquivalentState(int[] original_state_in, int order) {
     if (!JoshuaConfiguration.use_right_equivalent_state
         || original_state_in.length != ngramOrder - 1) {
       return original_state_in;
     }
     int[] res;
     // cache
     String sig = get_signature(original_state_in);
     res = (int[]) request_cache_right_equiv.get(sig);
     if (null != res) {
       // System.out.println("right cache hit");
       return res;
     }

     // we do not put this statement at the beging to match the SRILM condition
     // (who does not have replace_with_unk)
     int[] original_state = replace_with_unk(original_state_in);

     res = new int[original_state.length];
     for (int i = 1; i <= original_state.length; i++) { // forward search
       int[] cur_wrds = Support.sub_int_array(original_state, i - 1, original_state.length);
       if (!have_prefix(cur_wrds)) {
         res[i - 1] = LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID;
       } else {
         for (int j = i; j <= original_state.length; j++) {
           res[j - 1] = original_state[j - 1];
         }
         break;
       }
     }
     // cache
     if (request_cache_right_equiv.size() > cache_size_limit) {
       request_cache_right_equiv.clear();
     }
     request_cache_right_equiv.put(sig, res);

     // System.out.println("right org state: " +
     // Symbol.get_string(original_state) +"; equiv state: " +
     // Symbol.get_string(res));
     return res;
   }

   // O(n)
   private boolean have_prefix(int[] words) {
     LMHash pos = root;
     int i = words.length - 1;
     for (; i >= 0; i--) { // reverse search
       LMHash next_layer = (LMHash) pos.get(words[i] + Vocabulary.size());
       if (null != next_layer) {
         pos = next_layer;
       } else {
         break;
       }
     }
     return (i == -1 && pos.containsKey(LM_HAVE_PREFIX_SYM_ID));
   }

   // ##################### end right equivalent state #############

   // ############################ begin left equivalent state
   // ##############################

   /*
    * several observation: In general: (1) In general, there might be more than one <bo> or <null>,
    * and they can be in any position (2) in general, whenever there is a <bo> or <null> in a given
    * ngram, then it will definitely backoff since it has same/more context
    */
   // return: (1) the equivlant state vector; (2) the finalized cost; (3) the
   // estimated cost
   // O(n^2)
   public int[] leftEquivalentState(int[] original_state_wrds_in, int order, double[] cost) {
     if (!JoshuaConfiguration.use_left_equivalent_state) {
       return original_state_wrds_in;
     }

     // we do not put this statement at the beging to match the SRILM condition
     // (who does not have replace_with_unk)
     int[] original_state_wrds = replace_with_unk(original_state_wrds_in);

     // ## deal with case overlap state
     if (original_state_wrds.length < ngramOrder - 1) {
       for (int i = 0; i < original_state_wrds.length; i++) {
         int[] currentWords = Support.sub_int_array(original_state_wrds, 0, i + 1);

         // add estimated cost
         cost[1] += -ngramLogProbability(currentWords, currentWords.length);
       }
       return original_state_wrds;
     }

     // ## non-overlaping state
     int[] res_equi_state = new int[original_state_wrds.length];
     double res_final_cost = 0.0; // finalized cost
     double res_est_cost = 0.0; // estimated cost

     BACKWORD_SEARCH: for (int i = original_state_wrds.length; i > 0; i--) {
       int[] cur_wrds = Support.sub_int_array(original_state_wrds, 0, i);
       if (!have_suffix(cur_wrds)) {
         int last_wrd = cur_wrds[i - 1];
         if (last_wrd == UNK_SYM_ID) {
           res_equi_state[i - 1] = last_wrd;

           // add estimated cost
           res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
         } else {
           if (last_wrd != LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) {
             res_final_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
           }

           res_equi_state[i - 1] = LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID;

           /*
            * //TODO: for simplicity, we may just need BACKOFF_LEFT_LM_STATE_SYM_ID?? int[]
            * backoff_history = Support.sub_int_array(cur_wrds, 0, cur_wrds.length-1);//ignore last
            * wrd double[] bow = new double[1]; boolean finalized_backoff =
            * check_backoff_weight(backoff_history, bow, 0);//backoff weight is already added outside
            * this function? if(finalized_backoff==true){
            * res_equi_state[i-1]=Symbol.NULL_LEFT_LM_STATE_SYM_ID;//no state, no bow, no est_cost
            * }else{ res_equi_state[i-1]=Symbol.BACKOFF_LEFT_LM_STATE_SYM_ID; }
            */
         }

       } else { // we do have a suffix
         for (int j = i; j > 0; j--) {
           res_equi_state[j - 1] = original_state_wrds[j - 1];
           cur_wrds = Support.sub_int_array(original_state_wrds, 0, j);

           // Estimated cost
           res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length);
         }
         break BACKWORD_SEARCH;
       }
     }

     cost[0] = res_final_cost;
     cost[1] = res_est_cost;
     return res_equi_state;
   }

   private boolean have_suffix(int[] words) {
     LMHash pos = root;
     // reverse search, start from the second-last word
     for (int i = words.length - 2; i >= 0; i--) {
       LMHash next_layer = (LMHash) pos.get(words[i] + Vocabulary.size());
       if (null != next_layer) {
         pos = next_layer;
       } else {
         return false;
       }
     }
     Double prob = (Double) pos.get(words[words.length - 1]);
     return (null != prob && prob <= MIN_LOG_P);
   }

   protected double logProbabilityOfBackoffState_helper(int[] ngram_wrds, int order,
       int n_additional_bow) {
     int[] backoff_wrds = Support.sub_int_array(ngram_wrds, 0, ngram_wrds.length - 1);
     double[] sum_bow = new double[1];
     check_backoff_weight(backoff_wrds, sum_bow, n_additional_bow);
     return sum_bow[0];
   }

   // if exist backoff weight for backoff_words, then return the accumated
   // backoff weight
   // if there is no backoff weight for backoff_words, then, we can return the
   // finalized backoff weight
   private boolean check_backoff_weight(int[] backoff_words, double[] sum_bow, int num_backoff) {
     if (backoff_words.length <= 0) return false;

     double sum = 0;
     LMHash pos = root;

     // the start index that backoff should be applied
     int start_use_i = num_backoff - 1;

     Double bow = null;
     int i = backoff_words.length - 1;
     for (; i >= 0; i--) {
       LMHash next_layer = (LMHash) pos.get(backoff_words[i] + Vocabulary.size());

       if (null != next_layer) {
         bow = (Double) next_layer.get(BACKOFF_WGHT_SYM_ID);
         if (null != bow && i <= start_use_i) {
           sum += bow;
         }
         pos = next_layer;
       } else {
         break;
       }
     }
     sum_bow[0] = sum;

     // the higest order have backoff weight, so we cannot finalize
     return (i != -1 || null == bow);
   }

   // ######################################## end left equiv state
   // ###########################################

   // ######################################## general helper function
   // ###########################################
   protected final int[] replace_with_unk(int[] in) {
     int[] res = new int[in.length];
     for (int i = 0; i < in.length; i++) {
       res[i] = replace_with_unk(in[i]);
     }
     return res;
   }

   protected int replace_with_unk(int in) {
     if (root.containsKey(in) || in == LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID
         || in == LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) {
       return in;
     } else {
       return UNK_SYM_ID;
     }
   }

   // ######################################## read LM grammar by the Java
   // implementation ###########################################

   /*
    * a backoff node is a hashtable, it may include: (1) probability for next words: key id is
    * positive (2) pointer to a next-layer backoff node (hashtable): key id is negative!!! (3)
    * backoff weight for this node (4) suffix flag to indicate that there is ngrams start from this
    * suffix
    */

   // read grammar locally by the Java implementation
   private void read_lm_grammar_from_file(String grammar_file) throws IOException {
     start_loading_time = System.currentTimeMillis();
     root = new LMHash();
     root.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);

     if (logger.isLoggable(Level.INFO)) logger.info("Reading grammar from file " + grammar_file);

     boolean start = false;
     int order = 0;

     Regex blankLine = new Regex("^\\s*$");
     Regex ngramsLine = new Regex("^\\\\\\d-grams:\\s*$");

     LineReader grammarReader = new LineReader(grammar_file);
     try {
       for (String line : grammarReader) {
         line = line.trim();
         if (blankLine.matches(line)) {
           continue;
         }
         if (ngramsLine.matches(line)) { // \1-grams:
           start = true;
           order = Integer.parseInt(line.substring(1, 2));
           if (order > ngramOrder) {
             break;
           }
           if (logger.isLoggable(Level.INFO))
             logger.info("begin to read ngrams with order " + order);

           continue; // skip this line
         }
         if (start) {
           add_rule(line, order, g_is_add_suffix_infor, g_is_add_prefix_infor);
         }
       }
     } finally {
       grammarReader.close();
     }

     if (logger.isLoggable(Level.FINE)) {
       logger.fine("# of bow nodes: " + g_n_bow_nodes + " ; # of suffix nodes: " + g_n_suffix_nodes);
       logger.fine("add LMHash  " + g_n_bow_nodes);
       logger.fine("##### mem used (kb): " + Support.getMemoryUse());
       logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time)
           / 1000);
     }
   }

   // format: prob \t ngram \t backoff-weight
   private void add_rule(String line, int order, boolean is_add_suffix_infor,
       boolean is_add_prefix_infor) {
     num_rule_read++;
     if (num_rule_read % 1000000 == 0) {
       if (logger.isLoggable(Level.FINE)) logger.fine("read rules " + num_rule_read);

       // System.out.println("##### mem used (kb): " + Support.getMemoryUse());
       if (logger.isLoggable(Level.FINE))
         logger.fine("##### time used (seconds): "
             + (System.currentTimeMillis() - start_loading_time) / 1000);
     }
     String[] wrds = Regex.spaces.split(line.trim());

     if (wrds.length < order + 1 || wrds.length > order + 2) { // TODO: error
       // logger.severe("wrong line: "+ line);
       return;
     }
     int last_word_id = Vocabulary.id(wrds[order]);

     // ##### identify the BOW position, insert the backoff node if necessary,
     // and add suffix information
     LMHash pos = root;
     // reverse search, start from the second-last word
     for (int i = order - 1; i > 0; i--) {
       if (is_add_suffix_infor) {
         Double t_prob = (Double) pos.get(last_word_id);
         if (null != t_prob) {
           if (t_prob > MIN_LOG_P) { // have prob, but not suffix flag
             double tem = t_prob + MIN_LOG_P;
             pos.put(last_word_id, tem); // overwrite
           }
         } else {
           pos.put(last_word_id, SUFFIX_ONLY);
         }
       }
       int cur_sym_id = Vocabulary.id(wrds[i]);
       // System.out.println(Vocabulary.size());
       LMHash next_layer = (LMHash) pos.get(cur_sym_id + Vocabulary.size());
       if (null != next_layer) {
         pos = next_layer;
       } else {
         LMHash new_tnode = new LMHash(); // create new bow node
         pos.put(cur_sym_id + Vocabulary.size(), new_tnode);
         pos = new_tnode;

         g_n_bow_nodes++;
         if (g_n_bow_nodes % 1000000 == 0) {
           if (logger.isLoggable(Level.FINE)) logger.fine("add LMHash  " + g_n_bow_nodes);

           // System.out.println("##### mem used (kb): " +
           // Support.getMemoryUse());
           if (logger.isLoggable(Level.FINE))
             logger.fine("##### time used (seconds): "
                 + (System.currentTimeMillis() - start_loading_time) / 1000);
         }
       }
       if (!pos.containsKey(BACKOFF_WGHT_SYM_ID)) {
         // indicate it is a backoof node, to distinguish from a pure prefix node
         pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);
       }
     }

     // ##### add probability
     if (is_add_suffix_infor && pos.containsKey(last_word_id)) {
       double tem = Double.parseDouble(wrds[0]) + MIN_LOG_P;
       pos.put(last_word_id, tem); // add probability and suffix flag
     } else {
       // add probability
       pos.put(last_word_id, Double.parseDouble(wrds[0]));
     }

     // ##### add prefix infor, a prefix node is just like a BOW node
     if (is_add_prefix_infor) {
       pos.put(LM_HAVE_PREFIX_SYM_ID, 1); // for preifx [1,order-1]
       for (int i = 1; i < order - 1; i++) { // ignore the last prefix
         pos = root; // reset pos
         for (int j = i; j >= 1; j--) { // reverse search: [1,i]
           int cur_sym_id = Vocabulary.id(wrds[j]);
           LMHash next_layer = (LMHash) pos.get(cur_sym_id + Vocabulary.size());

           if (null != next_layer) {
             pos = next_layer;
           } else {
             LMHash new_tnode = new LMHash();// create new prefix node
             pos.put(cur_sym_id + Vocabulary.size(), new_tnode);
             pos = new_tnode;

             g_n_bow_nodes++;
             if (g_n_bow_nodes % 1000000 == 0) {
               if (logger.isLoggable(Level.FINE)) logger.fine("add LMHash  " + g_n_bow_nodes);

               // System.out.println("##### mem used (kb): " +
               // Support.getMemoryUse());
               if (logger.isLoggable(Level.FINE))
                 logger.fine("##### time used (seconds): "
                     + (System.currentTimeMillis() - start_loading_time) / 1000);
             }
           }
         }
         pos.put(LM_HAVE_PREFIX_SYM_ID, 1);// only the last node should have this
                                           // flag
       }
     }

     // ##### add bow
     if (wrds.length == order + 2) { // have bow weight to add
       pos = root;
       // reverse search, start from the last word
       for (int i = order; i >= 1; i--) {
         int cur_sym_id = Vocabulary.id(wrds[i]);
         LMHash next_layer = (LMHash) pos.get(cur_sym_id + Vocabulary.size());
         if (null != next_layer) {
           pos = next_layer;
         } else {
           LMHash new_tnode = new LMHash(); // create new bow node
           pos.put(cur_sym_id + Vocabulary.size(), new_tnode);
           pos = new_tnode;

           g_n_bow_nodes++;
           if (g_n_bow_nodes % 1000000 == 0) {
             if (logger.isLoggable(Level.FINE)) logger.fine("add LMHash  " + g_n_bow_nodes);
             // System.out.println("##### mem used (kb): " +
             // Support.getMemoryUse());
             if (logger.isLoggable(Level.FINE))
               logger.fine("##### time used (seconds): "
                   + (System.currentTimeMillis() - start_loading_time) / 1000);
           }
         }

         // add bow weight here
         if (i == 1) { // force to override the backoff weight
           double backoff_weight = Double.parseDouble(wrds[order + 1]);
           pos.put(BACKOFF_WGHT_SYM_ID, backoff_weight);
         } else {
           if (!pos.containsKey(BACKOFF_WGHT_SYM_ID)) {
             // indicate it is a backoof node, to distinguish from a pure prefix
             // node
             pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT);
           }
         }
       }
     }
   }

   /*
    * ###################### not used private boolean have_suffix_old(int[] words){ LMHash pos=root;
    * int i=words.length-1; for(; i>=0; i--){//reverse search LMHash next_layer=(LMHash)
    * pos.get(words[i]+p_symbol.getLMEndID()); if(next_layer!=null){ pos=next_layer; }else{ break; }
    * } if(i==-1 && pos.containsKey(Symbol.LM_HAVE_SUFFIX_SYM_ID)) return true; else return false; }
    */

   // in theory: 64bytes (init size is 5)
   // in practice: 86 bytes (init size is 5)
   // in practice: 132 bytes (init size is 10)
   // in practice: 211 bytes (init size is 20)
   // important note: if we use tbl.put(key, new Integer(1)) instead of
   // tbl.put(key, (new Integer(1)).intValue()), then for each element, we waste
   // 16 bytes for the Integer object,
   // and the GC will not collect this Double object, because the hashtable ref
   // it
   private static class LMHash // 4bytes
   {

     // ######note: key must be positive integer, and value must not be null
     /*
      * if key can be both positive and negative, then lot of collision, or will take very long to
      * call get() imagine, we put all numbers in [1,20000] in hashtable, but try to call get() by
      * numbers [-20000,-1], it will take very long time
      */

     // TODO: should round the array size to a prime number?
     static double load_factor = 0.6;
     static int default_init_size = 5;

     int size = 0; // 8 bytes?
     int[] key_array; // pointer itself is 4 bytes?, when it is newed, then add
                      // 10 more bytes, and the int itself
     Object[] val_array; // pointer itself is 4 bytes?, when it is newed, then
                         // add 10 more bytes, and the object itself

     public LMHash(int init_size) {
       key_array = new int[init_size];
       val_array = new Object[init_size];
     }

     public LMHash() {
       key_array = new int[default_init_size];
       val_array = new Object[default_init_size];
     }

     // return the positive position for the key
     private int hash_pos(int key, int length) {
       // return Math.abs(key % length);
       return key % length;
     }

     public Object get(int key) {
       Object res = null;
       int pos = hash_pos(key, key_array.length);
       while (key_array[pos] != 0) { // search until empty cell,
         if (key_array[pos] == key) {
           return val_array[pos]; // found
         }
         pos++; // linear search
         pos = hash_pos(pos, key_array.length);
       }
       return res;
     }

     public boolean containsKey(int key) {
       return (null != get(key));
     }

     public int size() {
       return size;
     }

     public void put(int key, Object value) {
       if (null == value) {
         throw new IllegalArgumentException("LMHash, value is null");
       }

       int pos = hash_pos(key, key_array.length);
       while (key_array[pos] != 0) { // search until empty cell,
         if (key_array[pos] == key) {
           val_array[pos] = value; // found, and overwrite
           return;
         }
         pos++; // linear search
         pos = hash_pos(pos, key_array.length);
       }

       // we get to here, means we do not have this key, need to insert it
       // data_array[pos] = new LMItem(key, value);
       key_array[pos] = key;
       val_array[pos] = value;

       size++;
       if (size >= key_array.length * load_factor) {
         expand_tbl();
       }
     }

     private void expand_tbl() {
       int new_size = key_array.length * 2 + 1; // TODO
       int[] new_key_array = new int[new_size];
       Object[] new_val_array = new Object[new_size];

       for (int i = 0; i < key_array.length; i++) {
         if (key_array[i] != 0) { // add the element
           int pos = hash_pos(key_array[i], new_key_array.length);

           // find first empty postition, note that it is not possible that we
           // need to overwrite
           while (new_key_array[pos] != 0) {
             pos++; // linear search
             pos = hash_pos(pos, new_key_array.length);
           }
           new_key_array[pos] = key_array[i];
           new_val_array[pos] = val_array[i];
         }
       }
       key_array = new_key_array;
       val_array = new_val_array;
     }
   }
 }