| /* |
| * This file is part of the Joshua Machine Translation System. |
| * |
| * Joshua is free software; you can redistribute it and/or modify it under the terms of the GNU |
| * Lesser General Public License as published by the Free Software Foundation; either version 2.1 of |
| * the License, or (at your option) any later version. |
| * |
| * This library is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without |
| * even the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU |
| * Lesser General Public License for more details. |
| * |
| * You should have received a copy of the GNU Lesser General Public License along with this library; |
| * if not, write to the Free Software Foundation, Inc., 59 Temple Place, Suite 330, Boston, MA |
| * 02111-1307 USA |
| */ |
| package joshua.decoder.ff.lm.buildin_lm; |
| |
| import java.io.IOException; |
| import java.util.HashMap; |
| import java.util.logging.Level; |
| import java.util.logging.Logger; |
| |
| import joshua.corpus.Vocabulary; |
| import joshua.decoder.JoshuaConfiguration; |
| import joshua.decoder.Support; |
| import joshua.decoder.ff.lm.AbstractLM; |
| import joshua.decoder.ff.lm.LanguageModelFF; |
| import joshua.util.Regex; |
| import joshua.util.io.LineReader; |
| |
| /** |
| * this class implement (1) read the LM file into a Trie data structure (2) get LM probablity for a |
| * given n-grm (3) get equivilent state |
| * |
| * @author Zhifei Li, <zhifei.work@gmail.com> |
| * @version $LastChangedDate:2008-07-28 18:44:45 -0400 (Mon, 28 Jul 2008) $ |
| */ |
| public class LMGrammarJAVA extends AbstractLM { |
| |
| // BUG: Why are the IDs not static? Why are the strings not final? |
| static String BACKOFF_WGHT_SYM = "<bow>"; |
| int BACKOFF_WGHT_SYM_ID; // used by LMModel |
| |
| // to indicate that lm trie node has children |
| static String LM_HAVE_PREFIX_SYM = "<havelzfprefix>"; |
| int LM_HAVE_PREFIX_SYM_ID; |
| |
| static String UNK_SYM = "<unk>"; // unknown lm word |
| int UNK_SYM_ID; |
| |
| /** Used for logging the time cost for things */ |
| private long start_loading_time; |
| |
| /* |
| * A backoff node is a hashtable, it may include: (1) probabilities for next words (2) pointers to |
| * a next-layer backoff node (hashtable); the key lookup the value is: sym_id + highestID (3) |
| * backoff weight for this node (4) suffix/prefix flag to indicate that there is ngrams start from |
| * this suffix |
| */ |
| private LMHash root = null; |
| private int g_n_bow_nodes = 0; |
| private int g_n_suffix_nodes = 0; |
| // ngram prob must be smaller than this number |
| static private float MIN_LOG_P = -9999.0f; |
| // ngram prob must be smaller than this number |
| static private double SUFFIX_ONLY = MIN_LOG_P * 3; |
| |
| private double NON_EXIST_WEIGHT = 0; // the history has not appeared at all |
| private int num_rule_read = 0; |
| boolean g_is_add_prefix_infor = false; |
| boolean g_is_add_suffix_infor = false; |
| |
| // cmd with result |
| HashMap<String, int[]> request_cache_prob = new HashMap<String, int[]>(); |
| HashMap<String, int[]> request_cache_backoff = new HashMap<String, int[]>(); |
| HashMap<String, int[]> request_cache_left_equiv = new HashMap<String, int[]>(); |
| HashMap<String, int[]> request_cache_right_equiv = new HashMap<String, int[]>(); |
| int cache_size_limit = 250000; |
| |
| private static final Logger logger = Logger.getLogger(LMGrammarJAVA.class.getName()); |
| |
| public LMGrammarJAVA(int order, String lm_file, boolean is_add_suffix_infor, |
| boolean is_add_prefix_infor) throws IOException { |
| super(order); |
| logger.info("use java lm"); |
| |
| this.BACKOFF_WGHT_SYM_ID = Vocabulary.id(BACKOFF_WGHT_SYM); |
| this.LM_HAVE_PREFIX_SYM_ID = Vocabulary.id(LM_HAVE_PREFIX_SYM); |
| this.UNK_SYM_ID = Vocabulary.id(UNK_SYM); |
| |
| g_is_add_prefix_infor = is_add_prefix_infor; |
| g_is_add_suffix_infor = is_add_suffix_infor; |
| |
| read_lm_grammar_from_file(lm_file);// TODO: what about sentence-specific? |
| } |
| |
| // signature of this item: i, j, lhs, states (in fact, we do not need i, j) |
| private String get_signature(int[] words) { |
| StringBuffer s = new StringBuffer(words.length); |
| for (int i = 0; i < words.length; i++) { |
| s.append(' ').append(words[i]); |
| } |
| return s.toString(); |
| } |
| |
| /* |
| * Note: the mismatch between srilm and our java implemtation is in: when unk words used as |
| * context, in java it will be replaced with "<unk>", but srilm will not, therefore thelm cost by |
| * srilm may be smaller than by java, this happens only when the LM file have "<unk>" in backoff |
| * state |
| */ |
| protected double ngramLogProbability_helper(int[] ngram, int order) { |
| Double res; |
| |
| int[] ngram_wrds = replace_with_unk(ngram); // TODO |
| // TODO: wrong implementation in hiero |
| if (ngram_wrds[ngram_wrds.length - 1] == UNK_SYM_ID) { |
| |
| res = -JoshuaConfiguration.lm_ceiling_cost; |
| } else { |
| // TODO: untranslated words |
| if (null == root) { |
| throw new RuntimeException("root is null"); |
| } |
| int last_word_id = ngram_wrds[ngram_wrds.length - 1]; |
| LMHash pos = root; |
| Double prob = get_valid_prob(pos, last_word_id); |
| double bow_sum = 0; |
| // reverse search, start from the second-last word |
| for (int i = ngram_wrds.length - 2; i >= 0; i--) { |
| LMHash next_layer = (LMHash) pos.get(ngram_wrds[i] + Vocabulary.size()); |
| |
| if (null != next_layer) { // have context/bow node |
| pos = next_layer; |
| Double prob2 = get_valid_prob(pos, last_word_id); |
| if (null != prob2) { // reset, if backoff, will at least back off to |
| // here |
| prob = prob2; |
| bow_sum = 0; |
| } else { |
| Double bow = (Double) pos.get(BACKOFF_WGHT_SYM_ID); |
| if (null != bow) { |
| bow_sum += bow; |
| } |
| } |
| } else { // do not have context/bow node |
| break; |
| } |
| } |
| res = prob + bow_sum; |
| } |
| return res; |
| } |
| |
| private Double get_valid_prob(LMHash pos, int wrd) { |
| Double res = (Double) pos.get(wrd); |
| if (!g_is_add_suffix_infor) { |
| return res; |
| } |
| |
| if (null != res) { |
| if (res == SUFFIX_ONLY) { |
| return null; |
| } else if (res > MIN_LOG_P) { // logP without suffix flag |
| return res; |
| } else { // logP with suffix flag |
| return res - MIN_LOG_P; |
| } |
| } |
| return null; |
| } |
| |
| // ##################### begin right equivalent state ############# |
| // idea: from right to left, if a span does not have a backoff weight, which |
| // means all ngram having this span will backoff, and we can safely remove |
| // this state |
| // the absence of backoff weight for low-order ngram implies the absence of |
| // higher-order ngram |
| // the absence of backoff weight for low-order ngram implies the absence of |
| // backoff weight for high order ngram ???????????????? |
| /* |
| * e.g., if we do not have bow node for A, then we can say there is no bow nodes for (1)*A: |
| * implied by the trie structure (2)A*: if we have a BOW node for A* (with bow weight), due to the |
| * representantion of ARPA format, then we must have a probability for A*, which implies we have a |
| * BOW node for A (3)*A* |
| */ |
| |
| // the returned array length must be the same the len of original_state |
| // the only change to the original_state is: replace with more non-null state |
| // words to null state |
| // O(n^2) |
| public int[] rightEquivalentState(int[] original_state_in, int order) { |
| if (!JoshuaConfiguration.use_right_equivalent_state |
| || original_state_in.length != ngramOrder - 1) { |
| return original_state_in; |
| } |
| int[] res; |
| // cache |
| String sig = get_signature(original_state_in); |
| res = (int[]) request_cache_right_equiv.get(sig); |
| if (null != res) { |
| // System.out.println("right cache hit"); |
| return res; |
| } |
| |
| // we do not put this statement at the beging to match the SRILM condition |
| // (who does not have replace_with_unk) |
| int[] original_state = replace_with_unk(original_state_in); |
| |
| res = new int[original_state.length]; |
| for (int i = 1; i <= original_state.length; i++) { // forward search |
| int[] cur_wrds = Support.sub_int_array(original_state, i - 1, original_state.length); |
| if (!have_prefix(cur_wrds)) { |
| res[i - 1] = LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID; |
| } else { |
| for (int j = i; j <= original_state.length; j++) { |
| res[j - 1] = original_state[j - 1]; |
| } |
| break; |
| } |
| } |
| // cache |
| if (request_cache_right_equiv.size() > cache_size_limit) { |
| request_cache_right_equiv.clear(); |
| } |
| request_cache_right_equiv.put(sig, res); |
| |
| // System.out.println("right org state: " + |
| // Symbol.get_string(original_state) +"; equiv state: " + |
| // Symbol.get_string(res)); |
| return res; |
| } |
| |
| // O(n) |
| private boolean have_prefix(int[] words) { |
| LMHash pos = root; |
| int i = words.length - 1; |
| for (; i >= 0; i--) { // reverse search |
| LMHash next_layer = (LMHash) pos.get(words[i] + Vocabulary.size()); |
| if (null != next_layer) { |
| pos = next_layer; |
| } else { |
| break; |
| } |
| } |
| return (i == -1 && pos.containsKey(LM_HAVE_PREFIX_SYM_ID)); |
| } |
| |
| // ##################### end right equivalent state ############# |
| |
| // ############################ begin left equivalent state |
| // ############################## |
| |
| /* |
| * several observation: In general: (1) In general, there might be more than one <bo> or <null>, |
| * and they can be in any position (2) in general, whenever there is a <bo> or <null> in a given |
| * ngram, then it will definitely backoff since it has same/more context |
| */ |
| // return: (1) the equivlant state vector; (2) the finalized cost; (3) the |
| // estimated cost |
| // O(n^2) |
| public int[] leftEquivalentState(int[] original_state_wrds_in, int order, double[] cost) { |
| if (!JoshuaConfiguration.use_left_equivalent_state) { |
| return original_state_wrds_in; |
| } |
| |
| // we do not put this statement at the beging to match the SRILM condition |
| // (who does not have replace_with_unk) |
| int[] original_state_wrds = replace_with_unk(original_state_wrds_in); |
| |
| // ## deal with case overlap state |
| if (original_state_wrds.length < ngramOrder - 1) { |
| for (int i = 0; i < original_state_wrds.length; i++) { |
| int[] currentWords = Support.sub_int_array(original_state_wrds, 0, i + 1); |
| |
| // add estimated cost |
| cost[1] += -ngramLogProbability(currentWords, currentWords.length); |
| } |
| return original_state_wrds; |
| } |
| |
| // ## non-overlaping state |
| int[] res_equi_state = new int[original_state_wrds.length]; |
| double res_final_cost = 0.0; // finalized cost |
| double res_est_cost = 0.0; // estimated cost |
| |
| BACKWORD_SEARCH: for (int i = original_state_wrds.length; i > 0; i--) { |
| int[] cur_wrds = Support.sub_int_array(original_state_wrds, 0, i); |
| if (!have_suffix(cur_wrds)) { |
| int last_wrd = cur_wrds[i - 1]; |
| if (last_wrd == UNK_SYM_ID) { |
| res_equi_state[i - 1] = last_wrd; |
| |
| // add estimated cost |
| res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length); |
| } else { |
| if (last_wrd != LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) { |
| res_final_cost += -ngramLogProbability(cur_wrds, cur_wrds.length); |
| } |
| |
| res_equi_state[i - 1] = LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID; |
| |
| /* |
| * //TODO: for simplicity, we may just need BACKOFF_LEFT_LM_STATE_SYM_ID?? int[] |
| * backoff_history = Support.sub_int_array(cur_wrds, 0, cur_wrds.length-1);//ignore last |
| * wrd double[] bow = new double[1]; boolean finalized_backoff = |
| * check_backoff_weight(backoff_history, bow, 0);//backoff weight is already added outside |
| * this function? if(finalized_backoff==true){ |
| * res_equi_state[i-1]=Symbol.NULL_LEFT_LM_STATE_SYM_ID;//no state, no bow, no est_cost |
| * }else{ res_equi_state[i-1]=Symbol.BACKOFF_LEFT_LM_STATE_SYM_ID; } |
| */ |
| } |
| |
| } else { // we do have a suffix |
| for (int j = i; j > 0; j--) { |
| res_equi_state[j - 1] = original_state_wrds[j - 1]; |
| cur_wrds = Support.sub_int_array(original_state_wrds, 0, j); |
| |
| // Estimated cost |
| res_est_cost += -ngramLogProbability(cur_wrds, cur_wrds.length); |
| } |
| break BACKWORD_SEARCH; |
| } |
| } |
| |
| cost[0] = res_final_cost; |
| cost[1] = res_est_cost; |
| return res_equi_state; |
| } |
| |
| private boolean have_suffix(int[] words) { |
| LMHash pos = root; |
| // reverse search, start from the second-last word |
| for (int i = words.length - 2; i >= 0; i--) { |
| LMHash next_layer = (LMHash) pos.get(words[i] + Vocabulary.size()); |
| if (null != next_layer) { |
| pos = next_layer; |
| } else { |
| return false; |
| } |
| } |
| Double prob = (Double) pos.get(words[words.length - 1]); |
| return (null != prob && prob <= MIN_LOG_P); |
| } |
| |
| protected double logProbabilityOfBackoffState_helper(int[] ngram_wrds, int order, |
| int n_additional_bow) { |
| int[] backoff_wrds = Support.sub_int_array(ngram_wrds, 0, ngram_wrds.length - 1); |
| double[] sum_bow = new double[1]; |
| check_backoff_weight(backoff_wrds, sum_bow, n_additional_bow); |
| return sum_bow[0]; |
| } |
| |
| // if exist backoff weight for backoff_words, then return the accumated |
| // backoff weight |
| // if there is no backoff weight for backoff_words, then, we can return the |
| // finalized backoff weight |
| private boolean check_backoff_weight(int[] backoff_words, double[] sum_bow, int num_backoff) { |
| if (backoff_words.length <= 0) return false; |
| |
| double sum = 0; |
| LMHash pos = root; |
| |
| // the start index that backoff should be applied |
| int start_use_i = num_backoff - 1; |
| |
| Double bow = null; |
| int i = backoff_words.length - 1; |
| for (; i >= 0; i--) { |
| LMHash next_layer = (LMHash) pos.get(backoff_words[i] + Vocabulary.size()); |
| |
| if (null != next_layer) { |
| bow = (Double) next_layer.get(BACKOFF_WGHT_SYM_ID); |
| if (null != bow && i <= start_use_i) { |
| sum += bow; |
| } |
| pos = next_layer; |
| } else { |
| break; |
| } |
| } |
| sum_bow[0] = sum; |
| |
| // the higest order have backoff weight, so we cannot finalize |
| return (i != -1 || null == bow); |
| } |
| |
| // ######################################## end left equiv state |
| // ########################################### |
| |
| // ######################################## general helper function |
| // ########################################### |
| protected final int[] replace_with_unk(int[] in) { |
| int[] res = new int[in.length]; |
| for (int i = 0; i < in.length; i++) { |
| res[i] = replace_with_unk(in[i]); |
| } |
| return res; |
| } |
| |
| protected int replace_with_unk(int in) { |
| if (root.containsKey(in) || in == LanguageModelFF.NULL_RIGHT_LM_STATE_SYM_ID |
| || in == LanguageModelFF.BACKOFF_LEFT_LM_STATE_SYM_ID) { |
| return in; |
| } else { |
| return UNK_SYM_ID; |
| } |
| } |
| |
| // ######################################## read LM grammar by the Java |
| // implementation ########################################### |
| |
| /* |
| * a backoff node is a hashtable, it may include: (1) probability for next words: key id is |
| * positive (2) pointer to a next-layer backoff node (hashtable): key id is negative!!! (3) |
| * backoff weight for this node (4) suffix flag to indicate that there is ngrams start from this |
| * suffix |
| */ |
| |
| // read grammar locally by the Java implementation |
| private void read_lm_grammar_from_file(String grammar_file) throws IOException { |
| start_loading_time = System.currentTimeMillis(); |
| root = new LMHash(); |
| root.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT); |
| |
| if (logger.isLoggable(Level.INFO)) logger.info("Reading grammar from file " + grammar_file); |
| |
| boolean start = false; |
| int order = 0; |
| |
| Regex blankLine = new Regex("^\\s*$"); |
| Regex ngramsLine = new Regex("^\\\\\\d-grams:\\s*$"); |
| |
| LineReader grammarReader = new LineReader(grammar_file); |
| try { |
| for (String line : grammarReader) { |
| line = line.trim(); |
| if (blankLine.matches(line)) { |
| continue; |
| } |
| if (ngramsLine.matches(line)) { // \1-grams: |
| start = true; |
| order = Integer.parseInt(line.substring(1, 2)); |
| if (order > ngramOrder) { |
| break; |
| } |
| if (logger.isLoggable(Level.INFO)) |
| logger.info("begin to read ngrams with order " + order); |
| |
| continue; // skip this line |
| } |
| if (start) { |
| add_rule(line, order, g_is_add_suffix_infor, g_is_add_prefix_infor); |
| } |
| } |
| } finally { |
| grammarReader.close(); |
| } |
| |
| if (logger.isLoggable(Level.FINE)) { |
| logger.fine("# of bow nodes: " + g_n_bow_nodes + " ; # of suffix nodes: " + g_n_suffix_nodes); |
| logger.fine("add LMHash " + g_n_bow_nodes); |
| logger.fine("##### mem used (kb): " + Support.getMemoryUse()); |
| logger.fine("##### time used (seconds): " + (System.currentTimeMillis() - start_loading_time) |
| / 1000); |
| } |
| } |
| |
| // format: prob \t ngram \t backoff-weight |
| private void add_rule(String line, int order, boolean is_add_suffix_infor, |
| boolean is_add_prefix_infor) { |
| num_rule_read++; |
| if (num_rule_read % 1000000 == 0) { |
| if (logger.isLoggable(Level.FINE)) logger.fine("read rules " + num_rule_read); |
| |
| // System.out.println("##### mem used (kb): " + Support.getMemoryUse()); |
| if (logger.isLoggable(Level.FINE)) |
| logger.fine("##### time used (seconds): " |
| + (System.currentTimeMillis() - start_loading_time) / 1000); |
| } |
| String[] wrds = Regex.spaces.split(line.trim()); |
| |
| if (wrds.length < order + 1 || wrds.length > order + 2) { // TODO: error |
| // logger.severe("wrong line: "+ line); |
| return; |
| } |
| int last_word_id = Vocabulary.id(wrds[order]); |
| |
| // ##### identify the BOW position, insert the backoff node if necessary, |
| // and add suffix information |
| LMHash pos = root; |
| // reverse search, start from the second-last word |
| for (int i = order - 1; i > 0; i--) { |
| if (is_add_suffix_infor) { |
| Double t_prob = (Double) pos.get(last_word_id); |
| if (null != t_prob) { |
| if (t_prob > MIN_LOG_P) { // have prob, but not suffix flag |
| double tem = t_prob + MIN_LOG_P; |
| pos.put(last_word_id, tem); // overwrite |
| } |
| } else { |
| pos.put(last_word_id, SUFFIX_ONLY); |
| } |
| } |
| int cur_sym_id = Vocabulary.id(wrds[i]); |
| // System.out.println(Vocabulary.size()); |
| LMHash next_layer = (LMHash) pos.get(cur_sym_id + Vocabulary.size()); |
| if (null != next_layer) { |
| pos = next_layer; |
| } else { |
| LMHash new_tnode = new LMHash(); // create new bow node |
| pos.put(cur_sym_id + Vocabulary.size(), new_tnode); |
| pos = new_tnode; |
| |
| g_n_bow_nodes++; |
| if (g_n_bow_nodes % 1000000 == 0) { |
| if (logger.isLoggable(Level.FINE)) logger.fine("add LMHash " + g_n_bow_nodes); |
| |
| // System.out.println("##### mem used (kb): " + |
| // Support.getMemoryUse()); |
| if (logger.isLoggable(Level.FINE)) |
| logger.fine("##### time used (seconds): " |
| + (System.currentTimeMillis() - start_loading_time) / 1000); |
| } |
| } |
| if (!pos.containsKey(BACKOFF_WGHT_SYM_ID)) { |
| // indicate it is a backoof node, to distinguish from a pure prefix node |
| pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT); |
| } |
| } |
| |
| // ##### add probability |
| if (is_add_suffix_infor && pos.containsKey(last_word_id)) { |
| double tem = Double.parseDouble(wrds[0]) + MIN_LOG_P; |
| pos.put(last_word_id, tem); // add probability and suffix flag |
| } else { |
| // add probability |
| pos.put(last_word_id, Double.parseDouble(wrds[0])); |
| } |
| |
| // ##### add prefix infor, a prefix node is just like a BOW node |
| if (is_add_prefix_infor) { |
| pos.put(LM_HAVE_PREFIX_SYM_ID, 1); // for preifx [1,order-1] |
| for (int i = 1; i < order - 1; i++) { // ignore the last prefix |
| pos = root; // reset pos |
| for (int j = i; j >= 1; j--) { // reverse search: [1,i] |
| int cur_sym_id = Vocabulary.id(wrds[j]); |
| LMHash next_layer = (LMHash) pos.get(cur_sym_id + Vocabulary.size()); |
| |
| if (null != next_layer) { |
| pos = next_layer; |
| } else { |
| LMHash new_tnode = new LMHash();// create new prefix node |
| pos.put(cur_sym_id + Vocabulary.size(), new_tnode); |
| pos = new_tnode; |
| |
| g_n_bow_nodes++; |
| if (g_n_bow_nodes % 1000000 == 0) { |
| if (logger.isLoggable(Level.FINE)) logger.fine("add LMHash " + g_n_bow_nodes); |
| |
| // System.out.println("##### mem used (kb): " + |
| // Support.getMemoryUse()); |
| if (logger.isLoggable(Level.FINE)) |
| logger.fine("##### time used (seconds): " |
| + (System.currentTimeMillis() - start_loading_time) / 1000); |
| } |
| } |
| } |
| pos.put(LM_HAVE_PREFIX_SYM_ID, 1);// only the last node should have this |
| // flag |
| } |
| } |
| |
| // ##### add bow |
| if (wrds.length == order + 2) { // have bow weight to add |
| pos = root; |
| // reverse search, start from the last word |
| for (int i = order; i >= 1; i--) { |
| int cur_sym_id = Vocabulary.id(wrds[i]); |
| LMHash next_layer = (LMHash) pos.get(cur_sym_id + Vocabulary.size()); |
| if (null != next_layer) { |
| pos = next_layer; |
| } else { |
| LMHash new_tnode = new LMHash(); // create new bow node |
| pos.put(cur_sym_id + Vocabulary.size(), new_tnode); |
| pos = new_tnode; |
| |
| g_n_bow_nodes++; |
| if (g_n_bow_nodes % 1000000 == 0) { |
| if (logger.isLoggable(Level.FINE)) logger.fine("add LMHash " + g_n_bow_nodes); |
| // System.out.println("##### mem used (kb): " + |
| // Support.getMemoryUse()); |
| if (logger.isLoggable(Level.FINE)) |
| logger.fine("##### time used (seconds): " |
| + (System.currentTimeMillis() - start_loading_time) / 1000); |
| } |
| } |
| |
| // add bow weight here |
| if (i == 1) { // force to override the backoff weight |
| double backoff_weight = Double.parseDouble(wrds[order + 1]); |
| pos.put(BACKOFF_WGHT_SYM_ID, backoff_weight); |
| } else { |
| if (!pos.containsKey(BACKOFF_WGHT_SYM_ID)) { |
| // indicate it is a backoof node, to distinguish from a pure prefix |
| // node |
| pos.put(BACKOFF_WGHT_SYM_ID, NON_EXIST_WEIGHT); |
| } |
| } |
| } |
| } |
| } |
| |
| /* |
| * ###################### not used private boolean have_suffix_old(int[] words){ LMHash pos=root; |
| * int i=words.length-1; for(; i>=0; i--){//reverse search LMHash next_layer=(LMHash) |
| * pos.get(words[i]+p_symbol.getLMEndID()); if(next_layer!=null){ pos=next_layer; }else{ break; } |
| * } if(i==-1 && pos.containsKey(Symbol.LM_HAVE_SUFFIX_SYM_ID)) return true; else return false; } |
| */ |
| |
| // in theory: 64bytes (init size is 5) |
| // in practice: 86 bytes (init size is 5) |
| // in practice: 132 bytes (init size is 10) |
| // in practice: 211 bytes (init size is 20) |
| // important note: if we use tbl.put(key, new Integer(1)) instead of |
| // tbl.put(key, (new Integer(1)).intValue()), then for each element, we waste |
| // 16 bytes for the Integer object, |
| // and the GC will not collect this Double object, because the hashtable ref |
| // it |
| private static class LMHash // 4bytes |
| { |
| |
| // ######note: key must be positive integer, and value must not be null |
| /* |
| * if key can be both positive and negative, then lot of collision, or will take very long to |
| * call get() imagine, we put all numbers in [1,20000] in hashtable, but try to call get() by |
| * numbers [-20000,-1], it will take very long time |
| */ |
| |
| // TODO: should round the array size to a prime number? |
| static double load_factor = 0.6; |
| static int default_init_size = 5; |
| |
| int size = 0; // 8 bytes? |
| int[] key_array; // pointer itself is 4 bytes?, when it is newed, then add |
| // 10 more bytes, and the int itself |
| Object[] val_array; // pointer itself is 4 bytes?, when it is newed, then |
| // add 10 more bytes, and the object itself |
| |
| public LMHash(int init_size) { |
| key_array = new int[init_size]; |
| val_array = new Object[init_size]; |
| } |
| |
| public LMHash() { |
| key_array = new int[default_init_size]; |
| val_array = new Object[default_init_size]; |
| } |
| |
| // return the positive position for the key |
| private int hash_pos(int key, int length) { |
| // return Math.abs(key % length); |
| return key % length; |
| } |
| |
| public Object get(int key) { |
| Object res = null; |
| int pos = hash_pos(key, key_array.length); |
| while (key_array[pos] != 0) { // search until empty cell, |
| if (key_array[pos] == key) { |
| return val_array[pos]; // found |
| } |
| pos++; // linear search |
| pos = hash_pos(pos, key_array.length); |
| } |
| return res; |
| } |
| |
| public boolean containsKey(int key) { |
| return (null != get(key)); |
| } |
| |
| public int size() { |
| return size; |
| } |
| |
| public void put(int key, Object value) { |
| if (null == value) { |
| throw new IllegalArgumentException("LMHash, value is null"); |
| } |
| |
| int pos = hash_pos(key, key_array.length); |
| while (key_array[pos] != 0) { // search until empty cell, |
| if (key_array[pos] == key) { |
| val_array[pos] = value; // found, and overwrite |
| return; |
| } |
| pos++; // linear search |
| pos = hash_pos(pos, key_array.length); |
| } |
| |
| // we get to here, means we do not have this key, need to insert it |
| // data_array[pos] = new LMItem(key, value); |
| key_array[pos] = key; |
| val_array[pos] = value; |
| |
| size++; |
| if (size >= key_array.length * load_factor) { |
| expand_tbl(); |
| } |
| } |
| |
| private void expand_tbl() { |
| int new_size = key_array.length * 2 + 1; // TODO |
| int[] new_key_array = new int[new_size]; |
| Object[] new_val_array = new Object[new_size]; |
| |
| for (int i = 0; i < key_array.length; i++) { |
| if (key_array[i] != 0) { // add the element |
| int pos = hash_pos(key_array[i], new_key_array.length); |
| |
| // find first empty postition, note that it is not possible that we |
| // need to overwrite |
| while (new_key_array[pos] != 0) { |
| pos++; // linear search |
| pos = hash_pos(pos, new_key_array.length); |
| } |
| new_key_array[pos] = key_array[i]; |
| new_val_array[pos] = val_array[i]; |
| } |
| } |
| key_array = new_key_array; |
| val_array = new_val_array; |
| } |
| } |
| } |