| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.lucene.analysis.compound.hyphenation; |
| |
| import java.io.IOException; |
| import java.io.PrintStream; |
| import java.util.ArrayList; |
| import java.util.HashMap; |
| |
| import org.xml.sax.InputSource; |
| |
| /** |
| * This tree structure stores the hyphenation patterns in an efficient way for |
| * fast lookup. It provides the provides the method to hyphenate a word. |
| * |
| * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). They have been slightly modified. |
| */ |
| public class HyphenationTree extends TernaryTree implements PatternConsumer { |
| |
| /** |
| * value space: stores the interletter values |
| */ |
| protected ByteVector vspace; |
| |
| /** |
| * This map stores hyphenation exceptions |
| */ |
| protected HashMap<String,ArrayList<Object>> stoplist; |
| |
| /** |
| * This map stores the character classes |
| */ |
| protected TernaryTree classmap; |
| |
| /** |
| * Temporary map to store interletter values on pattern loading. |
| */ |
| private transient TernaryTree ivalues; |
| |
| public HyphenationTree() { |
| stoplist = new HashMap<>(23); // usually a small table |
| classmap = new TernaryTree(); |
| vspace = new ByteVector(); |
| vspace.alloc(1); // this reserves index 0, which we don't use |
| } |
| |
| /** |
| * Packs the values by storing them in 4 bits, two values into a byte Values |
| * range is from 0 to 9. We use zero as terminator, so we'll add 1 to the |
| * value. |
| * |
| * @param values a string of digits from '0' to '9' representing the |
| * interletter values. |
| * @return the index into the vspace array where the packed values are stored. |
| */ |
| protected int packValues(String values) { |
| int i, n = values.length(); |
| int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1; |
| int offset = vspace.alloc(m); |
| byte[] va = vspace.getArray(); |
| for (i = 0; i < n; i++) { |
| int j = i >> 1; |
| byte v = (byte) ((values.charAt(i) - '0' + 1) & 0x0f); |
| if ((i & 1) == 1) { |
| va[j + offset] = (byte) (va[j + offset] | v); |
| } else { |
| va[j + offset] = (byte) (v << 4); // big endian |
| } |
| } |
| va[m - 1 + offset] = 0; // terminator |
| return offset; |
| } |
| |
| protected String unpackValues(int k) { |
| StringBuilder buf = new StringBuilder(); |
| byte v = vspace.get(k++); |
| while (v != 0) { |
| char c = (char) (((v & 0xf0 )>>> 4) - 1 + '0'); |
| buf.append(c); |
| c = (char) (v & 0x0f); |
| if (c == 0) { |
| break; |
| } |
| c = (char) (c - 1 + '0'); |
| buf.append(c); |
| v = vspace.get(k++); |
| } |
| return buf.toString(); |
| } |
| |
| /** |
| * Read hyphenation patterns from an XML file. |
| * |
| * @param source the InputSource for the file |
| * @throws IOException In case the parsing fails |
| */ |
| public void loadPatterns(InputSource source) throws IOException { |
| PatternParser pp = new PatternParser(this); |
| ivalues = new TernaryTree(); |
| |
| pp.parse(source); |
| |
| // patterns/values should be now in the tree |
| // let's optimize a bit |
| trimToSize(); |
| vspace.trimToSize(); |
| classmap.trimToSize(); |
| |
| // get rid of the auxiliary map |
| ivalues = null; |
| } |
| |
| public String findPattern(String pat) { |
| int k = super.find(pat); |
| if (k >= 0) { |
| return unpackValues(k); |
| } |
| return ""; |
| } |
| |
| /** |
| * String compare, returns 0 if equal or t is a substring of s |
| */ |
| protected int hstrcmp(char[] s, int si, char[] t, int ti) { |
| for (; s[si] == t[ti]; si++, ti++) { |
| if (s[si] == 0) { |
| return 0; |
| } |
| } |
| if (t[ti] == 0) { |
| return 0; |
| } |
| return s[si] - t[ti]; |
| } |
| |
| protected byte[] getValues(int k) { |
| StringBuilder buf = new StringBuilder(); |
| byte v = vspace.get(k++); |
| while (v != 0) { |
| char c = (char) (((v & 0xf0 )>>> 4) - 1); |
| buf.append(c); |
| c = (char) (v & 0x0f); |
| if (c == 0) { |
| break; |
| } |
| c = (char) (c - 1); |
| buf.append(c); |
| v = vspace.get(k++); |
| } |
| byte[] res = new byte[buf.length()]; |
| for (int i = 0; i < res.length; i++) { |
| res[i] = (byte) buf.charAt(i); |
| } |
| return res; |
| } |
| |
| /** |
| * <p> |
| * Search for all possible partial matches of word starting at index an update |
| * interletter values. In other words, it does something like: |
| * </p> |
| * <code> |
| * for(i=0; i<patterns.length; i++) { |
| * if ( word.substring(index).startsWidth(patterns[i]) ) |
| * update_interletter_values(patterns[i]); |
| * } |
| * </code> |
| * <p> |
| * But it is done in an efficient way since the patterns are stored in a |
| * ternary tree. In fact, this is the whole purpose of having the tree: doing |
| * this search without having to test every single pattern. The number of |
| * patterns for languages such as English range from 4000 to 10000. Thus, |
| * doing thousands of string comparisons for each word to hyphenate would be |
| * really slow without the tree. The tradeoff is memory, but using a ternary |
| * tree instead of a trie, almost halves the the memory used by Lout or TeX. |
| * It's also faster than using a hash table |
| * </p> |
| * |
| * @param word null terminated word to match |
| * @param index start index from word |
| * @param il interletter values array to update |
| */ |
| protected void searchPatterns(char[] word, int index, byte[] il) { |
| byte[] values; |
| int i = index; |
| char p, q; |
| char sp = word[i]; |
| p = root; |
| |
| while (p > 0 && p < sc.length) { |
| if (sc[p] == 0xFFFF) { |
| if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) { |
| values = getValues(eq[p]); // data pointer is in eq[] |
| int j = index; |
| for (int k = 0; k < values.length; k++) { |
| if (j < il.length && values[k] > il[j]) { |
| il[j] = values[k]; |
| } |
| j++; |
| } |
| } |
| return; |
| } |
| int d = sp - sc[p]; |
| if (d == 0) { |
| if (sp == 0) { |
| break; |
| } |
| sp = word[++i]; |
| p = eq[p]; |
| q = p; |
| |
| // look for a pattern ending at this position by searching for |
| // the null char ( splitchar == 0 ) |
| while (q > 0 && q < sc.length) { |
| if (sc[q] == 0xFFFF) { // stop at compressed branch |
| break; |
| } |
| if (sc[q] == 0) { |
| values = getValues(eq[q]); |
| int j = index; |
| for (int k = 0; k < values.length; k++) { |
| if (j < il.length && values[k] > il[j]) { |
| il[j] = values[k]; |
| } |
| j++; |
| } |
| break; |
| } else { |
| q = lo[q]; |
| |
| /** |
| * actually the code should be: q = sc[q] < 0 ? hi[q] : lo[q]; but |
| * java chars are unsigned |
| */ |
| } |
| } |
| } else { |
| p = d < 0 ? lo[p] : hi[p]; |
| } |
| } |
| } |
| |
| /** |
| * Hyphenate word and return a Hyphenation object. |
| * |
| * @param word the word to be hyphenated |
| * @param remainCharCount Minimum number of characters allowed before the |
| * hyphenation point. |
| * @param pushCharCount Minimum number of characters allowed after the |
| * hyphenation point. |
| * @return a {@link Hyphenation Hyphenation} object representing the |
| * hyphenated word or null if word is not hyphenated. |
| */ |
| public Hyphenation hyphenate(String word, int remainCharCount, |
| int pushCharCount) { |
| char[] w = word.toCharArray(); |
| return hyphenate(w, 0, w.length, remainCharCount, pushCharCount); |
| } |
| |
| /** |
| * w = "****nnllllllnnn*****", where n is a non-letter, l is a letter, all n |
| * may be absent, the first n is at offset, the first l is at offset + |
| * iIgnoreAtBeginning; word = ".llllll.'\0'***", where all l in w are copied |
| * into word. In the first part of the routine len = w.length, in the second |
| * part of the routine len = word.length. Three indices are used: index(w), |
| * the index in w, index(word), the index in word, letterindex(word), the |
| * index in the letter part of word. The following relations exist: index(w) = |
| * offset + i - 1 index(word) = i - iIgnoreAtBeginning letterindex(word) = |
| * index(word) - 1 (see first loop). It follows that: index(w) - index(word) = |
| * offset - 1 + iIgnoreAtBeginning index(w) = letterindex(word) + offset + |
| * iIgnoreAtBeginning |
| */ |
| |
| /** |
| * Hyphenate word and return an array of hyphenation points. |
| * |
| * @param w char array that contains the word |
| * @param offset Offset to first character in word |
| * @param len Length of word |
| * @param remainCharCount Minimum number of characters allowed before the |
| * hyphenation point. |
| * @param pushCharCount Minimum number of characters allowed after the |
| * hyphenation point. |
| * @return a {@link Hyphenation Hyphenation} object representing the |
| * hyphenated word or null if word is not hyphenated. |
| */ |
| public Hyphenation hyphenate(char[] w, int offset, int len, |
| int remainCharCount, int pushCharCount) { |
| int i; |
| char[] word = new char[len + 3]; |
| |
| // normalize word |
| char[] c = new char[2]; |
| int iIgnoreAtBeginning = 0; |
| int iLength = len; |
| boolean bEndOfLetters = false; |
| for (i = 1; i <= len; i++) { |
| c[0] = w[offset + i - 1]; |
| int nc = classmap.find(c, 0); |
| if (nc < 0) { // found a non-letter character ... |
| if (i == (1 + iIgnoreAtBeginning)) { |
| // ... before any letter character |
| iIgnoreAtBeginning++; |
| } else { |
| // ... after a letter character |
| bEndOfLetters = true; |
| } |
| iLength--; |
| } else { |
| if (!bEndOfLetters) { |
| word[i - iIgnoreAtBeginning] = (char) nc; |
| } else { |
| return null; |
| } |
| } |
| } |
| len = iLength; |
| if (len < (remainCharCount + pushCharCount)) { |
| // word is too short to be hyphenated |
| return null; |
| } |
| int[] result = new int[len + 1]; |
| int k = 0; |
| |
| // check exception list first |
| String sw = new String(word, 1, len); |
| if (stoplist.containsKey(sw)) { |
| // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = |
| // null) |
| ArrayList<Object> hw = stoplist.get(sw); |
| int j = 0; |
| for (i = 0; i < hw.size(); i++) { |
| Object o = hw.get(i); |
| // j = index(sw) = letterindex(word)? |
| // result[k] = corresponding index(w) |
| if (o instanceof String) { |
| j += ((String) o).length(); |
| if (j >= remainCharCount && j < (len - pushCharCount)) { |
| result[k++] = j + iIgnoreAtBeginning; |
| } |
| } |
| } |
| } else { |
| // use algorithm to get hyphenation points |
| word[0] = '.'; // word start marker |
| word[len + 1] = '.'; // word end marker |
| word[len + 2] = 0; // null terminated |
| byte[] il = new byte[len + 3]; // initialized to zero |
| for (i = 0; i < len + 1; i++) { |
| searchPatterns(word, i, il); |
| } |
| |
| // hyphenation points are located where interletter value is odd |
| // i is letterindex(word), |
| // i + 1 is index(word), |
| // result[k] = corresponding index(w) |
| for (i = 0; i < len; i++) { |
| if (((il[i + 1] & 1) == 1) && i >= remainCharCount |
| && i <= (len - pushCharCount)) { |
| result[k++] = i + iIgnoreAtBeginning; |
| } |
| } |
| } |
| |
| if (k > 0) { |
| // trim result array |
| int[] res = new int[k+2]; |
| System.arraycopy(result, 0, res, 1, k); |
| // We add the synthetical hyphenation points |
| // at the beginning and end of the word |
| res[0]=0; |
| res[k+1]=len; |
| return new Hyphenation(res); |
| } else { |
| return null; |
| } |
| } |
| |
| /** |
| * Add a character class to the tree. It is used by |
| * {@link PatternParser PatternParser} as callback to add character classes. |
| * Character classes define the valid word characters for hyphenation. If a |
| * word contains a character not defined in any of the classes, it is not |
| * hyphenated. It also defines a way to normalize the characters in order to |
| * compare them with the stored patterns. Usually pattern files use only lower |
| * case characters, in this case a class for letter 'a', for example, should |
| * be defined as "aA", the first character being the normalization char. |
| */ |
| @Override |
| public void addClass(String chargroup) { |
| if (chargroup.length() > 0) { |
| char equivChar = chargroup.charAt(0); |
| char[] key = new char[2]; |
| key[1] = 0; |
| for (int i = 0; i < chargroup.length(); i++) { |
| key[0] = chargroup.charAt(i); |
| classmap.insert(key, 0, equivChar); |
| } |
| } |
| } |
| |
| /** |
| * Add an exception to the tree. It is used by |
| * {@link PatternParser PatternParser} class as callback to store the |
| * hyphenation exceptions. |
| * |
| * @param word normalized word |
| * @param hyphenatedword a vector of alternating strings and |
| * {@link Hyphen hyphen} objects. |
| */ |
| @Override |
| public void addException(String word, ArrayList<Object> hyphenatedword) { |
| stoplist.put(word, hyphenatedword); |
| } |
| |
| /** |
| * Add a pattern to the tree. Mainly, to be used by |
| * {@link PatternParser PatternParser} class as callback to add a pattern to |
| * the tree. |
| * |
| * @param pattern the hyphenation pattern |
| * @param ivalue interletter weight values indicating the desirability and |
| * priority of hyphenating at a given point within the pattern. It |
| * should contain only digit characters. (i.e. '0' to '9'). |
| */ |
| @Override |
| public void addPattern(String pattern, String ivalue) { |
| int k = ivalues.find(ivalue); |
| if (k <= 0) { |
| k = packValues(ivalue); |
| ivalues.insert(ivalue, (char) k); |
| } |
| insert(pattern, (char) k); |
| } |
| |
| @Override |
| public void printStats(PrintStream out) { |
| out.println("Value space size = " |
| + Integer.toString(vspace.length())); |
| super.printStats(out); |
| |
| } |
| } |