src/java/org/apache/fop/hyphenation/HyphenationTree.java - xmlgraphics-fop - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /* $Id$ */

 package org.apache.fop.hyphenation;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.FileInputStream;
 import java.io.FileOutputStream;
 import java.io.FileReader;
 import java.io.IOException;
 import java.io.ObjectInputStream;
 import java.io.ObjectOutputStream;
 import java.io.Serializable;
 import java.net.MalformedURLException;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.List;

 import org.xml.sax.InputSource;

 import org.apache.commons.io.IOUtils;

 /**
  * <p>This tree structure stores the hyphenation patterns in an efficient
  * way for fast lookup. It provides the provides the method to
  * hyphenate a word.</p>
  *
  * <p>This work was authored by Carlos Villegas (cav@uniscope.co.jp).</p>
  */
 public class HyphenationTree extends TernaryTree
             implements PatternConsumer, Serializable {

     private static final long serialVersionUID = -7842107987915665573L;

     /**
      * value space: stores the interletter values
      */
     protected ByteVector vspace;

     /**
      * This map stores hyphenation exceptions
      */
     protected HashMap stoplist;

     /**
      * This map stores the character classes
      */
     protected TernaryTree classmap;

     /**
      * Temporary map to store interletter values on pattern loading.
      */
     private transient TernaryTree ivalues;

     /** Default constructor. */
     public HyphenationTree() {
         stoplist = new HashMap(23);    // usually a small table
         classmap = new TernaryTree();
         vspace = new ByteVector();
         vspace.alloc(1);    // this reserves index 0, which we don't use
     }

     /**
      * Packs the values by storing them in 4 bits, two values into a byte
      * Values range is from 0 to 9. We use zero as terminator,
      * so we'll add 1 to the value.
      * @param values a string of digits from '0' to '9' representing the
      * interletter values.
      * @return the index into the vspace array where the packed values
      * are stored.
      */
     protected int packValues(String values) {
         int i;
         int n = values.length();
         int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
         int offset = vspace.alloc(m);
         byte[] va = vspace.getArray();
         for (i = 0; i < n; i++) {
             int j = i >> 1;
             byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
             if ((i & 1) == 1) {
                 va[j + offset] = (byte)(va[j + offset] | v);
             } else {
                 va[j + offset] = (byte)(v << 4);    // big endian
             }
         }
         va[m - 1 + offset] = 0;    // terminator
         return offset;
     }

     /**
      * Unpack values.
      * @param k an integer
      * @return a string
      */
     protected String unpackValues(int k) {
         StringBuffer buf = new StringBuffer();
         byte v = vspace.get(k++);
         while (v != 0) {
             char c = (char)((v >>> 4) - 1 + '0');
             buf.append(c);
             c = (char)(v & 0x0f);
             if (c == 0) {
                 break;
             }
             c = (char)(c - 1 + '0');
             buf.append(c);
             v = vspace.get(k++);
         }
         return buf.toString();
     }

     /**
      * Read hyphenation patterns from an XML file.
      * @param filename the filename
      * @throws HyphenationException In case the parsing fails
      */
     public void loadPatterns(String filename) throws HyphenationException {
         File f = new File(filename);
         try {
             InputSource src = new InputSource(f.toURI().toURL().toExternalForm());
             loadPatterns(src);
         } catch (MalformedURLException e) {
             throw new HyphenationException("Error converting the File '" + f + "' to a URL: "
                     + e.getMessage());
         }
     }

     /**
      * Read hyphenation patterns from an XML file.
      * @param source the InputSource for the file
      * @throws HyphenationException In case the parsing fails
      */
     public void loadPatterns(InputSource source) throws HyphenationException {
         PatternParser pp = new PatternParser(this);
         ivalues = new TernaryTree();

         pp.parse(source);

         // patterns/values should be now in the tree
         // let's optimize a bit
         trimToSize();
         vspace.trimToSize();
         classmap.trimToSize();

         // get rid of the auxiliary map
         ivalues = null;
     }

     /**
      * Find pattern.
      * @param pat a pattern
      * @return a string
      */
     public String findPattern(String pat) {
         int k = super.find(pat);
         if (k >= 0) {
             return unpackValues(k);
         }
         return "";
     }

     /**
      * String compare, returns 0 if equal or
      * t is a substring of s.
      * @param s first character array
      * @param si starting index into first array
      * @param t second character array
      * @param ti starting index into second array
      * @return an integer
      */
     protected int hstrcmp(char[] s, int si, char[] t, int ti) {
         for (; s[si] == t[ti]; si++, ti++) {
             if (s[si] == 0) {
                 return 0;
             }
         }
         if (t[ti] == 0) {
             return 0;
         }
         return s[si] - t[ti];
     }

     /**
      * Get values.
      * @param k an integer
      * @return a byte array
      */
     protected byte[] getValues(int k) {
         StringBuffer buf = new StringBuffer();
         byte v = vspace.get(k++);
         while (v != 0) {
             char c = (char)((v >>> 4) - 1);
             buf.append(c);
             c = (char)(v & 0x0f);
             if (c == 0) {
                 break;
             }
             c = (char)(c - 1);
             buf.append(c);
             v = vspace.get(k++);
         }
         byte[] res = new byte[buf.length()];
         for (int i = 0; i < res.length; i++) {
             res[i] = (byte)buf.charAt(i);
         }
         return res;
     }

     /**
      * <p>Search for all possible partial matches of word starting
      * at index an update interletter values. In other words, it
      * does something like:</p>
      * <code>
      * for(i=0; i<patterns.length; i++) {
      * if ( word.substring(index).startsWidth(patterns[i]) )
      * update_interletter_values(patterns[i]);
      * }
      * </code>
      * <p>But it is done in an efficient way since the patterns are
      * stored in a ternary tree. In fact, this is the whole purpose
      * of having the tree: doing this search without having to test
      * every single pattern. The number of patterns for languages
      * such as English range from 4000 to 10000. Thus, doing thousands
      * of string comparisons for each word to hyphenate would be
      * really slow without the tree. The tradeoff is memory, but
      * using a ternary tree instead of a trie, almost halves the
      * the memory used by Lout or TeX. It's also faster than using
      * a hash table</p>
      * @param word null terminated word to match
      * @param index start index from word
      * @param il interletter values array to update
      */
     protected void searchPatterns(char[] word, int index, byte[] il) {
         byte[] values;
         int i = index;
         char p;
         char q;
         char sp = word[i];
         p = root;

         while (p > 0 && p < sc.length) {
             if (sc[p] == 0xFFFF) {
                 if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
                     values = getValues(eq[p]);    // data pointer is in eq[]
                     int j = index;
                     for (int k = 0; k < values.length; k++) {
                         if (j < il.length && values[k] > il[j]) {
                             il[j] = values[k];
                         }
                         j++;
                     }
                 }
                 return;
             }
             int d = sp - sc[p];
             if (d == 0) {
                 if (sp == 0) {
                     break;
                 }
                 sp = word[++i];
                 p = eq[p];
                 q = p;

                 // look for a pattern ending at this position by searching for
                 // the null char ( splitchar == 0 )
                 while (q > 0 && q < sc.length) {
                     if (sc[q] == 0xFFFF) {        // stop at compressed branch
                         break;
                     }
                     if (sc[q] == 0) {
                         values = getValues(eq[q]);
                         int j = index;
                         for (int k = 0; k < values.length; k++) {
                             if (j < il.length && values[k] > il[j]) {
                                 il[j] = values[k];
                             }
                             j++;
                         }
                         break;
                     } else {
                         q = lo[q];

                         /**
                          * actually the code should be:
                          * q = sc[q] < 0 ? hi[q] : lo[q];
                          * but java chars are unsigned
                          */
                     }
                 }
             } else {
                 p = d < 0 ? lo[p] : hi[p];
             }
         }
     }

     /**
      * Hyphenate word and return a Hyphenation object.
      * @param word the word to be hyphenated
      * @param remainCharCount Minimum number of characters allowed
      * before the hyphenation point.
      * @param pushCharCount Minimum number of characters allowed after
      * the hyphenation point.
      * @return a {@link Hyphenation Hyphenation} object representing
      * the hyphenated word or null if word is not hyphenated.
      */
     public Hyphenation hyphenate(String word, int remainCharCount,
                                  int pushCharCount) {
         char[] w = word.toCharArray();
         if (isMultiPartWord(w, w.length)) {
             List<char[]> words = splitOnNonCharacters(w);
             return new Hyphenation(new String(w),
                     getHyphPointsForWords(words, remainCharCount, pushCharCount));
         } else {
             return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
         }
     }

     private boolean isMultiPartWord(char[] w, int len) {
         int wordParts = 0;
         for (int i = 0; i < len; i++) {
             char[] c = new char[2];
             c[0] = w[i];
             int nc = classmap.find(c, 0);
             if (nc > 0) {
                 if (wordParts > 1) {
                     return true;
                 }
                 wordParts = 1;
             } else {
                 if (wordParts == 1) {
                     wordParts++;
                 }
             }
         }
         return false;
     }

     private List<char[]> splitOnNonCharacters(char[] word) {
         List<Integer> breakPoints = getNonLetterBreaks(word);
         if (breakPoints.size() == 0) {
             return Collections.emptyList();
         }
         List<char[]> words = new ArrayList<char[]>();
         for (int ibreak = 0; ibreak < breakPoints.size(); ibreak++) {
             char[] newWord = getWordFromCharArray(word, ((ibreak == 0)
                     ? 0 : breakPoints.get(ibreak - 1)), breakPoints.get(ibreak));
             words.add(newWord);
         }
         if (word.length - breakPoints.get(breakPoints.size() - 1) - 1 > 1) {
             char[] newWord = getWordFromCharArray(word, breakPoints.get(breakPoints.size() - 1),
                     word.length);
             words.add(newWord);
         }
         return words;
     }

     private List<Integer> getNonLetterBreaks(char[] word) {
         char[] c = new char[2];
         List<Integer> breakPoints = new ArrayList<Integer>();
         boolean foundLetter = false;
         for (int i = 0; i < word.length; i++) {
             c[0] = word[i];
             if (classmap.find(c, 0) < 0) {
                 if (foundLetter) {
                     breakPoints.add(i);
                 }
             } else {
                 foundLetter = true;
             }
         }
         return breakPoints;
     }

     private char[] getWordFromCharArray(char[] word, int startIndex, int endIndex) {
         char[] newWord = new char[endIndex - ((startIndex == 0) ? startIndex : startIndex + 1)];
         int iChar = 0;
         for (int i = (startIndex == 0) ? 0 : startIndex + 1; i < endIndex; i++) {
             newWord[iChar++] = word[i];
         }
         return newWord;
     }

     private int[] getHyphPointsForWords(List<char[]> nonLetterWords, int remainCharCount,
             int pushCharCount) {
         int[] breaks = new int[0];
         for (int iNonLetterWord = 0; iNonLetterWord < nonLetterWords.size(); iNonLetterWord++) {
             char[] nonLetterWord = nonLetterWords.get(iNonLetterWord);
             Hyphenation curHyph = hyphenate(nonLetterWord, 0, nonLetterWord.length,
                     (iNonLetterWord == 0) ? remainCharCount : 1,
                     (iNonLetterWord == nonLetterWords.size() - 1) ? pushCharCount : 1);
             if (curHyph == null) {
                 continue;
             }
             int[] combined = new int[breaks.length + curHyph.getHyphenationPoints().length];
             int[] hyphPoints = curHyph.getHyphenationPoints();
             int foreWordsSize = calcForeWordsSize(nonLetterWords, iNonLetterWord);
             for (int i = 0; i < hyphPoints.length; i++) {
                 hyphPoints[i] += foreWordsSize;
             }
             System.arraycopy(breaks, 0, combined, 0, breaks.length);
             System.arraycopy(hyphPoints, 0, combined, breaks.length, hyphPoints.length);
             breaks = combined;
         }
         return breaks;
     }

     private int calcForeWordsSize(List<char[]> nonLetterWords, int iNonLetterWord) {
         int result = 0;
         for (int i = 0; i < iNonLetterWord; i++) {
             result += nonLetterWords.get(i).length + 1;
         }
         return result;
     }

     /**
      * w = "****nnllllllnnn*****",
      * where n is a non-letter, l is a letter,
      * all n may be absent, the first n is at offset,
      * the first l is at offset + iIgnoreAtBeginning;
      * word = ".llllll.'\0'***",
      * where all l in w are copied into word.
      * In the first part of the routine len = w.length,
      * in the second part of the routine len = word.length.
      * Three indices are used:
      * index(w), the index in w,
      * index(word), the index in word,
      * letterindex(word), the index in the letter part of word.
      * The following relations exist:
      * index(w) = offset + i - 1
      * index(word) = i - iIgnoreAtBeginning
      * letterindex(word) = index(word) - 1
      * (see first loop).
      * It follows that:
      * index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
      * index(w) = letterindex(word) + offset + iIgnoreAtBeginning
      */

     /**
      * Hyphenate word and return an array of hyphenation points.
      * @param w char array that contains the word
      * @param offset Offset to first character in word
      * @param len Length of word
      * @param remainCharCount Minimum number of characters allowed
      * before the hyphenation point.
      * @param pushCharCount Minimum number of characters allowed after
      * the hyphenation point.
      * @return a {@link Hyphenation Hyphenation} object representing
      * the hyphenated word or null if word is not hyphenated.
      */
     public Hyphenation hyphenate(char[] w, int offset, int len,
                                  int remainCharCount, int pushCharCount) {
         int i;
         char[] word = new char[len + 3];

         // normalize word
         char[] c = new char[2];
         int iIgnoreAtBeginning = 0;
         int iLength = len;
         boolean bEndOfLetters = false;
         for (i = 1; i <= len; i++) {
             c[0] = w[offset + i - 1];
             int nc = classmap.find(c, 0);
             if (nc < 0) {    // found a non-letter character ...
                 if (i == (1 + iIgnoreAtBeginning)) {
                     // ... before any letter character
                     iIgnoreAtBeginning++;
                 } else {
                     // ... after a letter character
                     bEndOfLetters = true;
                 }
                 iLength--;
             } else {
                 if (!bEndOfLetters) {
                     word[i - iIgnoreAtBeginning] = (char)nc;
                 } else {
                     return null;
                 }
             }
         }

         len = iLength;
         if (len < (remainCharCount + pushCharCount)) {
             // word is too short to be hyphenated
             return null;
         }
         int[] result = new int[len + 1];
         int k = 0;

         // check exception list first
         String sw = new String(word, 1, len);
         if (stoplist.containsKey(sw)) {
             // assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
             ArrayList hw = (ArrayList)stoplist.get(sw);
             int j = 0;
             for (i = 0; i < hw.size(); i++) {
                 Object o = hw.get(i);
                 // j = index(sw) = letterindex(word)?
                 // result[k] = corresponding index(w)
                 if (o instanceof String) {
                     j += ((String)o).length();
                     if (j >= remainCharCount && j < (len - pushCharCount)) {
                         result[k++] = j + iIgnoreAtBeginning;
                     }
                 }
             }
         } else {
             // use algorithm to get hyphenation points
             word[0] = '.';                    // word start marker
             word[len + 1] = '.';              // word end marker
             word[len + 2] = 0;                // null terminated
             byte[] il = new byte[len + 3];    // initialized to zero
             for (i = 0; i < len + 1; i++) {
                 searchPatterns(word, i, il);
             }

             // hyphenation points are located where interletter value is odd
             // i is letterindex(word),
             // i + 1 is index(word),
             // result[k] = corresponding index(w)
             for (i = 0; i < len; i++) {
                 if (((il[i + 1] & 1) == 1) && i >= remainCharCount
                         && i <= (len - pushCharCount)) {
                     result[k++] = i + iIgnoreAtBeginning;
                 }
             }
         }


         if (k > 0) {
             // trim result array
             int[] res = new int[k];
             System.arraycopy(result, 0, res, 0, k);
             return new Hyphenation(new String(w, offset, len), res);
         } else {
             return null;
         }
     }

     /**
      * Add a character class to the tree. It is used by
      * {@link PatternParser PatternParser} as callback to
      * add character classes. Character classes define the
      * valid word characters for hyphenation. If a word contains
      * a character not defined in any of the classes, it is not hyphenated.
      * It also defines a way to normalize the characters in order
      * to compare them with the stored patterns. Usually pattern
      * files use only lower case characters, in this case a class
      * for letter 'a', for example, should be defined as "aA", the first
      * character being the normalization char.
      * @param chargroup a character class (group)
      */
     public void addClass(String chargroup) {
         if (chargroup.length() > 0) {
             char equivChar = chargroup.charAt(0);
             char[] key = new char[2];
             key[1] = 0;
             for (int i = 0; i < chargroup.length(); i++) {
                 key[0] = chargroup.charAt(i);
                 classmap.insert(key, 0, equivChar);
             }
         }
     }

     /**
      * Add an exception to the tree. It is used by
      * {@link PatternParser PatternParser} class as callback to
      * store the hyphenation exceptions.
      * @param word normalized word
      * @param hyphenatedword a vector of alternating strings and
      * {@link Hyphen hyphen} objects.
      */
     public void addException(String word, ArrayList hyphenatedword) {
         stoplist.put(word, hyphenatedword);
     }

     /**
      * Add a pattern to the tree. Mainly, to be used by
      * {@link PatternParser PatternParser} class as callback to
      * add a pattern to the tree.
      * @param pattern the hyphenation pattern
      * @param ivalue interletter weight values indicating the
      * desirability and priority of hyphenating at a given point
      * within the pattern. It should contain only digit characters.
      * (i.e. '0' to '9').
      */
     public void addPattern(String pattern, String ivalue) {
         int k = ivalues.find(ivalue);
         if (k <= 0) {
             k = packValues(ivalue);
             ivalues.insert(ivalue, (char)k);
         }
         insert(pattern, (char)k);
     }

     /**
      * Print statistics.
      */
     public void printStats() {
         System.out.println("Value space size = "
                            + Integer.toString(vspace.length()));
         super.printStats();

     }

     /**
      * Main entry point for this hyphenation utility application.
      * @param argv array of command linee arguments
      * @throws Exception in case an exception is raised but not caught
      */
     public static void main(String[] argv) throws Exception {
         HyphenationTree ht = null;
         int minCharCount = 2;
         BufferedReader in
             = new BufferedReader(new java.io.InputStreamReader(System.in));
         while (true) {
             System.out.print("l:\tload patterns from XML\n"
                     + "L:\tload patterns from serialized object\n"
                     + "s:\tset minimum character count\n"
                     + "w:\twrite hyphenation tree to object file\n"
                     + "h:\thyphenate\n"
                     + "f:\tfind pattern\n"
                     + "b:\tbenchmark\n"
                     + "q:\tquit\n\n"
                     + "Command:");
             String token = in.readLine().trim();
             if (token.equals("f")) {
                 System.out.print("Pattern: ");
                 token = in.readLine().trim();
                 System.out.println("Values: " + ht.findPattern(token));
             } else if (token.equals("s")) {
                 System.out.print("Minimun value: ");
                 token = in.readLine().trim();
                 minCharCount = Integer.parseInt(token);
             } else if (token.equals("l")) {
                 ht = new HyphenationTree();
                 System.out.print("XML file name: ");
                 token = in.readLine().trim();
                 ht.loadPatterns(token);
             } else if (token.equals("L")) {
                 ObjectInputStream ois = null;
                 System.out.print("Object file name: ");
                 token = in.readLine().trim();
                 try {
                     ois = new ObjectInputStream(new FileInputStream(token));
                     ht = (HyphenationTree)ois.readObject();
                 } catch (Exception e) {
                     e.printStackTrace();
                 } finally {
                     if (ois != null) {
                         try {
                             ois.close();
                         } catch (IOException e) {
                             //ignore
                         }
                     }
                 }
             } else if (token.equals("w")) {
                 System.out.print("Object file name: ");
                 token = in.readLine().trim();
                 ObjectOutputStream oos = null;
                 try {
                     oos = new ObjectOutputStream(new FileOutputStream(token));
                     oos.writeObject(ht);
                 } catch (Exception e) {
                     e.printStackTrace();
                 } finally {
                     if (oos != null) {
                         try {
                             oos.flush();
                         } catch (IOException e) {
                             //ignore
                         }
                         try {
                             oos.close();
                         } catch (IOException e) {
                             //ignore
                         }
                     }
                 }
             } else if (token.equals("h")) {
                 System.out.print("Word: ");
                 token = in.readLine().trim();
                 System.out.print("Hyphenation points: ");
                 System.out.println(ht.hyphenate(token, minCharCount,
                                                 minCharCount));
             } else if (token.equals("b")) {
                 if (ht == null) {
                     System.out.println("No patterns have been loaded.");
                     break;
                 }
                 System.out.print("Word list filename: ");
                 token = in.readLine().trim();
                 long starttime = 0;
                 int counter = 0;
                 BufferedReader reader = null;
                 try {
                     reader = new BufferedReader(new FileReader(token));
                     String line;

                     starttime = System.currentTimeMillis();
                     while ((line = reader.readLine()) != null) {
                         // System.out.print("\nline: ");
                         Hyphenation hyp = ht.hyphenate(line, minCharCount,
                                                        minCharCount);
                         if (hyp != null) {
                             String hword = hyp.toString();
                             // System.out.println(line);
                             // System.out.println(hword);
                         } else {
                             // System.out.println("No hyphenation");
                         }
                         counter++;
                     }
                 } catch (Exception ioe) {
                     System.out.println("Exception " + ioe);
                     ioe.printStackTrace();
                 } finally {
                     IOUtils.closeQuietly(reader);
                 }
                 long endtime = System.currentTimeMillis();
                 long result = endtime - starttime;
                 System.out.println(counter + " words in " + result
                                    + " Milliseconds hyphenated");

             } else if (token.equals("q")) {
                 break;
             }
         }

     }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	/* $Id$ */

	package org.apache.fop.hyphenation;

	import java.io.BufferedReader;
	import java.io.File;
	import java.io.FileInputStream;
	import java.io.FileOutputStream;
	import java.io.FileReader;
	import java.io.IOException;
	import java.io.ObjectInputStream;
	import java.io.ObjectOutputStream;
	import java.io.Serializable;
	import java.net.MalformedURLException;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.List;

	import org.xml.sax.InputSource;

	import org.apache.commons.io.IOUtils;

	/**
	* <p>This tree structure stores the hyphenation patterns in an efficient
	* way for fast lookup. It provides the provides the method to
	* hyphenate a word.</p>
	*
	* <p>This work was authored by Carlos Villegas (cav@uniscope.co.jp).</p>
	*/
	public class HyphenationTree extends TernaryTree
	implements PatternConsumer, Serializable {

	private static final long serialVersionUID = -7842107987915665573L;

	/**
	* value space: stores the interletter values
	*/
	protected ByteVector vspace;

	/**
	* This map stores hyphenation exceptions
	*/
	protected HashMap stoplist;

	/**
	* This map stores the character classes
	*/
	protected TernaryTree classmap;

	/**
	* Temporary map to store interletter values on pattern loading.
	*/
	private transient TernaryTree ivalues;

	/** Default constructor. */
	public HyphenationTree() {
	stoplist = new HashMap(23); // usually a small table
	classmap = new TernaryTree();
	vspace = new ByteVector();
	vspace.alloc(1); // this reserves index 0, which we don't use
	}

	/**
	* Packs the values by storing them in 4 bits, two values into a byte
	* Values range is from 0 to 9. We use zero as terminator,
	* so we'll add 1 to the value.
	* @param values a string of digits from '0' to '9' representing the
	* interletter values.
	* @return the index into the vspace array where the packed values
	* are stored.
	*/
	protected int packValues(String values) {
	int i;
	int n = values.length();
	int m = (n & 1) == 1 ? (n >> 1) + 2 : (n >> 1) + 1;
	int offset = vspace.alloc(m);
	byte[] va = vspace.getArray();
	for (i = 0; i < n; i++) {
	int j = i >> 1;
	byte v = (byte)((values.charAt(i) - '0' + 1) & 0x0f);
	if ((i & 1) == 1) {
	va[j + offset] = (byte)(va[j + offset] \| v);
	} else {
	va[j + offset] = (byte)(v << 4); // big endian
	}
	}
	va[m - 1 + offset] = 0; // terminator
	return offset;
	}

	/**
	* Unpack values.
	* @param k an integer
	* @return a string
	*/
	protected String unpackValues(int k) {
	StringBuffer buf = new StringBuffer();
	byte v = vspace.get(k++);
	while (v != 0) {
	char c = (char)((v >>> 4) - 1 + '0');
	buf.append(c);
	c = (char)(v & 0x0f);
	if (c == 0) {
	break;
	}
	c = (char)(c - 1 + '0');
	buf.append(c);
	v = vspace.get(k++);
	}
	return buf.toString();
	}

	/**
	* Read hyphenation patterns from an XML file.
	* @param filename the filename
	* @throws HyphenationException In case the parsing fails
	*/
	public void loadPatterns(String filename) throws HyphenationException {
	File f = new File(filename);
	try {
	InputSource src = new InputSource(f.toURI().toURL().toExternalForm());
	loadPatterns(src);
	} catch (MalformedURLException e) {
	throw new HyphenationException("Error converting the File '" + f + "' to a URL: "
	+ e.getMessage());
	}
	}

	/**
	* Read hyphenation patterns from an XML file.
	* @param source the InputSource for the file
	* @throws HyphenationException In case the parsing fails
	*/
	public void loadPatterns(InputSource source) throws HyphenationException {
	PatternParser pp = new PatternParser(this);
	ivalues = new TernaryTree();

	pp.parse(source);

	// patterns/values should be now in the tree
	// let's optimize a bit
	trimToSize();
	vspace.trimToSize();
	classmap.trimToSize();

	// get rid of the auxiliary map
	ivalues = null;
	}

	/**
	* Find pattern.
	* @param pat a pattern
	* @return a string
	*/
	public String findPattern(String pat) {
	int k = super.find(pat);
	if (k >= 0) {
	return unpackValues(k);
	}
	return "";
	}

	/**
	* String compare, returns 0 if equal or
	* t is a substring of s.
	* @param s first character array
	* @param si starting index into first array
	* @param t second character array
	* @param ti starting index into second array
	* @return an integer
	*/
	protected int hstrcmp(char[] s, int si, char[] t, int ti) {
	for (; s[si] == t[ti]; si++, ti++) {
	if (s[si] == 0) {
	return 0;
	}
	}
	if (t[ti] == 0) {
	return 0;
	}
	return s[si] - t[ti];
	}

	/**
	* Get values.
	* @param k an integer
	* @return a byte array
	*/
	protected byte[] getValues(int k) {
	StringBuffer buf = new StringBuffer();
	byte v = vspace.get(k++);
	while (v != 0) {
	char c = (char)((v >>> 4) - 1);
	buf.append(c);
	c = (char)(v & 0x0f);
	if (c == 0) {
	break;
	}
	c = (char)(c - 1);
	buf.append(c);
	v = vspace.get(k++);
	}
	byte[] res = new byte[buf.length()];
	for (int i = 0; i < res.length; i++) {
	res[i] = (byte)buf.charAt(i);
	}
	return res;
	}

	/**
	* <p>Search for all possible partial matches of word starting
	* at index an update interletter values. In other words, it
	* does something like:</p>
	* <code>
	* for(i=0; i<patterns.length; i++) {
	* if ( word.substring(index).startsWidth(patterns[i]) )
	* update_interletter_values(patterns[i]);
	* }
	* </code>
	* <p>But it is done in an efficient way since the patterns are
	* stored in a ternary tree. In fact, this is the whole purpose
	* of having the tree: doing this search without having to test
	* every single pattern. The number of patterns for languages
	* such as English range from 4000 to 10000. Thus, doing thousands
	* of string comparisons for each word to hyphenate would be
	* really slow without the tree. The tradeoff is memory, but
	* using a ternary tree instead of a trie, almost halves the
	* the memory used by Lout or TeX. It's also faster than using
	* a hash table</p>
	* @param word null terminated word to match
	* @param index start index from word
	* @param il interletter values array to update
	*/
	protected void searchPatterns(char[] word, int index, byte[] il) {
	byte[] values;
	int i = index;
	char p;
	char q;
	char sp = word[i];
	p = root;

	while (p > 0 && p < sc.length) {
	if (sc[p] == 0xFFFF) {
	if (hstrcmp(word, i, kv.getArray(), lo[p]) == 0) {
	values = getValues(eq[p]); // data pointer is in eq[]
	int j = index;
	for (int k = 0; k < values.length; k++) {
	if (j < il.length && values[k] > il[j]) {
	il[j] = values[k];
	}
	j++;
	}
	}
	return;
	}
	int d = sp - sc[p];
	if (d == 0) {
	if (sp == 0) {
	break;
	}
	sp = word[++i];
	p = eq[p];
	q = p;

	// look for a pattern ending at this position by searching for
	// the null char ( splitchar == 0 )
	while (q > 0 && q < sc.length) {
	if (sc[q] == 0xFFFF) { // stop at compressed branch
	break;
	}
	if (sc[q] == 0) {
	values = getValues(eq[q]);
	int j = index;
	for (int k = 0; k < values.length; k++) {
	if (j < il.length && values[k] > il[j]) {
	il[j] = values[k];
	}
	j++;
	}
	break;
	} else {
	q = lo[q];

	/**
	* actually the code should be:
	* q = sc[q] < 0 ? hi[q] : lo[q];
	* but java chars are unsigned
	*/
	}
	}
	} else {
	p = d < 0 ? lo[p] : hi[p];
	}
	}
	}

	/**
	* Hyphenate word and return a Hyphenation object.
	* @param word the word to be hyphenated
	* @param remainCharCount Minimum number of characters allowed
	* before the hyphenation point.
	* @param pushCharCount Minimum number of characters allowed after
	* the hyphenation point.
	* @return a {@link Hyphenation Hyphenation} object representing
	* the hyphenated word or null if word is not hyphenated.
	*/
	public Hyphenation hyphenate(String word, int remainCharCount,
	int pushCharCount) {
	char[] w = word.toCharArray();
	if (isMultiPartWord(w, w.length)) {
	List<char[]> words = splitOnNonCharacters(w);
	return new Hyphenation(new String(w),
	getHyphPointsForWords(words, remainCharCount, pushCharCount));
	} else {
	return hyphenate(w, 0, w.length, remainCharCount, pushCharCount);
	}
	}

	private boolean isMultiPartWord(char[] w, int len) {
	int wordParts = 0;
	for (int i = 0; i < len; i++) {
	char[] c = new char[2];
	c[0] = w[i];
	int nc = classmap.find(c, 0);
	if (nc > 0) {
	if (wordParts > 1) {
	return true;
	}
	wordParts = 1;
	} else {
	if (wordParts == 1) {
	wordParts++;
	}
	}
	}
	return false;
	}

	private List<char[]> splitOnNonCharacters(char[] word) {
	List<Integer> breakPoints = getNonLetterBreaks(word);
	if (breakPoints.size() == 0) {
	return Collections.emptyList();
	}
	List<char[]> words = new ArrayList<char[]>();
	for (int ibreak = 0; ibreak < breakPoints.size(); ibreak++) {
	char[] newWord = getWordFromCharArray(word, ((ibreak == 0)
	? 0 : breakPoints.get(ibreak - 1)), breakPoints.get(ibreak));
	words.add(newWord);
	}
	if (word.length - breakPoints.get(breakPoints.size() - 1) - 1 > 1) {
	char[] newWord = getWordFromCharArray(word, breakPoints.get(breakPoints.size() - 1),
	word.length);
	words.add(newWord);
	}
	return words;
	}

	private List<Integer> getNonLetterBreaks(char[] word) {
	char[] c = new char[2];
	List<Integer> breakPoints = new ArrayList<Integer>();
	boolean foundLetter = false;
	for (int i = 0; i < word.length; i++) {
	c[0] = word[i];
	if (classmap.find(c, 0) < 0) {
	if (foundLetter) {
	breakPoints.add(i);
	}
	} else {
	foundLetter = true;
	}
	}
	return breakPoints;
	}

	private char[] getWordFromCharArray(char[] word, int startIndex, int endIndex) {
	char[] newWord = new char[endIndex - ((startIndex == 0) ? startIndex : startIndex + 1)];
	int iChar = 0;
	for (int i = (startIndex == 0) ? 0 : startIndex + 1; i < endIndex; i++) {
	newWord[iChar++] = word[i];
	}
	return newWord;
	}

	private int[] getHyphPointsForWords(List<char[]> nonLetterWords, int remainCharCount,
	int pushCharCount) {
	int[] breaks = new int[0];
	for (int iNonLetterWord = 0; iNonLetterWord < nonLetterWords.size(); iNonLetterWord++) {
	char[] nonLetterWord = nonLetterWords.get(iNonLetterWord);
	Hyphenation curHyph = hyphenate(nonLetterWord, 0, nonLetterWord.length,
	(iNonLetterWord == 0) ? remainCharCount : 1,
	(iNonLetterWord == nonLetterWords.size() - 1) ? pushCharCount : 1);
	if (curHyph == null) {
	continue;
	}
	int[] combined = new int[breaks.length + curHyph.getHyphenationPoints().length];
	int[] hyphPoints = curHyph.getHyphenationPoints();
	int foreWordsSize = calcForeWordsSize(nonLetterWords, iNonLetterWord);
	for (int i = 0; i < hyphPoints.length; i++) {
	hyphPoints[i] += foreWordsSize;
	}
	System.arraycopy(breaks, 0, combined, 0, breaks.length);
	System.arraycopy(hyphPoints, 0, combined, breaks.length, hyphPoints.length);
	breaks = combined;
	}
	return breaks;
	}

	private int calcForeWordsSize(List<char[]> nonLetterWords, int iNonLetterWord) {
	int result = 0;
	for (int i = 0; i < iNonLetterWord; i++) {
	result += nonLetterWords.get(i).length + 1;
	}
	return result;
	}

	/**
	* w = "**nnllllllnnn***",
	* where n is a non-letter, l is a letter,
	* all n may be absent, the first n is at offset,
	* the first l is at offset + iIgnoreAtBeginning;
	* word = ".llllll.'\0'***",
	* where all l in w are copied into word.
	* In the first part of the routine len = w.length,
	* in the second part of the routine len = word.length.
	* Three indices are used:
	* index(w), the index in w,
	* index(word), the index in word,
	* letterindex(word), the index in the letter part of word.
	* The following relations exist:
	* index(w) = offset + i - 1
	* index(word) = i - iIgnoreAtBeginning
	* letterindex(word) = index(word) - 1
	* (see first loop).
	* It follows that:
	* index(w) - index(word) = offset - 1 + iIgnoreAtBeginning
	* index(w) = letterindex(word) + offset + iIgnoreAtBeginning
	*/

	/**
	* Hyphenate word and return an array of hyphenation points.
	* @param w char array that contains the word
	* @param offset Offset to first character in word
	* @param len Length of word
	* @param remainCharCount Minimum number of characters allowed
	* before the hyphenation point.
	* @param pushCharCount Minimum number of characters allowed after
	* the hyphenation point.
	* @return a {@link Hyphenation Hyphenation} object representing
	* the hyphenated word or null if word is not hyphenated.
	*/
	public Hyphenation hyphenate(char[] w, int offset, int len,
	int remainCharCount, int pushCharCount) {
	int i;
	char[] word = new char[len + 3];

	// normalize word
	char[] c = new char[2];
	int iIgnoreAtBeginning = 0;
	int iLength = len;
	boolean bEndOfLetters = false;
	for (i = 1; i <= len; i++) {
	c[0] = w[offset + i - 1];
	int nc = classmap.find(c, 0);
	if (nc < 0) { // found a non-letter character ...
	if (i == (1 + iIgnoreAtBeginning)) {
	// ... before any letter character
	iIgnoreAtBeginning++;
	} else {
	// ... after a letter character
	bEndOfLetters = true;
	}
	iLength--;
	} else {
	if (!bEndOfLetters) {
	word[i - iIgnoreAtBeginning] = (char)nc;
	} else {
	return null;
	}
	}
	}

	len = iLength;
	if (len < (remainCharCount + pushCharCount)) {
	// word is too short to be hyphenated
	return null;
	}
	int[] result = new int[len + 1];
	int k = 0;

	// check exception list first
	String sw = new String(word, 1, len);
	if (stoplist.containsKey(sw)) {
	// assume only simple hyphens (Hyphen.pre="-", Hyphen.post = Hyphen.no = null)
	ArrayList hw = (ArrayList)stoplist.get(sw);
	int j = 0;
	for (i = 0; i < hw.size(); i++) {
	Object o = hw.get(i);
	// j = index(sw) = letterindex(word)?
	// result[k] = corresponding index(w)
	if (o instanceof String) {
	j += ((String)o).length();
	if (j >= remainCharCount && j < (len - pushCharCount)) {
	result[k++] = j + iIgnoreAtBeginning;
	}
	}
	}
	} else {
	// use algorithm to get hyphenation points
	word[0] = '.'; // word start marker
	word[len + 1] = '.'; // word end marker
	word[len + 2] = 0; // null terminated
	byte[] il = new byte[len + 3]; // initialized to zero
	for (i = 0; i < len + 1; i++) {
	searchPatterns(word, i, il);
	}

	// hyphenation points are located where interletter value is odd
	// i is letterindex(word),
	// i + 1 is index(word),
	// result[k] = corresponding index(w)
	for (i = 0; i < len; i++) {
	if (((il[i + 1] & 1) == 1) && i >= remainCharCount
	&& i <= (len - pushCharCount)) {
	result[k++] = i + iIgnoreAtBeginning;
	}
	}
	}


	if (k > 0) {
	// trim result array
	int[] res = new int[k];
	System.arraycopy(result, 0, res, 0, k);
	return new Hyphenation(new String(w, offset, len), res);
	} else {
	return null;
	}
	}

	/**
	* Add a character class to the tree. It is used by
	* {@link PatternParser PatternParser} as callback to
	* add character classes. Character classes define the
	* valid word characters for hyphenation. If a word contains
	* a character not defined in any of the classes, it is not hyphenated.
	* It also defines a way to normalize the characters in order
	* to compare them with the stored patterns. Usually pattern
	* files use only lower case characters, in this case a class
	* for letter 'a', for example, should be defined as "aA", the first
	* character being the normalization char.
	* @param chargroup a character class (group)
	*/
	public void addClass(String chargroup) {
	if (chargroup.length() > 0) {
	char equivChar = chargroup.charAt(0);
	char[] key = new char[2];
	key[1] = 0;
	for (int i = 0; i < chargroup.length(); i++) {
	key[0] = chargroup.charAt(i);
	classmap.insert(key, 0, equivChar);
	}
	}
	}

	/**
	* Add an exception to the tree. It is used by
	* {@link PatternParser PatternParser} class as callback to
	* store the hyphenation exceptions.
	* @param word normalized word
	* @param hyphenatedword a vector of alternating strings and
	* {@link Hyphen hyphen} objects.
	*/
	public void addException(String word, ArrayList hyphenatedword) {
	stoplist.put(word, hyphenatedword);
	}

	/**
	* Add a pattern to the tree. Mainly, to be used by
	* {@link PatternParser PatternParser} class as callback to
	* add a pattern to the tree.
	* @param pattern the hyphenation pattern
	* @param ivalue interletter weight values indicating the
	* desirability and priority of hyphenating at a given point
	* within the pattern. It should contain only digit characters.
	* (i.e. '0' to '9').
	*/
	public void addPattern(String pattern, String ivalue) {
	int k = ivalues.find(ivalue);
	if (k <= 0) {
	k = packValues(ivalue);
	ivalues.insert(ivalue, (char)k);
	}
	insert(pattern, (char)k);
	}

	/**
	* Print statistics.
	*/
	public void printStats() {
	System.out.println("Value space size = "
	+ Integer.toString(vspace.length()));
	super.printStats();

	}

	/**
	* Main entry point for this hyphenation utility application.
	* @param argv array of command linee arguments
	* @throws Exception in case an exception is raised but not caught
	*/
	public static void main(String[] argv) throws Exception {
	HyphenationTree ht = null;
	int minCharCount = 2;
	BufferedReader in
	= new BufferedReader(new java.io.InputStreamReader(System.in));
	while (true) {
	System.out.print("l:\tload patterns from XML\n"
	+ "L:\tload patterns from serialized object\n"
	+ "s:\tset minimum character count\n"
	+ "w:\twrite hyphenation tree to object file\n"
	+ "h:\thyphenate\n"
	+ "f:\tfind pattern\n"
	+ "b:\tbenchmark\n"
	+ "q:\tquit\n\n"
	+ "Command:");
	String token = in.readLine().trim();
	if (token.equals("f")) {
	System.out.print("Pattern: ");
	token = in.readLine().trim();
	System.out.println("Values: " + ht.findPattern(token));
	} else if (token.equals("s")) {
	System.out.print("Minimun value: ");
	token = in.readLine().trim();
	minCharCount = Integer.parseInt(token);
	} else if (token.equals("l")) {
	ht = new HyphenationTree();
	System.out.print("XML file name: ");
	token = in.readLine().trim();
	ht.loadPatterns(token);
	} else if (token.equals("L")) {
	ObjectInputStream ois = null;
	System.out.print("Object file name: ");
	token = in.readLine().trim();
	try {
	ois = new ObjectInputStream(new FileInputStream(token));
	ht = (HyphenationTree)ois.readObject();
	} catch (Exception e) {
	e.printStackTrace();
	} finally {
	if (ois != null) {
	try {
	ois.close();
	} catch (IOException e) {
	//ignore
	}
	}
	}
	} else if (token.equals("w")) {
	System.out.print("Object file name: ");
	token = in.readLine().trim();
	ObjectOutputStream oos = null;
	try {
	oos = new ObjectOutputStream(new FileOutputStream(token));
	oos.writeObject(ht);
	} catch (Exception e) {
	e.printStackTrace();
	} finally {
	if (oos != null) {
	try {
	oos.flush();
	} catch (IOException e) {
	//ignore
	}
	try {
	oos.close();
	} catch (IOException e) {
	//ignore
	}
	}
	}
	} else if (token.equals("h")) {
	System.out.print("Word: ");
	token = in.readLine().trim();
	System.out.print("Hyphenation points: ");
	System.out.println(ht.hyphenate(token, minCharCount,
	minCharCount));
	} else if (token.equals("b")) {
	if (ht == null) {
	System.out.println("No patterns have been loaded.");
	break;
	}
	System.out.print("Word list filename: ");
	token = in.readLine().trim();
	long starttime = 0;
	int counter = 0;
	BufferedReader reader = null;
	try {
	reader = new BufferedReader(new FileReader(token));
	String line;

	starttime = System.currentTimeMillis();
	while ((line = reader.readLine()) != null) {
	// System.out.print("\nline: ");
	Hyphenation hyp = ht.hyphenate(line, minCharCount,
	minCharCount);
	if (hyp != null) {
	String hword = hyp.toString();
	// System.out.println(line);
	// System.out.println(hword);
	} else {
	// System.out.println("No hyphenation");
	}
	counter++;
	}
	} catch (Exception ioe) {
	System.out.println("Exception " + ioe);
	ioe.printStackTrace();
	} finally {
	IOUtils.closeQuietly(reader);
	}
	long endtime = System.currentTimeMillis();
	long result = endtime - starttime;
	System.out.println(counter + " words in " + result
	+ " Milliseconds hyphenated");

	} else if (token.equals("q")) {
	break;
	}
	}

	}

	}