ruta-core/src/main/java/org/apache/uima/ruta/resource/MultiTreeWordList.java - uima-ruta - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.ruta.resource;

 import java.io.BufferedReader;
 import java.io.File;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.InputStreamReader;
 import java.nio.file.Path;
 import java.util.ArrayList;
 import java.util.Collection;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.LinkedList;
 import java.util.List;
 import java.util.Map;
 import java.util.Map.Entry;
 import java.util.Set;

 import org.apache.commons.io.IOUtils;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FSIterator;
 import org.apache.uima.cas.Feature;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.TypeSystem;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.ruta.RutaStream;
 import org.apache.uima.ruta.type.RutaBasic;
 import org.springframework.core.io.FileSystemResource;
 import org.springframework.core.io.Resource;

 /**
  * Class MultiTreeWordList.
  *
  *
  */
 public class MultiTreeWordList implements RutaWordList {

   private static final String ENCODING = "UTF-8";

   private MultiTreeWordListPersistence persistence = new MultiTreeWordListPersistence();

   /** The root of the TreeWordList. */
   protected MultiTextNode root;

   /** The cost model we are using. */
   private EditDistanceCostMap costMap;

   /**
    * Default constructor.
    *
    * @throws IOException
    *           - should not happen but required by called constructor
    */
   public MultiTreeWordList() throws IOException {
     this(new String[] {}, null);
   }

   /**
    * Default constructor uses just one file.
    *
    * @param pathname
    *          the pathname of the used file.
    * @param base
    *          the relative base
    * @throws IOException
    *           When there is a problem reading pathname.
    */
   public MultiTreeWordList(String pathname, File base) throws IOException {
     this(new FileSystemResource(pathname));
   }

   /**
    * @param lists
    *          Resources to load.
    * @throws IOException
    *           When there is a problem reading a resource.
    */
   public MultiTreeWordList(Resource... lists) throws IOException {
     this.root = new MultiTextNode();
     this.costMap = new EditDistanceCostMap();

     if (lists == null) {
       return;
     }

     for (Resource list : lists) {
       // check if the resource is a directory
       File directory = null;
       try {
         directory = list.getFile();
       } catch (IOException e) {
         // resource is not on the file system
         directory = null;
       }

       if (directory != null && directory.isDirectory()) {
         // resource is a directory, load its content
         for (File data : directory.listFiles()) {
           load(new FileSystemResource(data));
         }
       } else {
         // resource is not a directory, load it normally
         load(list);
       }
     }
   }

   /**
    * Constructor from an open stream. This method will close the stream.
    *
    * @param stream
    *          the stream to read the file from.
    * @param name
    *          associated name
    * @throws IOException
    *           When there is a problem reading the input stream.
    */
   public MultiTreeWordList(InputStream stream, String name) throws IOException {
     this.root = new MultiTextNode();
     this.costMap = new EditDistanceCostMap();

     if (name.endsWith(".mtwl")) {
       persistence.readMTWL(root, stream, ENCODING);
     } else if (name.endsWith(".txt")) {
       buildNewTree(stream, name);
     }
   }

   /**
    * Constructs a TreeWordList from a file with path = filename
    *
    * @param pathnames
    *          path of the file to create a TextWordList from
    * @param base
    *          - the relative base
    * @throws IOException
    *           When there is a problem reading a path.
    */
   public MultiTreeWordList(String[] pathnames, File base) throws IOException {
     this.root = new MultiTextNode();
     this.costMap = new EditDistanceCostMap();

     if (pathnames == null) {
       return;
     }

     for (String pathname : pathnames) {
       String name = getRelativePath(new File(pathname), base);
       load(new FileSystemResource(pathname), name);
     }
   }

   /**
    * @param files
    *          - the input files
    * @param base
    *          - the relative base
    * @throws IOException
    *           - When there is a problem reading the files.
    */
   public MultiTreeWordList(List<File> files, File base) throws IOException {
     this.root = new MultiTextNode();
     this.costMap = new EditDistanceCostMap();

     if (files == null) {
       return;
     }

     for (File file : files) {
       String name = getRelativePath(file, base);
       load(new FileSystemResource(file), name);
     }
   }

   private String getRelativePath(File file, File base) {
     if (base == null) {
       return file.getName();
     }
     Path filePath = file.toPath();
     Path basePath = base.toPath();
     Path relativize = basePath.relativize(filePath);
     String result = relativize.toString().replaceAll("\\\\", "/");
     return result;
   }

   /**
    * Load a resource in this word list.
    *
    * @param resource
    *          Resource to load. The resource's name must end with .txt or .mtwl.
    * @throws IOException
    *           When there is a problem reading the resource.
    */

   private void load(Resource resource) throws IOException {
     load(resource, resource.getFilename());
   }

   /**
    * Load a resource in this word list.
    *
    * @param resource
    *          Resource to load.
    * @param name
    *          - The resource's name must end with .txt or .mtwl.
    * @throws IOException
    *           When there is a problem reading the resource.
    */
   private void load(Resource resource, String name) throws IOException {
     InputStream stream = null;
     try {
       stream = resource.getInputStream();
       if (name == null) {
         throw new IllegalArgumentException("List does not have a name.");
       } else if (name.endsWith(".txt")) {
         buildNewTree(stream, name);
       } else if (name.endsWith(".mtwl")) {
         persistence.readMTWL(root, stream, "UTF-8");
       } else {
         throw new IllegalArgumentException(
                 "File name should end with .mtwl or .txt, found " + name);
       }
     } finally {
       IOUtils.closeQuietly(stream);
     }
   }

   /**
    * Creates a new Tree in the existing treeWordList from a file with path pathname
    *
    * @param stream
    *          - Input stream for the file containing the words for the treeWordList
    * @param name
    *          - Associated name for the file
    * @throws IOException
    *           When there is a problem reading the inputstream.
    */
   public void buildNewTree(InputStream stream, String name) throws IOException {
     BufferedReader br = new BufferedReader(new InputStreamReader(stream, ENCODING));
     String s = null;

     while ((s = br.readLine()) != null) {
       addWord(s.trim(), name);
     }
     stream.close();
     br.close();
   }

   /**
    * Add a new String into the MultiTreeWordList.
    *
    * @param s
    *          The String to add
    * @param type
    *          The type of the string.
    */
   public void addWord(String s, String type) {

     // Create Nodes from all chars of the strings besides the last one
     MultiTextNode pointer = root;

     for (Character each : s.toCharArray()) {

       MultiTextNode childNode = pointer.getChildNode(each);

       if (childNode == null) {
         childNode = new MultiTextNode(each, false);
         pointer.addChild(childNode);
       }

       pointer = childNode;
     }
     pointer.setWordEnd(s.length() > 0);
     pointer.addType(type);
   }

   /**
    * Returns all Types contained by the MultiTreeWordList.
    *
    * @return all Types contained by the MultiTreeWordList.
    */
   public Collection<String> getTypes() {
     return getTypeCone(root);
   }

   /**
    * Returns all types contained by the cone of the MultiTextNode node, including the types of node
    * itself.
    *
    * @param node
    *          The node where we start, the root of the cone.
    * @return all types contained by the cone of the MultiTextNode node, including the types of node
    *         itself.
    */
   public Collection<String> getTypeCone(MultiTextNode node) {
     // TODO improve this method!
     List<String> returnList = new LinkedList<String>();

     if (node.getTypes() != null) {
       for (String s : node.getTypes()) {
         if (!returnList.contains(s)) {
           returnList.add(s);
         }
       }
     }

     for (Character c : node.getChildren().keySet()) {
       for (String s : getTypeCone(node.getChildNode(c))) {
         if (!returnList.contains(s)) {
           returnList.add(s);
         }
       }
     }

     return returnList;
   }

   /**
    * Returns all strings contained by the MultiTreeWordList.
    *
    * @return All strings contained by the MultiTreeWordList.
    */
   public Collection<String> keySet() {
     List<String> keySet = new LinkedList<String>(keySet(root, ""));
     Collections.sort(keySet);
     return keySet;
   }

   /**
    * Returns all strings contained by the cone of the MultiTextNode node and uses prefix as the
    * prefix of all the strings.
    *
    * @param node
    *          the node we are considering.
    * @param prefix
    *          the prefix until now.
    * @return All strings contained by the cone of the MultiTextNode node.
    */
   private Collection<String> keySet(MultiTextNode node, String prefix) {

     List<String> resultList = new LinkedList<String>();

     // Recursion stop.
     if (node.isWordEnd()) {
       resultList.add(prefix);
     }

     // Recursion step.
     for (Character c : node.getChildren().keySet()) {
       String temp = prefix + String.valueOf(c);
       resultList.addAll(keySet(node.getChildNode(c), temp));
     }

     return resultList;
   }

   /**
    * Returns all types of the very string s.
    *
    * @param s
    *          The string with the types.
    * @return All types from the very string s.
    */
   public Collection<String> getTypes(String s) {
     return getTypes(s, false);
   }

   /**
    * Returns the types of the string s.
    *
    * @param s
    *          The string with the types.
    * @param ignoreCase
    *          Indicates, whether we search case sensitive or not.
    * @return The types of the string s.
    */
   public Collection<String> getTypes(String s, boolean ignoreCase) {

     // Collection<Set<String>> types = editDistanceClever(root, s, "", 0.0,
     // 0,
     // ignoreCase, false, costMap, false, false).values();
     // Map<String, Set<String>> types = editDistanceClever(root, s, "", 0.0,
     // 0, ignoreCase, false, costMap, false, false);
     Map<String, Set<String>> types = editDistance(s, 0, ignoreCase, "");
     Set<String> returnSet = new HashSet<String>();

     for (Entry<String, Set<String>> each : types.entrySet()) {
       returnSet.addAll(each.getValue());
     }

     return returnSet;
   }

   /**
    * Returns a list of types which belong to a string.
    *
    * @param string
    *          The string which types we want to have.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param ignoreLength
    *          If the length of the string is less than of equal to this, we search case insensitive.
    * @param edit
    *          Indicates whether we use an edit distance or not.
    * @param distance
    *          The edit distance to a string contained by the MultiTreeWordList.
    * @param ignoreToken
    *          Characters which can be ignored.
    * @return Returns a list of types which belong to a string.
    */

   @Override
   public List<String> contains(String string, boolean ignoreCase, int ignoreLength, boolean edit,
           double distance, String ignoreToken) {

     List<String> resultList = new LinkedList<String>();
     Map<String, Set<String>> editDistance;

     if (string.length() >= ignoreLength && ignoreCase) {
       editDistance = editDistance(string, (int) distance, true, ignoreToken, false);
     } else {
       editDistance = editDistance(string, (int) distance, false, ignoreToken, false);
     }
     for (Entry<String, Set<String>> each : editDistance.entrySet()) {
       resultList.addAll(each.getValue());
     }
     return resultList;
   }

   /**
    * Checks whether a string is contained by the MultiTreeWordList or not.
    *
    * @param string
    *          The string which is contained or not.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param ignoreLength
    *          If the length of the string is less than of equal to this, we search case insensitive.
    * @param edit
    *          Indicates whether we use an edit distance or not.
    * @param distance
    *          The edit distance to a string contained by the MultiTreeWordList.
    * @param ignoreToken
    *          Characters which can be ignored.
    * @return true, if the string is contained by the MultiTreeWordList, false otherwise.
    */
   public boolean containsBool(String string, boolean ignoreCase, int ignoreLength, boolean edit,
           double distance, String ignoreToken) {
     return editDistanceBool(root, string, "", distance, 0, ignoreCase, false, costMap);
   }

   /**
    * Checks whether the tree contains exaclty the string s.
    *
    * @param s
    *          The string which is contained or not.
    * @return True, if the TreeWordList contains exactly the string s, false otherwise.
    */
   public boolean contains(String s) {
     return contains(s, false);
   }

   /**
    * Checks whether the tree contains the string s.
    *
    * @param s
    *          The string which is contained or not.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @return True, if the TreeWordList contains the string s, false otherwise.
    */
   public boolean contains(String s, boolean ignoreCase) {
     return contains(s, ignoreCase, 0, new char[] {}, 0, true);
   }

   /**
    * Checks if the MultiTreeWordList contains the string s.
    *
    * @param s
    *          The string which is contained or not.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param size
    *          The index of the string.
    * @param ignoreChars
    *          Characters which can be ignored.
    * @param maxIgnoreChars
    *          The maximum number of ignored characters.
    * @return true, if TreeWordList contains the string, false otherwise.
    */
   @Override
   public boolean contains(String s, boolean ignoreCase, int size, char[] ignoreChars,
           int maxIgnoreChars, boolean ignoreWS) {

     EditDistanceCostMap edm = new EditDistanceCostMap();

     for (Character c : ignoreChars) {
       edm.setDeleteCosts(c, 0.0);
     }

     return editDistanceBool(root, s, "", maxIgnoreChars, 0, ignoreCase, false, edm);
   }

   /**
    * Checks if the MultiTreeWordList contains a prefix of the string s.
    *
    * @param s
    *          The string which is contained or not.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param size
    *          The index of the string.
    * @param ignoreChars
    *          Characters which can be ignored.
    * @param maxIgnoreChars
    *          The maximum number of ignored characters.
    * @return true, if TreeWordList contains a prefix of the string, false otherwise.
    */
   @Override
   public boolean containsFragment(String s, boolean ignoreCase, int size, char[] ignoreChars,
           int maxIgnoreChars, boolean ignoreWS) {
     MultiTextNode pointer = root;
     return recursiveContains(pointer, s, 0, ignoreCase && s.length() > size, true, ignoreChars,
             maxIgnoreChars);
   }

   /**
    * Checks whether prefix of a string is contained by the MultiTreeWordList or not.
    *
    * @param string
    *          The string whose prefix is contained or not.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param ignoreLength
    *          If the length of the string is less than of equal to this, we search case insensitive.
    * @param edit
    *          Indicates whether we use an edit distance or not.
    * @param distance
    *          The edit distance to a string contained by the MultiTreeWordList.
    * @param ignoreToken
    *          Characters which can be ignored.
    * @return true, if a prefix of the string is contained by the MultiTreeWordList, false otherwise.
    */
   public boolean containsFragmentBool(String string, boolean ignoreCase, int ignoreLength,
           boolean edit, double distance, String ignoreToken) {

     if (string.length() >= ignoreLength && ignoreCase) {
       return editDistanceBool(root, string, "", distance, 0, true, true, costMap);
     } else {
       return editDistanceBool(root, string, "", distance, 0, false, true, costMap);
     }
   }

   /**
    * Returns a list of types which belong to a prefix of a string that is contained by the
    * MultiTreeWordList.
    *
    * @param string
    *          The string whose prefix's types we are interested in.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param ignoreLength
    *          If the length of the string is less than of equal to this, we search case insensitive.
    * @param edit
    *          Indicates whether we use an edit distance or not.
    * @param distance
    *          The edit distance to a string contained by the MultiTreeWordList.
    * @param ignoreToken
    *          Characters which can be ignored.
    * @return A list of types which belong to a prefix of a string that is contained by the
    *         MultiTreeWordList.
    */
   @Override
   public List<String> containsFragment(String string, boolean ignoreCase, int ignoreLength,
           boolean edit, double distance, String ignoreToken) {

     List<String> resultList = new LinkedList<String>();
     Map<String, Set<String>> resultMap = null;

     if (!edit) {
       return recursiveContains2(root, string, 0, ignoreCase && string.length() > ignoreLength, true,
               ignoreToken.toCharArray(), ignoreLength);
     } else {
       if (string.length() >= ignoreLength && ignoreCase) {
         resultMap = editDistance(string, (int) distance, true, ignoreToken, true);
       } else {
         resultMap = editDistance(string, (int) distance, false, ignoreToken, true);
       }

       for (Set<String> set : resultMap.values()) {
         for (String s : set) {
           if (!resultList.contains(s)) {
             // resultList.addAll(resultMap.get(set));
             resultList.add(s);
           }
         }
       }
     }

     return resultList;
   }

   /**
    * Returns true, if the MultiTreeWordList contains the string text, false otherwise.
    *
    * @param pointer
    *          The MultiTextNode we are looking at.
    * @param text
    *          The string which is contained or not.
    * @param index
    *          The index of the string text we checked until now.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param fragment
    *          Indicates whether we are looking for a prefix of the string text.
    * @param ignoreChars
    *          Characters which can be ignored.
    * @param maxIgnoreChars
    *          Maximum number of characters which are allowed to be ignored.
    * @return True, if the TreeWordList contains the string text, false otherwise.
    */
   private List<String> recursiveContains2(MultiTextNode pointer, String text, int index,
           boolean ignoreCase, boolean fragment, char[] ignoreChars, int maxIgnoreChars) {

     if (pointer == null) {
       return null;
     }

     if (index == text.length()) {
       if (pointer.isWordEnd()) {
         return new ArrayList<String>(pointer.getTypes());
       }
       if (fragment) {
         return Collections.emptyList();
       }
     }

     char charAt = text.charAt(index);
     boolean charAtIgnored = false;

     if (ignoreChars != null) {
       for (char each : ignoreChars) {
         if (each == charAt) {
           charAtIgnored = true;
           break;
         }
       }
       charAtIgnored &= index != 0;
     }

     int next = ++index;

     if (ignoreCase) {

       // Lower Case Node.
       MultiTextNode childNodeL = pointer.getChildNode(Character.toLowerCase(charAt));
       if (childNodeL == null) {
         childNodeL = skipWS(pointer, Character.toLowerCase(charAt));
       }

       // Upper Case Node.
       MultiTextNode childNodeU = pointer.getChildNode(Character.toUpperCase(charAt));
       if (childNodeU == null) {
         childNodeU = skipWS(pointer, Character.toUpperCase(charAt));
       }

       if (charAtIgnored && childNodeL == null && childNodeU == null) {
         // Character is ignored and does not appear.
         return recursiveContains2(pointer, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars);
       } else {
         // Recursion.
         Collection<String> recursiveContainsL = recursiveContains2(childNodeL, text, next,
                 ignoreCase, fragment, ignoreChars, maxIgnoreChars);
         Collection<String> recursiveContainsU = recursiveContains2(childNodeU, text, next,
                 ignoreCase, fragment, ignoreChars, maxIgnoreChars);
         if (recursiveContainsL == null && recursiveContainsU == null) {
           return null;
         }
         List<String> result = new LinkedList<String>();
         if (recursiveContainsL != null) {
           result.addAll(recursiveContainsL);
         }
         if (recursiveContainsU != null) {
           result.addAll(recursiveContainsU);
         }
         return result;
       }

     } else {
       // Case sensitive.
       MultiTextNode childNode = pointer.getChildNode(charAt);

       if (charAtIgnored && childNode == null) {
         // Recursion with incremented index.
         return recursiveContains2(pointer, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars);
       } else {
         // Recursion with new node.
         return recursiveContains2(childNode, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars);
       }
     }
   }

   private MultiTextNode skipWS(MultiTextNode pointer, char charAt) {
     MultiTextNode childNode = pointer.getChildNode(' ');
     if (childNode != null) {
       MultiTextNode node = childNode.getChildNode(charAt);
       if (node == null) {
         return skipWS(childNode, charAt);
       } else {
         return node;
       }
     }
     return null;
   }

   /**
    * Returns true, if the MultiTreeWordList contains the string text, false otherwise.
    *
    * @param pointer
    *          The MultiTextNode we are looking at.
    * @param text
    *          The string which is contained or not.
    * @param index
    *          The index of the string text we checked until now.
    * @param ignoreCase
    *          Indicates whether we search case sensitive or not.
    * @param fragment
    *          Indicates whether we are looking for a prefix of the string text.
    * @param ignoreChars
    *          Characters which can be ignored.
    * @param maxIgnoreChars
    *          Maximum number of characters which are allowed to be ignored.
    * @return True, if the TreeWordList contains the string text, false otherwise.
    */
   private boolean recursiveContains(MultiTextNode pointer, String text, int index,
           boolean ignoreCase, boolean fragment, char[] ignoreChars, int maxIgnoreChars) {

     if (pointer == null) {
       return false;
     }

     if (index == text.length()) {
       return fragment || pointer.isWordEnd();
     }

     char charAt = text.charAt(index);
     boolean charAtIgnored = false;

     if (ignoreChars != null) {
       for (char each : ignoreChars) {
         if (each == charAt) {
           charAtIgnored = true;
           break;
         }
       }
       charAtIgnored &= index != 0;
     }

     int next = ++index;

     if (ignoreCase) {

       // Lower Case Node.
       MultiTextNode childNodeL = pointer.getChildNode(Character.toLowerCase(charAt));

       // Upper Case Node.
       MultiTextNode childNodeU = pointer.getChildNode(Character.toUpperCase(charAt));

       if (charAtIgnored && childNodeL == null && childNodeU == null) {
         // Character is ignored and does not appear.
         return recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars);
       } else {
         // Recursion.
         return recursiveContains(childNodeL, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars)
                 || recursiveContains(childNodeU, text, next, ignoreCase, fragment, ignoreChars,
                         maxIgnoreChars);
       }

     } else {
       // Case sensitive.
       MultiTextNode childNode = pointer.getChildNode(charAt);

       if (charAtIgnored && childNode == null) {
         // Recursion with incremented index.
         return recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars);
       } else {
         // Recursion with new node.
         return recursiveContains(childNode, text, next, ignoreCase, fragment, ignoreChars,
                 maxIgnoreChars);
       }
     }
   }

   @Override
   public Collection<AnnotationFS> find(RutaStream stream, Map<String, Object> typeMap,
           boolean ignoreCase, int ignoreLength, boolean edit, double distance, String ignoreToken) {

     Collection<AnnotationFS> results = new HashSet<AnnotationFS>();
     stream.moveToFirst();
     RutaStream streamPointer = stream.copy();

     while (stream.isValid()) {
       RutaBasic anchorBasic = (RutaBasic) stream.get();
       streamPointer.moveTo(anchorBasic);

       List<RutaBasic> basicsToAdd = new ArrayList<RutaBasic>();
       basicsToAdd.add(anchorBasic);
       String text = anchorBasic.getCoveredText();
       StringBuilder candidate = new StringBuilder(text);
       String lastCandidate = candidate.toString();

       if (text.length() != 1 || !ignoreToken.contains(text)) {

         List<AnnotationFS> interResults = new ArrayList<AnnotationFS>();

         while (streamPointer.isValid()) {

           boolean skip = false;
           String currentBasicText = basicsToAdd.get(basicsToAdd.size() - 1).getCoveredText();
           if (currentBasicText.length() == 1 && ignoreToken.contains(currentBasicText)) {
             skip = true;
           }
           List<String> types = null;
           if (!skip) {
             types = containsFragment(candidate.toString(), ignoreCase, ignoreLength, edit, distance,
                     ignoreToken);
           }
           if (skip || types != null) {
             streamPointer.moveToNext();
             if (streamPointer.isValid()) {
               RutaBasic next = (RutaBasic) streamPointer.get();
               // List<String> contains = contains(candidate,
               // ignoreCase,
               // ignoreLength, edit, distance, ignoreToken);
               if (!skip) {
                 tryToCreateAnnotation(types, stream, results, basicsToAdd, candidate.toString(),
                         interResults, ignoreCase, ignoreLength, edit, distance, ignoreToken,
                         typeMap);
               }
               lastCandidate = candidate.toString();
               candidate.append(next.getCoveredText());
               basicsToAdd.add(next);

             } else {
               // !streamPointer.isValid();
               tryToCreateAnnotation(types, stream, results, basicsToAdd, lastCandidate,
                       interResults, ignoreCase, ignoreLength, edit, distance, ignoreToken, typeMap);
             }
           } else {

             // containsFragment.isEmpty();
             // basicsToAdd.remove(basicsToAdd.size() - 1);
             // tryToCreateAnnotation(stream, results, basicsToAdd,
             // lastCandidate, interResults, ignoreCase,
             // ignoreLength, edit, distance, ignoreToken, typeMap);

             // breaks inner while()-loop.
             break;
           }

         }
       }
       stream.moveToNext();
     }

     return results;
   }

   @Override
   public List<AnnotationFS> find(RutaStream stream, boolean ignoreCase, int size,
           char[] ignoreChars, int maxIgnoredChars, boolean ignoreWS) {
     assert false;
     return new ArrayList<AnnotationFS>();
   }

   private void tryToCreateAnnotation(List<String> types, RutaStream stream,
           Collection<AnnotationFS> results, List<RutaBasic> basicsToAdd, String lastCandidate,
           List<AnnotationFS> interResult, boolean ignoreCase, int ignoreLength, boolean edit,
           double distance, String ignoreToken, Map<String, Object> map) {
     if (basicsToAdd.size() >= 1 && types != null) {
       Set<String> set = new HashSet<String>(types);
       for (String each : set) {
         Object o = map.get(each);
         if (o instanceof Type) {
           Type type = (Type) o;
           int begin = basicsToAdd.get(0).getBegin();
           int end = basicsToAdd.get(basicsToAdd.size() - 1).getEnd();
           AnnotationFS newFS = stream.getCas().createAnnotation(type, begin, end);
           results.add(newFS);
         } else if (o instanceof List) {
           List<?> list = (List<?>) o;
           Type type = null;
           String featureString = null;
           Object value = each;
           if (list.size() == 2 || list.size() == 3) {
             if (list.get(0) instanceof Type) {
               type = (Type) list.get(0);
             }
             if (list.get(1) instanceof String) {
               featureString = (String) list.get(1);
             }
             if (list.size() == 3) {
               value = list.get(2);
             }

             if (type != null && featureString != null) {
               int begin = basicsToAdd.get(0).getBegin();
               int end = basicsToAdd.get(basicsToAdd.size() - 1).getEnd();
               AnnotationFS newFS = stream.getCas().createAnnotation(type, begin, end);
               Feature feature = type.getFeatureByBaseName(featureString);
               setFeatureValue(newFS, feature, value, stream);
               results.add(newFS);
             }
           }
         }
       }
     } else if (interResult != null && !interResult.isEmpty()) {
       results.addAll(interResult);
     }
   }

   private void setFeatureValue(AnnotationFS annotationFS, Feature feature, Object o,
           RutaStream stream) {
     TypeSystem typeSystem = stream.getCas().getTypeSystem();
     if (feature != null && o != null) {
       Type range = feature.getRange();
       String rangeName = range.getName();
       if (typeSystem.subsumes(typeSystem.getType(CAS.TYPE_NAME_STRING), range)
               && o instanceof String) {
         annotationFS.setStringValue(feature, (String) o);
       } else if (rangeName.equals(CAS.TYPE_NAME_INTEGER) && o instanceof Number) {
         annotationFS.setIntValue(feature, ((Number) o).intValue());
       } else if (rangeName.equals(CAS.TYPE_NAME_DOUBLE) && o instanceof Number) {
         annotationFS.setDoubleValue(feature, ((Number) o).doubleValue());
       } else if (rangeName.equals(CAS.TYPE_NAME_FLOAT) && o instanceof Number) {
         annotationFS.setFloatValue(feature, ((Number) o).floatValue());
       } else if (rangeName.equals(CAS.TYPE_NAME_BYTE) && o instanceof Number) {
         annotationFS.setByteValue(feature, ((Number) o).byteValue());
       } else if (rangeName.equals(CAS.TYPE_NAME_SHORT) && o instanceof Number) {
         annotationFS.setShortValue(feature, ((Number) o).shortValue());
       } else if (rangeName.equals(CAS.TYPE_NAME_LONG) && o instanceof Number) {
         annotationFS.setLongValue(feature, ((Number) o).longValue());
       } else if (rangeName.equals(CAS.TYPE_NAME_BOOLEAN) && o instanceof Boolean) {
         annotationFS.setBooleanValue(feature, (Boolean) o);
       } else if (typeSystem.subsumes(typeSystem.getType(CAS.TYPE_NAME_STRING), range)
               & o instanceof Type) {
         annotationFS.setStringValue(feature, ((Type) o).getName());
       }
     } else {
       throw new IllegalArgumentException(
               "Not able to assign feature value: " + o + " -> " + feature);
     }
   }

   /**
    * Returns a map with all strings with a specified edit distance to the string query as keys and
    * the files they belong to as values.
    *
    * @param query
    *          - The query string.
    * @param distance
    *          - The specified edit distance.
    * @return A map with all strings with a specified edit distance to the string query as keys and
    *         the files they belong to as values.
    */
   public Map<String, Set<String>> editDistance(String query, int distance) {
     return editDistance(query, distance, false, "");
   }

   /**
    * Returns a map with all strings with a specified edit distance to the string query as keys and
    * the files they belong to as values.
    *
    * @param query
    *          - The query string.
    * @param distance
    *          - The specified edit distance.
    * @param ignoreCase
    *          - Indicates whether we search case sensitive or not.
    * @param ignoreToken
    *          - Indicates the characters to ignore
    * @return A map with all strings with a specified edit distance to the string query as keys and
    *         the files they belong to as values.
    */
   public Map<String, Set<String>> editDistance(String query, int distance, boolean ignoreCase,
           String ignoreToken) {
     return editDistance(query, distance, ignoreCase, ignoreToken, false);
   }

   /**
    * Returns a map with all strings with a specified edit distance to the string query as keys and
    * the files they belong to as values.
    *
    * @param query
    *          - The query string.
    * @param distance
    *          - The specified edit distance.
    * @param ignoreCase
    *          - Indicates whether we search case sensitive or not.
    * @param ignoreToken
    *          - the characters to ignore
    * @param fragment
    *          - Indicates whether we search for fragments of the query string or not.
    * @return A map with all strings with a specified edit distance to the string query as keys and
    *         the files they belong to as values.
    */
   public Map<String, Set<String>> editDistance(String query, int distance, boolean ignoreCase,
           String ignoreToken, boolean fragment) {

     // The second alternative realizes the fragment functionality by
     // setting the insert costs of the ignored character to zero. This
     // is much more elegant and easier to maintain. I don't know if the
     // other way is faster, so I did not delete it yet.

     Map<Character, Double> oldInsertCosts = new HashMap<Character, Double>();
     EditDistanceCostMap edcm = new EditDistanceCostMap();

     // We need to store the old insert costs before we set them to zero.
     for (char c : ignoreToken.toCharArray()) {
       oldInsertCosts.put(c, edcm.getInsertCosts(c));
       edcm.setInsertCosts(c, 0.0);
     }

     Map<String, Set<String>> result = null;

     if (ignoreCase) {
       result = editDistanceClever(root, query.toLowerCase(), "", distance, 0, true, fragment, edcm,
               false, false);
     } else {
       result = editDistanceClever(root, query, "", distance, 0, false, fragment, edcm, false,
               false);
     }

     // Restoring of the old insert costs.
     for (Entry<Character, Double> c : oldInsertCosts.entrySet()) {
       edcm.setDeleteCosts(c.getKey(), c.getValue());
     }

     return result;
   }

   /**
    * Returns a map with all strings with a specified edit distance to the string query as keys and
    * the files they belong to as values.
    *
    * @param node
    *          - The MultiTextNode which is under consideration at the moment.
    * @param query
    *          - The query string.
    * @param result
    *          - The result which matched until now.
    * @param distance
    *          - The remaining edit distance.
    * @param index
    *          - The index of the query string at the moment.
    * @param ignoreCase
    *          - Indicates whether we search case sensitive or not.
    * @param fragment
    *          - Indicates whether we search for fragments of the query string or not.
    * @param edm
    *          - The edit distance cost map we are using.
    * @param lastActionInsert
    *          - Indicates whether the last action was an insert action.
    * @param lastActionDelete
    *          - Indicates whether the last action was a delete action.
    * @return A map with all strings with a specified edit distance to the string query as keys and
    *         the files they belong to as values.
    */
   private Map<String, Set<String>> editDistanceClever(MultiTextNode node, String query,
           String result, double distance, int index, boolean ignoreCase, boolean fragment,
           EditDistanceCostMap edm, boolean lastActionInsert, boolean lastActionDelete) {

     EditDistanceResultMap resultMap = new EditDistanceResultMap();

     if (!lastActionInsert) {
       // Delete.
       if (distance - edm.getDeleteCosts(node.getValue()) >= 0 && result.length() > 0) {
         resultMap.putAll(editDistanceClever(node, query, result,
                 distance - edm.getDeleteCosts(node.getValue()), index + 1, ignoreCase, fragment,
                 edm, false, true));
       }
     }

     // Recursion stop.
     if (node.isWordEnd() || fragment) {

       HashMap<String, Set<String>> temp = new HashMap<String, Set<String>>();

       double remainingInsertCosts = 0.0;

       // Accumulating remaining insert costs if the query is longer than
       // the word in the trie.
       for (int i = index; i < query.length(); i++) {
         remainingInsertCosts += edm.getInsertCosts(query.charAt(i));
       }

       if (remainingInsertCosts <= distance) {
         // if (remainingInsertCosts <= distance &&
         // !node.getTypes().isEmpty()) {
         // if (query.length() - index <= distance) {

         if (fragment) {
           temp.put(result, new HashSet<String>(getTypeCone(node)));
         } else {
           temp.put(result, new HashSet<String>(node.getTypes()));
         }

         resultMap.putAll(temp);
       }

       // Important: word end does not mean no children any more!
       if (node.getChildren() == null) {
         return resultMap;
       }
     }

     // Recursion.
     for (MultiTextNode tempNode : node.getChildren().values()) {

       if (index < query.length()) {
         if (ignoreCase) {
           if (Character.toLowerCase(tempNode.getValue()) == Character
                   .toLowerCase(query.charAt(index))) {
             resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
                     distance, index + 1, ignoreCase, fragment, edm, false, false));
           }
         } else {
           if (tempNode.getValue() == query.charAt(index)) {
             resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
                     distance, index + 1, ignoreCase, fragment, edm, false, false));
           }
         }
       }

       if (distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()) >= 0) {

         // Substitute.
         resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
                 distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()), index + 1,
                 ignoreCase, fragment, edm, false, false));
       }

       if (!lastActionDelete) {
         if (distance - edm.getInsertCosts(tempNode.getValue()) >= 0) {
           // Insert - use the same index twice.
           resultMap.putAll(editDistanceClever(tempNode, query, result + tempNode.getValue(),
                   distance - edm.getInsertCosts(tempNode.getValue()), index, ignoreCase, fragment,
                   edm, true, false));
         }
       }
     }

     return resultMap;
   }

   /**
    * Checks if a string is contained by the MultiTreeWordList.
    *
    * @param node
    *          - The MultiTextNode which is under consideration at the moment.
    * @param query
    *          - The query string.
    * @param result
    *          - The result which matched until now.
    * @param distance
    *          - The remaining edit distance.
    * @param index
    *          - The index of the query string at the moment.
    * @param ignoreCase
    *          - Indicates whether we search case sensitive or not.
    * @param fragment
    *          - Indicates whether we search for fragments of the query string or not.
    * @param edm
    *          - The edit distance cost map we are using.
    * @return A map with all strings with a specified edit distance to the string query as keys and
    *         the files they belong to as values.
    */
   private boolean editDistanceBool(MultiTextNode node, String query, String result, double distance,
           int index, boolean ignoreCase, boolean fragment, EditDistanceCostMap edm) {

     boolean deletion = false;
     boolean insertion = false;
     boolean substitution = false;
     boolean noop = false;

     // Recursion stop.
     if (fragment) {
       if (index == query.length()) {
         return true;
       }
     }

     if (node.isWordEnd()) {

       double remainingInsertCosts = 0.0;

       // Accumulating remaining insert costs if the query is longer than
       // the word in the trie.
       for (int i = index; i < query.length(); i++) {
         remainingInsertCosts += edm.getInsertCosts(query.charAt(i));
       }

       if (remainingInsertCosts <= distance) {
         // if (query.length() - index <= distance) {
         return true;
       }
     }

     // Delete.
     if (distance - edm.getDeleteCosts(node.getValue()) >= 0 && result.length() > 0) {
       deletion = editDistanceBool(node, query, result,
               distance - edm.getDeleteCosts(node.getValue()), index + 1, ignoreCase, fragment, edm);

       if (deletion) {
         return true;
       }
     }

     // Recursion.
     for (MultiTextNode tempNode : node.getChildren().values()) {

       if (index < query.length()) {
         if (ignoreCase) {
           if (Character.toLowerCase(tempNode.getValue()) == Character
                   .toLowerCase(query.charAt(index))) {
             noop = editDistanceBool(tempNode, query, result + tempNode.getValue(), distance,
                     index + 1, ignoreCase, fragment, edm);
           }
         } else {
           if (tempNode.getValue() == query.charAt(index)) {
             noop = editDistanceBool(tempNode, query, result + tempNode.getValue(), distance,
                     index + 1, ignoreCase, fragment, edm);
           }
         }

         if (noop) {
           return true;
         }
       }

       if (distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()) >= 0) {

         // Substitute.
         substitution = editDistanceBool(tempNode, query, result + tempNode.getValue(),
                 distance - edm.getReplaceCosts(node.getValue(), tempNode.getValue()), index + 1,
                 ignoreCase, fragment, edm);

         if (substitution) {
           return true;
         }
       }

       if (distance - edm.getInsertCosts(tempNode.getValue()) >= 0) {
         // Insert - use the same index twice.
         insertion = editDistanceBool(tempNode, query, result + tempNode.getValue(),
                 distance - edm.getInsertCosts(tempNode.getValue()), index, ignoreCase, fragment,
                 edm);

         if (insertion) {
           return true;
         }
       }

     }

     return false;
   }

   @Override
   public int hashCode() {
     final int prime = 31;
     int result = 1;
     result = prime * result + ((costMap == null) ? 0 : costMap.hashCode());
     result = prime * result + ((root == null) ? 0 : root.hashCode());
     return result;
   }

   @Override
   public boolean equals(Object obj) {
     if (this == obj)
       return true;
     if (obj == null)
       return false;
     if (getClass() != obj.getClass())
       return false;
     MultiTreeWordList other = (MultiTreeWordList) obj;
     if (costMap == null) {
       if (other.costMap != null)
         return false;
     } else if (!costMap.equals(other.costMap))
       return false;
     if (root == null) {
       if (other.root != null)
         return false;
     } else if (!root.equals(other.root))
       return false;
     return true;
   }

   public void createMTWLFile(String path, boolean compress, String encoding) throws IOException {
     persistence.createMTWLFile(root, path, compress, encoding);
   }

 }