blob: c382c38bac29ea1448c4599b4542e93862e9beb2 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.resource;
import java.io.BufferedReader;
import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Map.Entry;
import java.util.Set;
import org.apache.commons.io.IOUtils;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.ruta.RutaStream;
import org.apache.uima.ruta.type.RutaBasic;
import org.springframework.core.io.FileSystemResource;
import org.springframework.core.io.Resource;
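/**
 * Word list backed by a character tree: every word is stored as a path of {@link MultiTextNode}s
 * below {@link #root}, and the node that ends a word carries the names ("types") under which the
 * word was added.
 */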
public class MultiTreeWordList implements RutaWordList {
/** Character encoding used when reading word list resources. */
private static final String ENCODING = "UTF-8";
/** Helper used to read persisted {@code .mtwl} content into the tree. */
private MultiTreeWordListPersistence persistence = new MultiTreeWordListPersistence();
/** The root of the TreeWordList. */
protected MultiTextNode root;
/** The cost model we are using. */
private EditDistanceCostMap costMap;
/**
 * Creates an empty word list by delegating to the path-array constructor with an empty array of
 * paths and no base directory.
 *
 * @throws IOException
 *           declared because the delegated constructor declares it; no path is read here
 */
public MultiTreeWordList() throws IOException {
  this(new String[0], null);
}
/**
 * Creates a word list from a single file system path by wrapping it in a
 * {@link FileSystemResource} and delegating to the resource constructor.
 *
 * @param pathname
 *          path of the file (or directory) to load
 * @param base
 *          the relative base; accepted for signature compatibility but not used by this
 *          constructor
 * @throws IOException
 *           when there is a problem reading pathname
 */
public MultiTreeWordList(String pathname, File base) throws IOException {
  this(new Resource[] { new FileSystemResource(pathname) });
}
/**
 * Creates a word list from the given resources. A resource that resolves to a directory on the
 * file system contributes every file directly inside that directory; any other resource is
 * loaded as a single list.
 *
 * @param lists
 *          resources to load; {@code null} yields an empty word list
 * @throws IOException
 *           when there is a problem reading a resource, or when the content of a directory
 *           resource cannot be listed
 */
public MultiTreeWordList(Resource... lists) throws IOException {
  this.root = new MultiTextNode();
  this.costMap = new EditDistanceCostMap();
  if (lists == null) {
    return;
  }
  for (Resource list : lists) {
    // Check whether the resource is a directory.
    File directory;
    try {
      directory = list.getFile();
    } catch (IOException e) {
      // The resource is not available on the file system, so it cannot be a directory.
      directory = null;
    }
    if (directory != null && directory.isDirectory()) {
      // File.listFiles() returns null on an I/O error; report it instead of failing with a
      // NullPointerException in the loop below.
      File[] content = directory.listFiles();
      if (content == null) {
        throw new IOException("Unable to list the content of directory " + directory);
      }
      for (File data : content) {
        load(new FileSystemResource(data));
      }
    } else {
      // Not a directory: load the resource itself.
      load(list);
    }
  }
}
/**
 * Constructor from an open stream. The stream is always closed before this constructor returns
 * or throws, including when the name has an unsupported extension.
 *
 * @param stream
 *          the stream to read the list from
 * @param name
 *          associated name; content is only read when it ends with {@code .mtwl} or
 *          {@code .txt}, any other name leaves the word list empty
 * @throws IOException
 *           when there is a problem reading the input stream
 */
public MultiTreeWordList(InputStream stream, String name) throws IOException {
  this.root = new MultiTextNode();
  this.costMap = new EditDistanceCostMap();
  try {
    if (name.endsWith(".mtwl")) {
      persistence.readMTWL(root, stream, ENCODING);
    } else if (name.endsWith(".txt")) {
      buildNewTree(stream, name);
    }
  } finally {
    // Honour the documented contract on every path; closing an already closed stream is harmless.
    IOUtils.closeQuietly(stream);
  }
}
/**
* Constructs a TreeWordList from a file with path = filename
*
* @param pathnames
* path of the file to create a TextWordList from
* @param base
* - the relative base
* @throws IOException
* When there is a problem reading a path.
*/
public MultiTreeWordList(String[] pathnames, File base) throws IOException {
this.root = new MultiTextNode();
this.costMap = new EditDistanceCostMap();
if (pathnames == null) {
return;
}
for (String pathname : pathnames) {
String name = getRelativePath(new File(pathname), base);
load(new FileSystemResource(pathname), name);
}
}
/**
* @param files
* - the input files
* @param base
* - the relative base
* @throws IOException
* - When there is a problem reading the files.
*/
public MultiTreeWordList(List<File> files, File base) throws IOException {
this.root = new MultiTextNode();
this.costMap = new EditDistanceCostMap();
if (files == null) {
return;
}
for (File file : files) {
String name = getRelativePath(file, base);
load(new FileSystemResource(file), name);
}
}
/**
 * Computes the name under which a file is registered: its path relative to {@code base} with
 * backslashes replaced by forward slashes, or just the file name when no base is given.
 *
 * @param file
 *          the file to name
 * @param base
 *          the directory the name is made relative to, may be {@code null}
 * @return the relative, slash-separated name
 */
private String getRelativePath(File file, File base) {
  if (base == null) {
    return file.getName();
  }
  Path target = file.toPath();
  Path origin = base.toPath();
  String relative = origin.relativize(target).toString();
  return relative.replace('\\', '/');
}
/**
 * Loads a resource into this word list, using the resource's own file name as list name.
 *
 * @param resource
 *          resource to load; its file name must end with .txt or .mtwl
 * @throws IOException
 *           when there is a problem reading the resource
 */
private void load(Resource resource) throws IOException {
  String listName = resource.getFilename();
  load(resource, listName);
}
/**
 * Loads a resource into this word list under the given name. The name's extension selects the
 * reader: {@code .txt} content is added line by line, {@code .mtwl} content is handed to the
 * persistence helper. The opened stream is closed quietly in every case.
 *
 * @param resource
 *          resource to load
 * @param name
 *          name of the list; must end with .txt or .mtwl
 * @throws IOException
 *           when there is a problem reading the resource
 * @throws IllegalArgumentException
 *           when the name is {@code null} or has an unsupported extension
 */
private void load(Resource resource, String name) throws IOException {
  InputStream in = null;
  try {
    in = resource.getInputStream();
    if (name == null) {
      throw new IllegalArgumentException("List does not have a name.");
    }
    boolean plainText = name.endsWith(".txt");
    if (!plainText && !name.endsWith(".mtwl")) {
      throw new IllegalArgumentException(
          "File name should end with .mtwl or .txt, found " + name);
    }
    if (plainText) {
      buildNewTree(in, name);
    } else {
      persistence.readMTWL(root, in, "UTF-8");
    }
  } finally {
    IOUtils.closeQuietly(in);
  }
}
/**
 * Reads the stream line by line and adds every trimmed line as a word of the list
 * {@code name}. The stream and the reader wrapped around it are closed when this method
 * returns, also when reading fails.
 *
 * @param stream
 *          input stream containing one word per line
 * @param name
 *          name of the list the words are registered under
 * @throws IOException
 *           when there is a problem reading the input stream
 */
public void buildNewTree(InputStream stream, String name) throws IOException {
  // try-with-resources guarantees that both resources are released on every exit path.
  try (InputStream in = stream;
      BufferedReader reader = new BufferedReader(new InputStreamReader(in, ENCODING))) {
    String line;
    while ((line = reader.readLine()) != null) {
      addWord(line.trim(), name);
    }
  }
}
/**
 * Adds a word to the tree, creating the missing nodes along its character path, and registers
 * {@code type} on the node reached at the end. For an empty string that node is the root, which
 * is then not marked as a word end but still receives the type.
 *
 * @param s
 *          the word to add
 * @param type
 *          the type (list name) of the word
 */
public void addWord(String s, String type) {
  MultiTextNode current = root;
  final int length = s.length();
  for (int i = 0; i < length; i++) {
    Character symbol = Character.valueOf(s.charAt(i));
    MultiTextNode next = current.getChildNode(symbol);
    if (next == null) {
      // No node for this character yet: extend the path.
      next = new MultiTextNode(symbol, false);
      current.addChild(next);
    }
    current = next;
  }
  current.setWordEnd(length > 0);
  current.addType(type);
}
/**
 * Returns all types stored anywhere in this word list, i.e. the type cone of the root node.
 *
 * @return all types contained by the MultiTreeWordList
 */
public Collection<String> getTypes() {
  Collection<String> allTypes = getTypeCone(root);
  return allTypes;
}
/**
 * Returns all types contained by the cone of the given node, including the types of the node
 * itself. Every type is reported once, in the order in which a depth-first walk (node first,
 * then its children) encounters it.
 *
 * @param node
 *          the node where we start, the root of the cone
 * @return the distinct types of the node and all of its descendants
 */
public Collection<String> getTypeCone(MultiTextNode node) {
  List<String> ordered = new ArrayList<String>();
  collectTypeCone(node, new HashSet<String>(), ordered);
  return ordered;
}

/**
 * Walks the cone of {@code node} depth-first and appends every type not seen before to
 * {@code ordered}; {@code seen} gives constant-time duplicate detection.
 */
private void collectTypeCone(MultiTextNode node, Set<String> seen, List<String> ordered) {
  if (node.getTypes() != null) {
    for (String type : node.getTypes()) {
      if (seen.add(type)) {
        ordered.add(type);
      }
    }
  }
  for (Character c : node.getChildren().keySet()) {
    collectTypeCone(node.getChildNode(c), seen, ordered);
  }
}
/**
 * Returns all words contained by the MultiTreeWordList, sorted by their natural order.
 *
 * @return all strings contained by the MultiTreeWordList
 */
public Collection<String> keySet() {
  List<String> sorted = new LinkedList<String>();
  sorted.addAll(keySet(root, ""));
  Collections.sort(sorted);
  return sorted;
}
/**
 * Collects all words below the given node. {@code prefix} is the string spelled by the path
 * leading to the node and is prepended to every word found.
 *
 * @param node
 *          the node we are considering
 * @param prefix
 *          the characters on the path down to the node
 * @return all strings contained by the cone of the node
 */
private Collection<String> keySet(MultiTextNode node, String prefix) {
  List<String> words = new LinkedList<String>();
  if (node.isWordEnd()) {
    // The path down to this node spells a complete word.
    words.add(prefix);
  }
  for (Character symbol : node.getChildren().keySet()) {
    MultiTextNode child = node.getChildNode(symbol);
    words.addAll(keySet(child, prefix + symbol));
  }
  return words;
}
/**
 * Returns the types of exactly the string s, searching case sensitively.
 *
 * @param s
 *          the string to look up
 * @return all types of the string s
 */
public Collection<String> getTypes(String s) {
  final boolean ignoreCase = false;
  return getTypes(s, ignoreCase);
}
/**
 * Returns the types of the string s, looked up with an edit distance of zero and no ignored
 * characters.
 *
 * @param s
 *          the string to look up
 * @param ignoreCase
 *          whether the lookup ignores case
 * @return the union of the types of all matches
 */
public Collection<String> getTypes(String s, boolean ignoreCase) {
  Map<String, Set<String>> matches = editDistance(s, 0, ignoreCase, "");
  Set<String> union = new HashSet<String>();
  for (Set<String> typesOfMatch : matches.values()) {
    union.addAll(typesOfMatch);
  }
  return union;
}
/**
 * Returns the types of all entries that match the string within the given edit distance. The
 * per-entry type sets are concatenated, so a type may occur more than once.
 *
 * @param string
 *          the string to look up
 * @param ignoreCase
 *          whether case may be ignored
 * @param ignoreLength
 *          case is only ignored when {@code ignoreCase} is set and the string has at least this
 *          many characters
 * @param edit
 *          not evaluated by this method
 * @param distance
 *          the allowed edit distance; truncated to an int
 * @param ignoreToken
 *          characters which can be ignored
 * @return the types which belong to the string
 */
@Override
public List<String> contains(String string, boolean ignoreCase, int ignoreLength, boolean edit,
    double distance, String ignoreToken) {
  boolean caseInsensitive = string.length() >= ignoreLength && ignoreCase;
  Map<String, Set<String>> matches = editDistance(string, (int) distance, caseInsensitive,
      ignoreToken, false);
  List<String> typeNames = new LinkedList<String>();
  for (Set<String> typesOfMatch : matches.values()) {
    typeNames.addAll(typesOfMatch);
  }
  return typeNames;
}
/**
 * Checks whether a string is contained by the MultiTreeWordList, using the instance's cost map.
 *
 * @param string
 *          the string to look up
 * @param ignoreCase
 *          whether the lookup ignores case
 * @param ignoreLength
 *          not evaluated by this method
 * @param edit
 *          not evaluated by this method
 * @param distance
 *          the allowed edit distance to an entry of the MultiTreeWordList
 * @param ignoreToken
 *          not evaluated by this method
 * @return true, if the string is contained by the MultiTreeWordList, false otherwise
 */
public boolean containsBool(String string, boolean ignoreCase, int ignoreLength, boolean edit,
    double distance, String ignoreToken) {
  final boolean fragment = false;
  return editDistanceBool(root, string, "", distance, 0, ignoreCase, fragment, costMap);
}
/**
 * Checks whether the tree contains exactly the string s (case sensitive).
 *
 * @param s
 *          the string to look up
 * @return true, if the TreeWordList contains exactly the string s, false otherwise
 */
public boolean contains(String s) {
  final boolean ignoreCase = false;
  return contains(s, ignoreCase);
}
/**
 * Checks whether the tree contains the string s, without any ignored characters.
 *
 * @param s
 *          the string to look up
 * @param ignoreCase
 *          whether the lookup ignores case
 * @return true, if the TreeWordList contains the string s, false otherwise
 */
public boolean contains(String s, boolean ignoreCase) {
  char[] noIgnoredChars = new char[0];
  return contains(s, ignoreCase, 0, noIgnoredChars, 0, true);
}
/**
 * Checks if the MultiTreeWordList contains the string s. The lookup uses a fresh cost map in
 * which deleting any of the ignored characters is free, and {@code maxIgnoreChars} as the
 * available edit distance.
 *
 * @param s
 *          the string to look up
 * @param ignoreCase
 *          whether the lookup ignores case
 * @param size
 *          not evaluated by this method
 * @param ignoreChars
 *          characters which can be ignored; {@code null} is treated like an empty array
 * @param maxIgnoreChars
 *          the maximum number of ignored characters
 * @param ignoreWS
 *          not evaluated by this method
 * @return true, if TreeWordList contains the string, false otherwise
 */
@Override
public boolean contains(String s, boolean ignoreCase, int size, char[] ignoreChars,
    int maxIgnoreChars, boolean ignoreWS) {
  EditDistanceCostMap edm = new EditDistanceCostMap();
  // The recursive lookups of this class accept a null array, so this entry point does as well.
  if (ignoreChars != null) {
    for (char c : ignoreChars) {
      edm.setDeleteCosts(c, 0.0);
    }
  }
  return editDistanceBool(root, s, "", maxIgnoreChars, 0, ignoreCase, false, edm);
}
/**
 * Checks whether the string s can be followed from the root of the tree, i.e. whether it is a
 * prefix of (or equal to) an entry.
 *
 * @param s
 *          the string to look up
 * @param ignoreCase
 *          whether case may be ignored
 * @param size
 *          case is only ignored for strings longer than this
 * @param ignoreChars
 *          characters which can be ignored
 * @param maxIgnoreChars
 *          the maximum number of ignored characters
 * @param ignoreWS
 *          not evaluated by this method
 * @return true, if the string matches as a fragment, false otherwise
 */
@Override
public boolean containsFragment(String s, boolean ignoreCase, int size, char[] ignoreChars,
    int maxIgnoreChars, boolean ignoreWS) {
  boolean caseInsensitive = ignoreCase && s.length() > size;
  return recursiveContains(root, s, 0, caseInsensitive, true, ignoreChars, maxIgnoreChars);
}
/**
 * Checks whether the string matches the MultiTreeWordList as a fragment, using the instance's
 * cost map.
 *
 * @param string
 *          the string to look up
 * @param ignoreCase
 *          whether case may be ignored
 * @param ignoreLength
 *          case is only ignored when {@code ignoreCase} is set and the string has at least this
 *          many characters
 * @param edit
 *          not evaluated by this method
 * @param distance
 *          the allowed edit distance to an entry of the MultiTreeWordList
 * @param ignoreToken
 *          not evaluated by this method
 * @return true, if the string matches as a fragment, false otherwise
 */
public boolean containsFragmentBool(String string, boolean ignoreCase, int ignoreLength,
    boolean edit, double distance, String ignoreToken) {
  boolean caseInsensitive = string.length() >= ignoreLength && ignoreCase;
  return editDistanceBool(root, string, "", distance, 0, caseInsensitive, true, costMap);
}
/**
 * Returns the types found when matching the string as a fragment of the entries of the
 * MultiTreeWordList.
 * <p>
 * Without edit distance the tree is followed directly and the result may be {@code null} when
 * the string does not lead anywhere in the tree. With edit distance the distinct types of all
 * matches are returned and the result is never {@code null}.
 *
 * @param string
 *          the string to look up
 * @param ignoreCase
 *          whether case may be ignored
 * @param ignoreLength
 *          length threshold for ignoring case: the string must be longer than this without edit
 *          distance, and at least this long with edit distance
 * @param edit
 *          whether an edit distance is used
 * @param distance
 *          the allowed edit distance; truncated to an int
 * @param ignoreToken
 *          characters which can be ignored
 * @return the types which belong to the matched fragment
 */
@Override
public List<String> containsFragment(String string, boolean ignoreCase, int ignoreLength,
    boolean edit, double distance, String ignoreToken) {
  if (!edit) {
    boolean caseInsensitive = ignoreCase && string.length() > ignoreLength;
    return recursiveContains2(root, string, 0, caseInsensitive, true, ignoreToken.toCharArray(),
        ignoreLength);
  }
  boolean caseInsensitive = string.length() >= ignoreLength && ignoreCase;
  Map<String, Set<String>> matches = editDistance(string, (int) distance, caseInsensitive,
      ignoreToken, true);
  List<String> distinctTypes = new LinkedList<String>();
  for (Set<String> typesOfMatch : matches.values()) {
    for (String type : typesOfMatch) {
      if (!distinctTypes.contains(type)) {
        distinctTypes.add(type);
      }
    }
  }
  return distinctTypes;
}
/**
 * Follows the string text through the tree and reports the types found at its end.
 *
 * @param pointer
 *          the MultiTextNode we are looking at
 * @param text
 *          the string to look up
 * @param index
 *          the index of the string text we checked until now
 * @param ignoreCase
 *          whether both the lower and the upper case variant of a character are followed
 * @param fragment
 *          whether reaching the end of text on a node that does not end a word still counts as
 *          a match
 * @param ignoreChars
 *          characters which can be skipped in text when no child node matches them; never
 *          applied to the first character; may be {@code null}
 * @param maxIgnoreChars
 *          not evaluated, only passed on to the recursive calls
 * @return {@code null} when text does not match; otherwise the types of the node reached at the
 *         end of text, which is an empty list when that node does not end a word
 */
private List<String> recursiveContains2(MultiTextNode pointer, String text, int index,
    boolean ignoreCase, boolean fragment, char[] ignoreChars, int maxIgnoreChars) {
  if (pointer == null) {
    return null;
  }
  if (index == text.length()) {
    if (pointer.isWordEnd()) {
      return new ArrayList<String>(pointer.getTypes());
    }
    if (fragment) {
      return Collections.emptyList();
    }
    // The text is used up on a node that does not end a word: no match. Without this return
    // the code below would read past the end of text.
    return null;
  }
  char charAt = text.charAt(index);
  boolean charAtIgnored = false;
  if (ignoreChars != null && index != 0) {
    for (char each : ignoreChars) {
      if (each == charAt) {
        charAtIgnored = true;
        break;
      }
    }
  }
  int next = index + 1;
  if (!ignoreCase) {
    // Case sensitive: an ignorable character without matching child keeps the current node.
    MultiTextNode childNode = pointer.getChildNode(charAt);
    MultiTextNode target = charAtIgnored && childNode == null ? pointer : childNode;
    return recursiveContains2(target, text, next, ignoreCase, fragment, ignoreChars,
        maxIgnoreChars);
  }
  // Lower case node, looked up behind blanks stored in the tree if there is no direct child.
  MultiTextNode childNodeL = pointer.getChildNode(Character.toLowerCase(charAt));
  if (childNodeL == null) {
    childNodeL = skipWS(pointer, Character.toLowerCase(charAt));
  }
  // Upper case node, same strategy.
  MultiTextNode childNodeU = pointer.getChildNode(Character.toUpperCase(charAt));
  if (childNodeU == null) {
    childNodeU = skipWS(pointer, Character.toUpperCase(charAt));
  }
  if (charAtIgnored && childNodeL == null && childNodeU == null) {
    // Character is ignored and does not appear.
    return recursiveContains2(pointer, text, next, ignoreCase, fragment, ignoreChars,
        maxIgnoreChars);
  }
  List<String> lowerTypes = recursiveContains2(childNodeL, text, next, ignoreCase, fragment,
      ignoreChars, maxIgnoreChars);
  if (childNodeU == childNodeL) {
    // Characters without case distinction lead to one and the same node. Following it twice
    // would double the work at every such character and duplicate every reported type.
    return lowerTypes;
  }
  List<String> upperTypes = recursiveContains2(childNodeU, text, next, ignoreCase, fragment,
      ignoreChars, maxIgnoreChars);
  if (lowerTypes == null && upperTypes == null) {
    return null;
  }
  List<String> result = new LinkedList<String>();
  if (lowerTypes != null) {
    result.addAll(lowerTypes);
  }
  if (upperTypes != null) {
    result.addAll(upperTypes);
  }
  return result;
}
/**
 * Walks along consecutive blank child nodes below {@code pointer} and returns the first child
 * for {@code charAt} found directly behind one of them.
 *
 * @param pointer
 *          the node to start from
 * @param charAt
 *          the character looked for behind the blanks
 * @return the matching node, or {@code null} when the chain of blanks ends without a match
 */
private MultiTextNode skipWS(MultiTextNode pointer, char charAt) {
  MultiTextNode blank = pointer.getChildNode(' ');
  while (blank != null) {
    MultiTextNode match = blank.getChildNode(charAt);
    if (match != null) {
      return match;
    }
    blank = blank.getChildNode(' ');
  }
  return null;
}
/**
 * Returns true, if the string text can be followed through the tree, false otherwise.
 *
 * @param pointer
 *          the MultiTextNode we are looking at
 * @param text
 *          the string to look up
 * @param index
 *          the index of the string text we checked until now
 * @param ignoreCase
 *          whether both the lower and the upper case variant of a character are followed
 * @param fragment
 *          whether reaching the end of text on any node counts as a match; otherwise the node
 *          has to end a word
 * @param ignoreChars
 *          characters which can be skipped in text when no child node matches them; never
 *          applied to the first character; may be {@code null}
 * @param maxIgnoreChars
 *          not evaluated, only passed on to the recursive calls
 * @return true, if the TreeWordList contains the string text, false otherwise
 */
private boolean recursiveContains(MultiTextNode pointer, String text, int index,
    boolean ignoreCase, boolean fragment, char[] ignoreChars, int maxIgnoreChars) {
  if (pointer == null) {
    return false;
  }
  if (index == text.length()) {
    return fragment || pointer.isWordEnd();
  }
  char charAt = text.charAt(index);
  boolean charAtIgnored = false;
  if (ignoreChars != null && index != 0) {
    for (char each : ignoreChars) {
      if (each == charAt) {
        charAtIgnored = true;
        break;
      }
    }
  }
  int next = index + 1;
  if (!ignoreCase) {
    // Case sensitive: an ignorable character without matching child keeps the current node.
    MultiTextNode childNode = pointer.getChildNode(charAt);
    MultiTextNode target = charAtIgnored && childNode == null ? pointer : childNode;
    return recursiveContains(target, text, next, ignoreCase, fragment, ignoreChars,
        maxIgnoreChars);
  }
  MultiTextNode childNodeL = pointer.getChildNode(Character.toLowerCase(charAt));
  MultiTextNode childNodeU = pointer.getChildNode(Character.toUpperCase(charAt));
  if (charAtIgnored && childNodeL == null && childNodeU == null) {
    // Character is ignored and does not appear.
    return recursiveContains(pointer, text, next, ignoreCase, fragment, ignoreChars,
        maxIgnoreChars);
  }
  if (recursiveContains(childNodeL, text, next, ignoreCase, fragment, ignoreChars,
      maxIgnoreChars)) {
    return true;
  }
  // Characters without case distinction yield the same node twice; a second descent could only
  // repeat the failed search and would double the work at every such character.
  return childNodeU != childNodeL && recursiveContains(childNodeU, text, next, ignoreCase,
      fragment, ignoreChars, maxIgnoreChars);
}
/**
 * Searches the stream for entries of this word list and creates annotations for them.
 * <p>
 * Every basic of the stream serves once as anchor. Starting from the anchor, the covered text of
 * the following basics is appended to a candidate string for as long as
 * {@code containsFragment} does not return {@code null} for the candidate. For every candidate
 * checked on the way, {@code tryToCreateAnnotation} is asked to create annotations for the types
 * that were found.
 *
 * @param stream
 *          the stream to search; it is moved to its first element and iterated to its end
 * @param typeMap
 *          maps type names of the word list to the annotation targets, passed on to
 *          {@code tryToCreateAnnotation}
 * @param ignoreCase
 *          passed on to {@code containsFragment}
 * @param ignoreLength
 *          passed on to {@code containsFragment}
 * @param edit
 *          passed on to {@code containsFragment}
 * @param distance
 *          passed on to {@code containsFragment}
 * @param ignoreToken
 *          characters to ignore; a basic whose text is a single one of these characters neither
 *          starts a candidate nor is looked up when it is the last basic of a candidate
 * @return the created annotations
 */
@Override
public Collection<AnnotationFS> find(RutaStream stream, Map<String, Object> typeMap,
boolean ignoreCase, int ignoreLength, boolean edit, double distance, String ignoreToken) {
Collection<AnnotationFS> results = new HashSet<AnnotationFS>();
stream.moveToFirst();
// Second cursor that runs ahead of the anchor while a candidate is being extended.
RutaStream streamPointer = stream.copy();
while (stream.isValid()) {
RutaBasic anchorBasic = (RutaBasic) stream.get();
streamPointer.moveTo(anchorBasic);
// Basics covered by the current candidate, the anchor first.
List<RutaBasic> basicsToAdd = new ArrayList<RutaBasic>();
basicsToAdd.add(anchorBasic);
String text = anchorBasic.getCoveredText();
StringBuilder candidate = new StringBuilder(text);
String lastCandidate = candidate.toString();
// A single ignored character never starts a candidate.
if (text.length() != 1 || !ignoreToken.contains(text)) {
// Handed to tryToCreateAnnotation; nothing is added to this list within this method.
List<AnnotationFS> interResults = new ArrayList<AnnotationFS>();
while (streamPointer.isValid()) {
// Do not look up a candidate that ends with a single ignored character, just extend it.
boolean skip = false;
String currentBasicText = basicsToAdd.get(basicsToAdd.size() - 1).getCoveredText();
if (currentBasicText.length() == 1 && ignoreToken.contains(currentBasicText)) {
skip = true;
}
List<String> types = null;
if (!skip) {
// NOTE(review): with edit == true containsFragment never returns null, so only the end of
// the stream stops the extension of a candidate - confirm that this is intended.
types = containsFragment(candidate.toString(), ignoreCase, ignoreLength, edit, distance,
ignoreToken);
}
if (skip || types != null) {
streamPointer.moveToNext();
if (streamPointer.isValid()) {
RutaBasic next = (RutaBasic) streamPointer.get();
// List<String> contains = contains(candidate,
// ignoreCase,
// ignoreLength, edit, distance, ignoreToken);
if (!skip) {
// Annotate the current candidate before it is extended by the next basic.
tryToCreateAnnotation(types, stream, results, basicsToAdd, candidate.toString(),
interResults, ignoreCase, ignoreLength, edit, distance, ignoreToken,
typeMap);
}
lastCandidate = candidate.toString();
candidate.append(next.getCoveredText());
basicsToAdd.add(next);
} else {
// !streamPointer.isValid();
// End of the stream: last attempt for this anchor; types is null here when the last
// basic was skipped.
tryToCreateAnnotation(types, stream, results, basicsToAdd, lastCandidate,
interResults, ignoreCase, ignoreLength, edit, distance, ignoreToken, typeMap);
}
} else {
// containsFragment.isEmpty();
// basicsToAdd.remove(basicsToAdd.size() - 1);
// tryToCreateAnnotation(stream, results, basicsToAdd,
// lastCandidate, interResults, ignoreCase,
// ignoreLength, edit, distance, ignoreToken, typeMap);
// breaks inner while()-loop.
break;
}
}
}
stream.moveToNext();
}
return results;
}
/**
 * Lookup variant that is not supported by this word list: it trips an assertion when assertions
 * are enabled and otherwise reports no matches.
 *
 * @return a new, empty list
 */
@Override
public List<AnnotationFS> find(RutaStream stream, boolean ignoreCase, int size,
    char[] ignoreChars, int maxIgnoredChars, boolean ignoreWS) {
  assert false;
  List<AnnotationFS> nothing = new ArrayList<AnnotationFS>();
  return nothing;
}
/**
 * Creates one annotation per distinct type name in {@code types}, spanning from the begin of the
 * first to the end of the last basic in {@code basicsToAdd}, and adds it to {@code results}.
 * <p>
 * The entry of {@code map} for a type name decides what is created: a {@link Type} yields a plain
 * annotation of that type; a list of two or three elements (type, feature name and optionally a
 * value) yields an annotation of that type whose feature is set to the value, or to the type
 * name when no value is given. Other entries are skipped. When there are no types or no basics,
 * the content of {@code interResult} is added to {@code results} instead.
 *
 * @param types
 *          type names found for the candidate, may be {@code null}
 * @param stream
 *          provides the CAS in which the annotations are created
 * @param results
 *          collection receiving the created annotations
 * @param basicsToAdd
 *          the basics covered by the candidate
 * @param interResult
 *          fallback annotations, may be {@code null}
 * @param map
 *          maps type names to the annotation targets
 */
private void tryToCreateAnnotation(List<String> types, RutaStream stream,
    Collection<AnnotationFS> results, List<RutaBasic> basicsToAdd, String lastCandidate,
    List<AnnotationFS> interResult, boolean ignoreCase, int ignoreLength, boolean edit,
    double distance, String ignoreToken, Map<String, Object> map) {
  if (basicsToAdd.size() < 1 || types == null) {
    if (interResult != null && !interResult.isEmpty()) {
      results.addAll(interResult);
    }
    return;
  }
  for (String typeName : new HashSet<String>(types)) {
    Object target = map.get(typeName);
    if (target instanceof Type) {
      int begin = basicsToAdd.get(0).getBegin();
      int end = basicsToAdd.get(basicsToAdd.size() - 1).getEnd();
      results.add(stream.getCas().createAnnotation((Type) target, begin, end));
      continue;
    }
    if (!(target instanceof List)) {
      continue;
    }
    List<?> spec = (List<?>) target;
    int specSize = spec.size();
    if (specSize != 2 && specSize != 3) {
      continue;
    }
    Type type = spec.get(0) instanceof Type ? (Type) spec.get(0) : null;
    String featureString = spec.get(1) instanceof String ? (String) spec.get(1) : null;
    // Without an explicit third element the matched type name itself becomes the value.
    Object value = specSize == 3 ? spec.get(2) : typeName;
    if (type == null || featureString == null) {
      continue;
    }
    int begin = basicsToAdd.get(0).getBegin();
    int end = basicsToAdd.get(basicsToAdd.size() - 1).getEnd();
    AnnotationFS created = stream.getCas().createAnnotation(type, begin, end);
    Feature feature = type.getFeatureByBaseName(featureString);
    setFeatureValue(created, feature, value, stream);
    results.add(created);
  }
}
/**
 * Assigns a value to a feature of an annotation, choosing the setter by the range of the
 * feature: strings for string ranges, numbers converted to the numeric range, booleans for
 * boolean ranges, and the name of a {@link Type} for string ranges. A value that fits none of
 * these combinations is silently not assigned.
 *
 * @param annotationFS
 *          the annotation to modify
 * @param feature
 *          the feature to set
 * @param o
 *          the value to assign
 * @param stream
 *          provides the type system used to check for string ranges
 * @throws IllegalArgumentException
 *           when the feature or the value is {@code null}
 */
private void setFeatureValue(AnnotationFS annotationFS, Feature feature, Object o,
    RutaStream stream) {
  TypeSystem typeSystem = stream.getCas().getTypeSystem();
  if (feature == null || o == null) {
    throw new IllegalArgumentException(
        "Not able to assign feature value: " + o + " -> " + feature);
  }
  Type range = feature.getRange();
  String rangeName = range.getName();
  // Needed by the first and the last alternative, so it is determined only once.
  boolean stringRange = typeSystem.subsumes(typeSystem.getType(CAS.TYPE_NAME_STRING), range);
  if (stringRange && o instanceof String) {
    annotationFS.setStringValue(feature, (String) o);
  } else if (rangeName.equals(CAS.TYPE_NAME_INTEGER) && o instanceof Number) {
    annotationFS.setIntValue(feature, ((Number) o).intValue());
  } else if (rangeName.equals(CAS.TYPE_NAME_DOUBLE) && o instanceof Number) {
    annotationFS.setDoubleValue(feature, ((Number) o).doubleValue());
  } else if (rangeName.equals(CAS.TYPE_NAME_FLOAT) && o instanceof Number) {
    annotationFS.setFloatValue(feature, ((Number) o).floatValue());
  } else if (rangeName.equals(CAS.TYPE_NAME_BYTE) && o instanceof Number) {
    annotationFS.setByteValue(feature, ((Number) o).byteValue());
  } else if (rangeName.equals(CAS.TYPE_NAME_SHORT) && o instanceof Number) {
    annotationFS.setShortValue(feature, ((Number) o).shortValue());
  } else if (rangeName.equals(CAS.TYPE_NAME_LONG) && o instanceof Number) {
    annotationFS.setLongValue(feature, ((Number) o).longValue());
  } else if (rangeName.equals(CAS.TYPE_NAME_BOOLEAN) && o instanceof Boolean) {
    annotationFS.setBooleanValue(feature, (Boolean) o);
  } else if (stringRange && o instanceof Type) {
    // A type assigned to a string feature is stored by its name.
    annotationFS.setStringValue(feature, ((Type) o).getName());
  }
}
/**
 * Returns the entries within the given edit distance to the query string, searching case
 * sensitively and without ignored characters.
 *
 * @param query
 *          the query string
 * @param distance
 *          the allowed edit distance
 * @return a map with the matching strings as keys and the lists they belong to as values
 */
public Map<String, Set<String>> editDistance(String query, int distance) {
  final boolean ignoreCase = false;
  return editDistance(query, distance, ignoreCase, "");
}
/**
 * Returns the entries within the given edit distance to the query string, matching the whole
 * query (no fragment search).
 *
 * @param query
 *          the query string
 * @param distance
 *          the allowed edit distance
 * @param ignoreCase
 *          whether the search ignores case
 * @param ignoreToken
 *          the characters to ignore
 * @return a map with the matching strings as keys and the lists they belong to as values
 */
public Map<String, Set<String>> editDistance(String query, int distance, boolean ignoreCase,
    String ignoreToken) {
  final boolean fragment = false;
  return editDistance(query, distance, ignoreCase, ignoreToken, fragment);
}
/**
 * Returns a map with all strings with a specified edit distance to the string query as keys and
 * the files they belong to as values. The search runs on a cost map created for this call, in
 * which inserting any of the ignored characters is free.
 *
 * @param query
 *          the query string; lower-cased before the search when {@code ignoreCase} is set
 * @param distance
 *          the allowed edit distance
 * @param ignoreCase
 *          whether the search ignores case
 * @param ignoreToken
 *          the characters to ignore; {@code null} is treated like an empty string
 * @param fragment
 *          whether we search for fragments of the query string or not
 * @return a map with all strings with a specified edit distance to the string query as keys and
 *         the files they belong to as values
 */
public Map<String, Set<String>> editDistance(String query, int distance, boolean ignoreCase,
    String ignoreToken, boolean fragment) {
  EditDistanceCostMap edcm = new EditDistanceCostMap();
  // Remember the original insert costs of the ignored characters before setting them to zero.
  Map<Character, Double> oldInsertCosts = new HashMap<Character, Double>();
  if (ignoreToken != null) {
    for (char c : ignoreToken.toCharArray()) {
      // Only the first occurrence sees the original costs; a repeated character would
      // otherwise record the zero that was just assigned.
      if (!oldInsertCosts.containsKey(c)) {
        oldInsertCosts.put(c, edcm.getInsertCosts(c));
      }
      edcm.setInsertCosts(c, 0.0);
    }
  }
  try {
    String effectiveQuery = ignoreCase ? query.toLowerCase() : query;
    return editDistanceClever(root, effectiveQuery, "", distance, 0, ignoreCase, fragment, edcm,
        false, false);
  } finally {
    // Restore what was changed above: the insert costs (not the delete costs). This only
    // matters if the cost map shares state between instances, which is not visible from here.
    for (Entry<Character, Double> old : oldInsertCosts.entrySet()) {
      edcm.setInsertCosts(old.getKey(), old.getValue());
    }
  }
}
/**
 * Recursively walks the trie below {@code node} and collects every string that can be reached
 * from the query within the remaining edit distance, together with the types stored for it.
 *
 * <p>
 * At each node the method tries, in this order: a delete step (same node, next query index), the
 * recursion stop (recording {@code result} if the remaining query characters fit into the
 * remaining distance), and then for every child a free match step, a substitute step and an
 * insert step. A delete is never tried directly after an insert and an insert is never tried
 * directly after a delete.
 *
 * @param node
 *          - The MultiTextNode which is under consideration at the moment.
 * @param query
 *          - The query string.
 * @param result
 *          - The result which matched until now.
 * @param distance
 *          - The remaining edit distance.
 * @param index
 *          - The index of the query string at the moment.
 * @param ignoreCase
 *          - Indicates whether we search case sensitive or not.
 * @param fragment
 *          - Indicates whether we search for fragments of the query string or not. If set, every
 *          visited node is treated as a possible end and the types are taken from
 *          {@code getTypeCone(node)} instead of {@code node.getTypes()}.
 * @param edm
 *          - The edit distance cost map we are using.
 * @param lastActionInsert
 *          - Indicates whether the last action was an insert action.
 * @param lastActionDelete
 *          - Indicates whether the last action was a delete action.
 * @return A map with all strings with a specified edit distance to the string query as keys and
 *         the files they belong to as values.
 */
private Map<String, Set<String>> editDistanceClever(MultiTextNode node, String query,
    String result, double distance, int index, boolean ignoreCase, boolean fragment,
    EditDistanceCostMap edm, boolean lastActionInsert, boolean lastActionDelete) {
  EditDistanceResultMap resultMap = new EditDistanceResultMap();

  // Delete: stay on the same node but advance in the query. Not allowed right after an insert
  // and not before at least one character has been matched.
  if (!lastActionInsert) {
    double deleteCosts = edm.getDeleteCosts(node.getValue());
    if (distance - deleteCosts >= 0 && result.length() > 0) {
      resultMap.putAll(editDistanceClever(node, query, result, distance - deleteCosts, index + 1,
          ignoreCase, fragment, edm, false, true));
    }
  }

  // Recursion stop.
  if (node.isWordEnd() || fragment) {
    // Accumulating remaining insert costs if the query is longer than
    // the word in the trie.
    double remainingInsertCosts = 0.0;
    for (int i = index; i < query.length(); i++) {
      remainingInsertCosts += edm.getInsertCosts(query.charAt(i));
    }
    if (remainingInsertCosts <= distance) {
      Map<String, Set<String>> match = new HashMap<String, Set<String>>();
      if (fragment) {
        match.put(result, new HashSet<String>(getTypeCone(node)));
      } else {
        match.put(result, new HashSet<String>(node.getTypes()));
      }
      resultMap.putAll(match);
    }
  }

  // Important: word end does not mean no children any more! The children map may also be
  // missing altogether; guard here so that every path is protected before iterating.
  if (node.getChildren() == null) {
    return resultMap;
  }

  // Recursion.
  for (MultiTextNode child : node.getChildren().values()) {
    char childChar = child.getValue();
    String extended = result + childChar;

    // Match: the child carries the next query character, no costs.
    if (index < query.length()) {
      char queryChar = query.charAt(index);
      boolean sameChar = ignoreCase
          ? Character.toLowerCase(childChar) == Character.toLowerCase(queryChar)
          : childChar == queryChar;
      if (sameChar) {
        resultMap.putAll(editDistanceClever(child, query, extended, distance, index + 1,
            ignoreCase, fragment, edm, false, false));
      }
    }

    // Substitute: costs are looked up for the pair (current node character, child character).
    double replaceCosts = edm.getReplaceCosts(node.getValue(), childChar);
    if (distance - replaceCosts >= 0) {
      resultMap.putAll(editDistanceClever(child, query, extended, distance - replaceCosts,
          index + 1, ignoreCase, fragment, edm, false, false));
    }

    // Insert - use the same index twice. Not allowed right after a delete.
    if (!lastActionDelete) {
      double insertCosts = edm.getInsertCosts(childChar);
      if (distance - insertCosts >= 0) {
        resultMap.putAll(editDistanceClever(child, query, extended, distance - insertCosts,
            index, ignoreCase, fragment, edm, true, false));
      }
    }
  }
  return resultMap;
}
/**
 * Checks whether the trie below {@code node} contains a string that can be reached from the
 * query within the remaining edit distance.
 *
 * <p>
 * The search returns as soon as one alternative succeeds. At each node it tries, in this order:
 * the fragment stop, the word-end stop, a delete step (same node, next query index), and then
 * for every child a free match step, a substitute step and an insert step.
 *
 * @param node
 *          - The MultiTextNode which is under consideration at the moment.
 * @param query
 *          - The query string.
 * @param result
 *          - The result which matched until now.
 * @param distance
 *          - The remaining edit distance.
 * @param index
 *          - The index of the query string at the moment.
 * @param ignoreCase
 *          - Indicates whether we search case sensitive or not.
 * @param fragment
 *          - Indicates whether we search for fragments of the query string or not. If set, the
 *          search succeeds as soon as the whole query has been consumed.
 * @param edm
 *          - The edit distance cost map we are using.
 * @return {@code true} if a matching string was found within the remaining edit distance,
 *         {@code false} otherwise.
 */
private boolean editDistanceBool(MultiTextNode node, String query, String result, double distance,
    int index, boolean ignoreCase, boolean fragment, EditDistanceCostMap edm) {
  // Recursion stop for fragments: the complete query has been consumed.
  if (fragment && index == query.length()) {
    return true;
  }

  // Recursion stop for complete words.
  if (node.isWordEnd()) {
    // Accumulating remaining insert costs if the query is longer than
    // the word in the trie.
    double remainingInsertCosts = 0.0;
    for (int i = index; i < query.length(); i++) {
      remainingInsertCosts += edm.getInsertCosts(query.charAt(i));
    }
    if (remainingInsertCosts <= distance) {
      return true;
    }
  }

  // Delete: stay on the same node but advance in the query. Only possible once at least one
  // character has been matched.
  double deleteCosts = edm.getDeleteCosts(node.getValue());
  if (distance - deleteCosts >= 0 && result.length() > 0
      && editDistanceBool(node, query, result, distance - deleteCosts, index + 1, ignoreCase,
          fragment, edm)) {
    return true;
  }

  // A node without a children map cannot be descended any further. Without this guard a leaf
  // that failed the checks above would cause a NullPointerException in the loop below.
  if (node.getChildren() == null) {
    return false;
  }

  // Recursion.
  for (MultiTextNode child : node.getChildren().values()) {
    char childChar = child.getValue();
    String extended = result + childChar;

    // Match: the child carries the next query character, no costs.
    if (index < query.length()) {
      char queryChar = query.charAt(index);
      boolean sameChar = ignoreCase
          ? Character.toLowerCase(childChar) == Character.toLowerCase(queryChar)
          : childChar == queryChar;
      if (sameChar && editDistanceBool(child, query, extended, distance, index + 1, ignoreCase,
          fragment, edm)) {
        return true;
      }
    }

    // Substitute: costs are looked up for the pair (current node character, child character).
    double replaceCosts = edm.getReplaceCosts(node.getValue(), childChar);
    if (distance - replaceCosts >= 0 && editDistanceBool(child, query, extended,
        distance - replaceCosts, index + 1, ignoreCase, fragment, edm)) {
      return true;
    }

    // Insert - use the same index twice.
    double insertCosts = edm.getInsertCosts(childChar);
    if (distance - insertCosts >= 0 && editDistanceBool(child, query, extended,
        distance - insertCosts, index, ignoreCase, fragment, edm)) {
      return true;
    }
  }
  return false;
}
/**
 * Computes the hash code from the cost map and the root node, combined with the usual
 * 31-based scheme. A {@code null} field contributes 0.
 */
@Override
public int hashCode() {
  int costMapHash = (costMap == null) ? 0 : costMap.hashCode();
  int rootHash = (root == null) ? 0 : root.hashCode();
  // Same value as folding both hashes into a seed of 1 with the multiplier 31.
  return 31 * (31 + costMapHash) + rootHash;
}
/**
 * Two word lists are equal if they are of the same class and both their cost map and their
 * root node are equal (or both {@code null}).
 */
@Override
public boolean equals(Object obj) {
  if (this == obj) {
    return true;
  }
  if (obj == null || getClass() != obj.getClass()) {
    return false;
  }
  MultiTreeWordList that = (MultiTreeWordList) obj;

  boolean sameCostMap = (costMap == null) ? that.costMap == null : costMap.equals(that.costMap);
  if (!sameCostMap) {
    return false;
  }
  return (root == null) ? that.root == null : root.equals(that.root);
}
/**
 * Writes this word list to a file by handing the root node and the given arguments to the
 * persistence helper.
 *
 * @param path
 *          - Passed through unchanged to the persistence helper; presumably the location of
 *          the file to create.
 * @param compress
 *          - Passed through unchanged to the persistence helper; presumably selects a
 *          compressed output format.
 * @param encoding
 *          - Passed through unchanged to the persistence helper; presumably the character
 *          encoding of the written file.
 * @throws IOException
 *           - If the persistence helper fails to write the file.
 */
public void createMTWLFile(String path, boolean compress, String encoding) throws IOException {
persistence.createMTWLFile(root, path, compress, encoding);
}
}