blob: e57e918a50d5507cdccab88aebfccc4e3d4c2c8d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.ruta.textruler.core;
import java.io.BufferedWriter;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.FileWriter;
import java.io.IOException;
import java.net.URL;
import java.text.CharacterIterator;
import java.text.StringCharacterIterator;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashSet;
import java.util.List;
import java.util.Set;
import org.apache.uima.UIMAFramework;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineDescription;
import org.apache.uima.cas.CAS;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.Feature;
import org.apache.uima.cas.Type;
import org.apache.uima.cas.TypeSystem;
import org.apache.uima.cas.impl.XmiCasDeserializer;
import org.apache.uima.cas.impl.XmiCasSerializer;
import org.apache.uima.cas.text.AnnotationFS;
import org.apache.uima.resource.metadata.TypeDescription;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.ruta.engine.RutaEngine;
import org.apache.uima.ruta.ide.core.builder.RutaProjectUtils;
import org.apache.uima.ruta.textruler.TextRulerPlugin;
import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType;
import org.apache.uima.util.XMLInputSource;
import org.apache.uima.util.XMLSerializer;
import org.eclipse.core.runtime.FileLocator;
import org.eclipse.core.runtime.IPath;
import org.eclipse.core.runtime.Path;
/**
*
* This static class provides all kinds of helper methods and constants that are useful for all
* kinds of stuff in this project.
*/
public class TextRulerToolkit {
public static final boolean LOGGING_ENABLED = true;
public static final boolean DEBUG = false;
public static final String RUTA_ALL_TYPE_NAME = "org.apache.uima.ruta.type.ALL";
public static final String RUTA_ANY_TYPE_NAME = "org.apache.uima.ruta.type.ANY";
public static final String RUTA_WORD_TYPE_NAME = "org.apache.uima.ruta.type.W";
public static final String RUTA_BREAK_TYPE_NAME = "org.apache.uima.ruta.type.BREAK";
public static final String RUTA_SPACE_TYPE_NAME = "org.apache.uima.ruta.type.SPACE";
public static final String RUTA_NUM_TYPE_NAME = "org.apache.uima.ruta.type.NUM";
public static final String RUTA_MARKUP_TYPE_NAME = "org.apache.uima.ruta.type.MARKUP";
public static final String RUTA_SPECIAL_TYPE_NAME = "org.apache.uima.ruta.type.SPECIAL";
public static final String RUTA_NBSP_TYPE_NAME = "org.apache.uima.ruta.type.NBSP";
public static final String LEFT_BOUNDARY_EXTENSION = "START";
public static final String RIGHT_BOUNDARY_EXTENSION = "END";
public static void log(String str) {
if (LOGGING_ENABLED)
System.out.println(str);
}
public static void logIfDebug(String str) {
if (DEBUG)
log(str);
}
public static void logIf(boolean condition, String str) {
if (LOGGING_ENABLED && condition)
System.out.println(str);
}
public static URL getResourceURL(String name) {
return FileLocator.find(TextRulerPlugin.getDefault().getBundle(), new Path(name), null);
}
public static AnalysisEngineDescription getAnalysisEngineDescription(String descFile) {
AnalysisEngineDescription result = null;
try {
XMLInputSource in = new XMLInputSource(descFile);
result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in);
} catch (Exception e) {
TextRulerPlugin.error(e);
result = null;
}
return result;
}
public static AnalysisEngineDescription getAnalysisEngineDescription(URL fileURL) {
AnalysisEngineDescription result = null;
try {
XMLInputSource in = new XMLInputSource(fileURL);
result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in);
} catch (Exception e) {
TextRulerPlugin.error(e);
result = null;
}
return result;
}
public static AnalysisEngine loadAnalysisEngine(AnalysisEngineDescription desc) {
AnalysisEngine result = null;
try {
result = UIMAFramework.produceAnalysisEngine(desc);
} catch (Exception e) {
TextRulerPlugin.error(e);
result = null;
}
return result;
}
public static void addBoundaryTypes(AnalysisEngineDescription description, String[] slotNames) {
List<String> list = new ArrayList<String>();
for (String eachSlot : slotNames) {
list.add(eachSlot + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION);
list.add(eachSlot + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION);
}
TypeSystemDescription typeSystem = description.getAnalysisEngineMetaData().getTypeSystem();
for (String string : list) {
TypeDescription type = typeSystem.getType(string);
if (type == null) {
typeSystem.addType(string, "", "uima.tcas.Annotation");
}
}
}
public static CAS readCASfromXMIFile(String filename, AnalysisEngine ae, CAS reuseCAS) {
return readCASfromXMIFile(new File(filename), ae, reuseCAS);
}
public static CAS readCASfromXMIFile(File file, AnalysisEngine ae, CAS reuseCAS) {
FileInputStream inputStream = null;
try {
CAS resultCas;
inputStream = new FileInputStream(file);
if (reuseCAS != null) {
reuseCAS.reset();
resultCas = reuseCAS;
} else {
resultCas = GlobalCASSource.allocCAS(ae); // ae.newCAS();
}
XmiCasDeserializer.deserialize(inputStream, resultCas, true);
return resultCas;
} catch (Exception e) {
TextRulerPlugin.error(e);
} finally {
try {
if (inputStream != null)
inputStream.close();
} catch (Exception e) {
TextRulerPlugin.error(e);
}
}
return null;
}
public static void writeCAStoXMIFile(CAS aCas, String filename)// throws
// IOException,
// SAXException
{
File newFile = new File(filename);
FileOutputStream out = null;
try {
// write XMI
out = new FileOutputStream(newFile);
XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem());
XMLSerializer xmlSer = new XMLSerializer(out, false);
ser.serialize(aCas, xmlSer.getContentHandler());
} catch (Exception e) {
TextRulerPlugin.error(e);
} finally {
if (out != null) {
try {
out.close();
} catch (Exception e) {
TextRulerPlugin.error(e);
}
}
}
}
public static List<AnnotationFS> extractAnnotationsForSlotName(CAS aCas, String slotName) {
List<AnnotationFS> result = new ArrayList<AnnotationFS>();
TypeSystem ts = aCas.getTypeSystem();
Type slotType = ts.getType(slotName);
FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(slotType).iterator(true);
if (!it.isValid()) {
// System.out.println("##### -> iterator not valid for slots!!");
}
while (it.isValid()) {
AnnotationFS fs = it.get();
// quick hack for quantifier bug in TM:
AnnotationFS previous = result.size() > 0 ? result.get(result.size() - 1) : null;
if (previous == null || previous.getBegin() != fs.getBegin()
|| previous.getEnd() != fs.getEnd())
result.add(fs);
else {
logIfDebug("******** TM QUANTIFIER BUG ?? Multiple annotation: " + fs.getType().getName());
}
it.moveToNext();
}
return result;
}
private static List<AnnotationFS> getAnnotationWithinBounds(CAS aCas, int posStart, int posEnd,
Set<String> filterSet, Type rootType) {
List<AnnotationFS> result = new ArrayList<AnnotationFS>();
TypeSystem ts = aCas.getTypeSystem();
try {
// TODO wie in TMs AnnotationRetrieval evtl nicht den subiterator
// nehmen, da der auf
// type comparisons basiert, die wir evtl nicht gegeben haben!?
AnnotationFS boundaryAnnotation = aCas.createAnnotation(ts.getType("uima.tcas.Annotation"),
posStart > 0 ? posStart - 1 : 0, posEnd); // TODO ist das
// richtig so??!!
FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().subiterator(boundaryAnnotation, true,
true);
while (it.isValid()) {
AnnotationFS fs = it.get();
if (fs.getBegin() < posStart || fs.getEnd() > posEnd) {
it.moveToNext();
continue;
}
if (rootType != null) {
if (!ts.subsumes(rootType, fs.getType())) {
it.moveToNext();
continue;
}
}
if (filterSet == null || !filterSet.contains(fs.getType().getName())) {
result.add(fs);
}
it.moveToNext();
}
} catch (Exception e) {
TextRulerPlugin.error(e);
}
return result;
}
public static List<AnnotationFS> getAnnotationsBeforePosition(CAS aCas, int position,
int maxCount, Set<String> filterSet, Type rootType) {
List<AnnotationFS> result = getAnnotationWithinBounds(aCas, 0, position, filterSet, rootType);
if (maxCount > 0) {
while (result.size() > maxCount)
result.remove(0); // remove from front of queue !
}
return result;
}
public static List<AnnotationFS> getAnnotationsAfterPosition(CAS aCas, int position,
int maxCount, Set<String> filterSet, Type rootType) {
int maxPos = aCas.getDocumentText().length() - 1;
List<AnnotationFS> result = getAnnotationWithinBounds(aCas, position, maxPos, filterSet,
rootType);
if (maxCount > 0) {
while (result.size() > maxCount)
result.remove(result.size() - 1); // remove from end of queue!
}
return result;
}
public static List<AnnotationFS> getAnnotationsWithinBounds(CAS aCas, int start, int end,
Set<String> filterSet, Type rootType) {
return getAnnotationWithinBounds(aCas, start, end, filterSet, rootType);
}
public static List<AnnotationFS> getOtherAnnotationsOverToken(CAS aCas,
AnnotationFS tmTokenAnnotation, Set<String> filterSet) {
List<AnnotationFS> result = new ArrayList<AnnotationFS>();
// filter out document annotation!!
FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().iterator();
Type tokenType = tmTokenAnnotation.getType();
FSIterator<AnnotationFS> leftIt = null;
FSIterator<AnnotationFS> rightIt = null;
TypeSystem ts = aCas.getTypeSystem();
Type tmRootType = ts.getType(RUTA_ALL_TYPE_NAME);
Set<String> allFilters = new HashSet<String>();
allFilters.add("uima.tcas.DocumentAnnotation");
allFilters.add(RutaEngine.BASIC_TYPE);
if (filterSet != null)
allFilters.addAll(filterSet);
for (; it.isValid(); it.moveToNext()) {
AnnotationFS fs = (AnnotationFS) it.get();
if (fs.getBegin() == tmTokenAnnotation.getBegin()
&& fs.getEnd() == tmTokenAnnotation.getEnd() && fs.getType().equals(tokenType)) {
leftIt = it;
rightIt = it.copy();
break;
}
}
if (leftIt == null)
return null; // the token annotation was not found !
if (leftIt.isValid())
leftIt.moveToPrevious(); // leave our token annotation behind us...
// search from the token annotation to the left
for (; leftIt.isValid(); leftIt.moveToPrevious()) {
AnnotationFS fs = (AnnotationFS) leftIt.get();
if (fs.getEnd() <= tmTokenAnnotation.getBegin())
break; // if that happens we are out of reach and can stop
if (fs.getBegin() <= tmTokenAnnotation.getBegin()
&& fs.getEnd() >= tmTokenAnnotation.getEnd()
&& !allFilters.contains(fs.getType().getName())
&& !ts.subsumes(tmRootType, fs.getType()))
result.add(fs);
}
// search from the token annotation to the right
if (rightIt.isValid())
rightIt.moveToNext(); // leave our token annotation behind us...
for (; rightIt.isValid(); rightIt.moveToNext()) {
AnnotationFS fs = (AnnotationFS) rightIt.get();
if (fs.getBegin() >= tmTokenAnnotation.getEnd())
break; // if that happens we are out of reach and can stop
if (fs.getBegin() <= tmTokenAnnotation.getBegin()
&& fs.getEnd() >= tmTokenAnnotation.getEnd()
&& !allFilters.contains(fs.getType().getName())
&& !ts.subsumes(tmRootType, fs.getType()))
result.add(fs);
}
return result;
}
public static synchronized Set<String> getFilterSetWithSlotNames(String[] slotNames,
Set<String> otherFilters) {
Set<String> result = new HashSet<String>(otherFilters);
result.add(RutaEngine.BASIC_TYPE);
if (slotNames != null)
for (String s : slotNames)
result.add(s);
return result;
}
public static synchronized Set<String> getFilterSetWithSlotName(String slotName,
Set<String> otherFilters) {
String[] sn = { slotName };
return getFilterSetWithSlotNames(sn, otherFilters);
}
public static synchronized String getStandardFilterSetString() {
String str = "";
for (String s : getStandardFilterSet(null))
if (str.length() == 0)
str += s;
else
str += ", " + s;
return str;
}
public static synchronized Set<String> getStandardFilterSet(String[] slotNames) {
Set<String> filterSet = new HashSet<String>();
if (slotNames != null) {
for (String s : slotNames)
filterSet.add(s);
}
filterSet.add(RUTA_SPACE_TYPE_NAME);
filterSet.add(RUTA_BREAK_TYPE_NAME);
filterSet.add(RUTA_MARKUP_TYPE_NAME);
filterSet.add(RUTA_NBSP_TYPE_NAME);
return filterSet;
}
public static synchronized Set<String> getStandardFeatureFilterSet() {
Set<String> filterSet = new HashSet<String>();
filterSet.add("uima.cas.AnnotationBase:sofa");
filterSet.add("uima.tcas.Annotation:begin");
filterSet.add("uima.tcas.Annotation:end");
filterSet.add("org.apache.uima.ruta.type.RutaBasic:Replacement");
return filterSet;
}
// return the example of the list if found, null otherwise
public static synchronized TextRulerExample exampleListContainsAnnotation(
List<TextRulerExample> list, TextRulerAnnotation ann) {
TextRulerExample needle = new TextRulerExample(null, ann, true, null);
int index = Collections.binarySearch(list, needle, new Comparator<TextRulerExample>() {
public int compare(TextRulerExample o1, TextRulerExample o2) {
TextRulerAnnotation afs1 = o1.getAnnotation();
TextRulerAnnotation afs2 = o2.getAnnotation();
if (afs1.getBegin() < afs2.getBegin())
return -1;
else if (afs1.getBegin() > afs2.getBegin())
return 1;
else if (afs1.getEnd() > afs2.getEnd())
return -1;
else if (afs1.getEnd() < afs2.getEnd())
return 1;
else
return 0;
}
});
if (index >= 0)
return list.get(index);
else
return null;
}
public static synchronized String addTrailingSlashToPath(String path) {
if (!(path.endsWith("/") || path.endsWith("\\")))
path = path + System.getProperty("file.separator");
return path;
}
public static synchronized String createTemporaryDirectory() throws IOException {
final File temp;
temp = File.createTempFile("temp", Long.toString(System.nanoTime()));
if (!(temp.delete()))
return null;
if (!(temp.mkdir()))
return null;
temp.deleteOnExit();
return addTrailingSlashToPath(temp.getPath());
}
public static synchronized String getTypeShortName(String typeName) {
if (typeName.indexOf(".") >= 0) {
String components[] = typeName.split("\\.");
return components[components.length - 1];
} else
return typeName;
}
public static synchronized String getEngineDescriptorFromTMSourceFile(IPath scriptFilePath) {
IPath folder = scriptFilePath;
while (!folder.lastSegment().equals(RutaProjectUtils.getDefaultScriptLocation())) {
folder = folder.removeLastSegments(1);
}
IPath relativeTo = scriptFilePath.makeRelativeTo(folder);
IPath projectPath = folder.removeLastSegments(1);
String elementName = scriptFilePath.lastSegment();
int lastIndexOf = elementName.lastIndexOf(RutaEngine.SCRIPT_FILE_EXTENSION);
if (lastIndexOf != -1) {
elementName = elementName.substring(0, lastIndexOf);
}
IPath descPath = projectPath.append(RutaProjectUtils.getDefaultDescriptorLocation());
IPath descPackagePath = descPath.append(relativeTo.removeLastSegments(1));
return descPackagePath.append(elementName + "Engine.xml").toString();
}
public static synchronized String getTypeSystemDescriptorFromTMSourceFile(IPath scriptFilePath) {
IPath folder = scriptFilePath;
while (!folder.lastSegment().equals(RutaProjectUtils.getDefaultScriptLocation())) {
folder = folder.removeLastSegments(1);
}
IPath relativeTo = scriptFilePath.makeRelativeTo(folder);
IPath projectPath = folder.removeLastSegments(1);
String elementName = scriptFilePath.lastSegment();
int lastIndexOf = elementName.lastIndexOf(RutaEngine.SCRIPT_FILE_EXTENSION);
if (lastIndexOf != -1) {
elementName = elementName.substring(0, lastIndexOf);
}
IPath descPath = projectPath.append(RutaProjectUtils.getDefaultDescriptorLocation());
IPath descPackagePath = descPath.append(relativeTo.removeLastSegments(1));
return descPackagePath.append(elementName + "TypeSystem.xml").toString();
}
public static synchronized String escapeForRegExp(String aRegexFragment) {
final StringBuilder result = new StringBuilder();
final StringCharacterIterator iterator = new StringCharacterIterator(aRegexFragment);
char character = iterator.current();
while (character != CharacterIterator.DONE) {
/*
* All literals need to have backslashes doubled.
*/
if (character == '.') {
result.append("\\.");
} else if (character == '\\') {
result.append("\\\\");
} else if (character == '?') {
result.append("\\?");
} else if (character == '*') {
result.append("\\*");
} else if (character == '+') {
result.append("\\+");
} else if (character == '&') {
result.append("\\&");
} else if (character == ':') {
result.append("\\:");
} else if (character == '{') {
result.append("\\{");
} else if (character == '}') {
result.append("\\}");
} else if (character == '[') {
result.append("\\[");
} else if (character == ']') {
result.append("\\]");
} else if (character == '(') {
result.append("\\(");
} else if (character == ')') {
result.append("\\)");
} else if (character == '^') {
result.append("\\^");
} else if (character == '$') {
result.append("\\$");
} else {
// the char is not a special one
// add it to the result as is
result.append(character);
}
character = iterator.next();
}
return result.toString();
}
public static synchronized String escapeForTMStringParameter(String aTMStringFragment) {
final StringBuilder result = new StringBuilder();
final StringCharacterIterator iterator = new StringCharacterIterator(aTMStringFragment);
char character = iterator.current();
while (character != CharacterIterator.DONE) {
if (character == '"') {
result.append("\\\"");
} else if (character == '\\') {
result.append("\\\\");
} else {
result.append(character);
}
character = iterator.next();
}
return result.toString();
}
public static synchronized void appendStringToFile(String fileName, String str) {
try {
File f = new File(fileName);
BufferedWriter output;
if (!f.exists())
output = new BufferedWriter(new FileWriter(fileName));
else
output = new BufferedWriter(new FileWriter(fileName, true));
output.append(str);
output.close();
} catch (IOException e) {
TextRulerPlugin.error(e);
}
}
public static synchronized TextRulerAnnotation convertToTargetAnnotation(AnnotationFS fs,
TextRulerExampleDocument doc, TextRulerTarget target, TypeSystem ts) {
AnnotationFS theAnnotation;
if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY)
theAnnotation = fs.getCAS().createAnnotation(ts.getType(target.getSingleSlotTypeName()),
fs.getBegin(), fs.getBegin());
else if (target.type == MLTargetType.SINGLE_RIGHT_BOUNDARY)
theAnnotation = fs.getCAS().createAnnotation(ts.getType(target.getSingleSlotTypeName()),
fs.getEnd(), fs.getEnd());
else
theAnnotation = fs;
return new TextRulerAnnotation(theAnnotation, doc);
}
public static synchronized List<Feature> getFilteredAnnotationFeatures(AnnotationFS afs) {
List<Feature> result = new ArrayList<Feature>();
List<Feature> theFeatures = afs.getType().getFeatures();
Set<String> filters = getStandardFeatureFilterSet();
for (Feature f : theFeatures)
if (!filters.contains(f.getName()))
result.add(f);
return result;
}
}