| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.uima.ruta.textruler.core; |
| |
| import java.io.BufferedWriter; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.FileOutputStream; |
| import java.io.FileWriter; |
| import java.io.IOException; |
| import java.net.URL; |
| import java.text.CharacterIterator; |
| import java.text.StringCharacterIterator; |
| import java.util.ArrayList; |
| import java.util.Collections; |
| import java.util.Comparator; |
| import java.util.HashSet; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.uima.UIMAFramework; |
| import org.apache.uima.analysis_engine.AnalysisEngine; |
| import org.apache.uima.analysis_engine.AnalysisEngineDescription; |
| import org.apache.uima.cas.CAS; |
| import org.apache.uima.cas.FSIterator; |
| import org.apache.uima.cas.Feature; |
| import org.apache.uima.cas.Type; |
| import org.apache.uima.cas.TypeSystem; |
| import org.apache.uima.cas.impl.XmiCasDeserializer; |
| import org.apache.uima.cas.impl.XmiCasSerializer; |
| import org.apache.uima.cas.text.AnnotationFS; |
| import org.apache.uima.resource.metadata.TypeDescription; |
| import org.apache.uima.resource.metadata.TypeSystemDescription; |
| import org.apache.uima.ruta.engine.RutaEngine; |
| import org.apache.uima.ruta.ide.core.builder.RutaProjectUtils; |
| import org.apache.uima.ruta.textruler.TextRulerPlugin; |
| import org.apache.uima.ruta.textruler.core.TextRulerTarget.MLTargetType; |
| import org.apache.uima.util.XMLInputSource; |
| import org.apache.uima.util.XMLSerializer; |
| import org.eclipse.core.runtime.FileLocator; |
| import org.eclipse.core.runtime.IPath; |
| import org.eclipse.core.runtime.Path; |
| |
/**
 * Static utility class that collects the helper methods and shared constants used throughout the
 * TextRuler project (logging, descriptor and CAS I/O, annotation lookup, filter sets and string
 * escaping).
 */
| public class TextRulerToolkit { |
| |
// Master switch for log(String) and logIf(boolean, String).
public static final boolean LOGGING_ENABLED = true;

// Switch for logIfDebug(String); messages are forwarded to log(String) only when this is true.
public static final boolean DEBUG = false;

// Fully qualified names of types in the org.apache.uima.ruta.type package.
public static final String RUTA_ALL_TYPE_NAME = "org.apache.uima.ruta.type.ALL";

public static final String RUTA_ANY_TYPE_NAME = "org.apache.uima.ruta.type.ANY";

public static final String RUTA_WORD_TYPE_NAME = "org.apache.uima.ruta.type.W";

public static final String RUTA_BREAK_TYPE_NAME = "org.apache.uima.ruta.type.BREAK";

public static final String RUTA_SPACE_TYPE_NAME = "org.apache.uima.ruta.type.SPACE";

public static final String RUTA_NUM_TYPE_NAME = "org.apache.uima.ruta.type.NUM";

public static final String RUTA_MARKUP_TYPE_NAME = "org.apache.uima.ruta.type.MARKUP";

public static final String RUTA_SPECIAL_TYPE_NAME = "org.apache.uima.ruta.type.SPECIAL";

public static final String RUTA_NBSP_TYPE_NAME = "org.apache.uima.ruta.type.NBSP";

// Suffix appended to a slot name to form the name of its left boundary type.
public static final String LEFT_BOUNDARY_EXTENSION = "START";

// Suffix appended to a slot name to form the name of its right boundary type.
public static final String RIGHT_BOUNDARY_EXTENSION = "END";
| |
| public static void log(String str) { |
| if (LOGGING_ENABLED) |
| System.out.println(str); |
| } |
| |
| public static void logIfDebug(String str) { |
| if (DEBUG) |
| log(str); |
| } |
| |
| public static void logIf(boolean condition, String str) { |
| if (LOGGING_ENABLED && condition) |
| System.out.println(str); |
| } |
| |
| public static URL getResourceURL(String name) { |
| return FileLocator.find(TextRulerPlugin.getDefault().getBundle(), new Path(name), null); |
| } |
| |
| public static AnalysisEngineDescription getAnalysisEngineDescription(String descFile) { |
| AnalysisEngineDescription result = null; |
| try { |
| XMLInputSource in = new XMLInputSource(descFile); |
| result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in); |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| result = null; |
| } |
| return result; |
| } |
| |
| public static AnalysisEngineDescription getAnalysisEngineDescription(URL fileURL) { |
| AnalysisEngineDescription result = null; |
| try { |
| XMLInputSource in = new XMLInputSource(fileURL); |
| result = (AnalysisEngineDescription) UIMAFramework.getXMLParser().parseResourceSpecifier(in); |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| result = null; |
| } |
| return result; |
| } |
| |
| public static AnalysisEngine loadAnalysisEngine(AnalysisEngineDescription desc) { |
| AnalysisEngine result = null; |
| try { |
| result = UIMAFramework.produceAnalysisEngine(desc); |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| result = null; |
| } |
| return result; |
| } |
| |
| public static void addBoundaryTypes(AnalysisEngineDescription description, String[] slotNames) { |
| List<String> list = new ArrayList<String>(); |
| for (String eachSlot : slotNames) { |
| list.add(eachSlot + TextRulerToolkit.LEFT_BOUNDARY_EXTENSION); |
| list.add(eachSlot + TextRulerToolkit.RIGHT_BOUNDARY_EXTENSION); |
| } |
| TypeSystemDescription typeSystem = description.getAnalysisEngineMetaData().getTypeSystem(); |
| for (String string : list) { |
| TypeDescription type = typeSystem.getType(string); |
| if (type == null) { |
| typeSystem.addType(string, "", "uima.tcas.Annotation"); |
| } |
| } |
| } |
| |
| public static CAS readCASfromXMIFile(String filename, AnalysisEngine ae, CAS reuseCAS) { |
| return readCASfromXMIFile(new File(filename), ae, reuseCAS); |
| } |
| |
| public static CAS readCASfromXMIFile(File file, AnalysisEngine ae, CAS reuseCAS) { |
| FileInputStream inputStream = null; |
| try { |
| CAS resultCas; |
| inputStream = new FileInputStream(file); |
| if (reuseCAS != null) { |
| reuseCAS.reset(); |
| resultCas = reuseCAS; |
| } else { |
| resultCas = GlobalCASSource.allocCAS(ae); // ae.newCAS(); |
| } |
| XmiCasDeserializer.deserialize(inputStream, resultCas, true); |
| return resultCas; |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| } finally { |
| try { |
| if (inputStream != null) |
| inputStream.close(); |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| } |
| } |
| return null; |
| } |
| |
| public static void writeCAStoXMIFile(CAS aCas, String filename)// throws |
| // IOException, |
| // SAXException |
| { |
| File newFile = new File(filename); |
| FileOutputStream out = null; |
| |
| try { |
| // write XMI |
| out = new FileOutputStream(newFile); |
| XmiCasSerializer ser = new XmiCasSerializer(aCas.getTypeSystem()); |
| XMLSerializer xmlSer = new XMLSerializer(out, false); |
| ser.serialize(aCas, xmlSer.getContentHandler()); |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| } finally { |
| if (out != null) { |
| try { |
| out.close(); |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| } |
| |
| } |
| } |
| } |
| |
| public static List<AnnotationFS> extractAnnotationsForSlotName(CAS aCas, String slotName) { |
| List<AnnotationFS> result = new ArrayList<AnnotationFS>(); |
| TypeSystem ts = aCas.getTypeSystem(); |
| Type slotType = ts.getType(slotName); |
| FSIterator<AnnotationFS> it = aCas.getAnnotationIndex(slotType).iterator(true); |
| if (!it.isValid()) { |
| // System.out.println("##### -> iterator not valid for slots!!"); |
| } |
| while (it.isValid()) { |
| AnnotationFS fs = it.get(); |
| |
| // quick hack for quantifier bug in TM: |
| AnnotationFS previous = result.size() > 0 ? result.get(result.size() - 1) : null; |
| if (previous == null || previous.getBegin() != fs.getBegin() |
| || previous.getEnd() != fs.getEnd()) |
| result.add(fs); |
| else { |
| logIfDebug("******** TM QUANTIFIER BUG ?? Multiple annotation: " + fs.getType().getName()); |
| } |
| it.moveToNext(); |
| } |
| |
| return result; |
| } |
| |
| private static List<AnnotationFS> getAnnotationWithinBounds(CAS aCas, int posStart, int posEnd, |
| Set<String> filterSet, Type rootType) { |
| List<AnnotationFS> result = new ArrayList<AnnotationFS>(); |
| TypeSystem ts = aCas.getTypeSystem(); |
| try { |
| |
| // TODO wie in TMs AnnotationRetrieval evtl nicht den subiterator |
| // nehmen, da der auf |
| // type comparisons basiert, die wir evtl nicht gegeben haben!? |
| AnnotationFS boundaryAnnotation = aCas.createAnnotation(ts.getType("uima.tcas.Annotation"), |
| posStart > 0 ? posStart - 1 : 0, posEnd); // TODO ist das |
| // richtig so??!! |
| FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().subiterator(boundaryAnnotation, true, |
| true); |
| while (it.isValid()) { |
| AnnotationFS fs = it.get(); |
| if (fs.getBegin() < posStart || fs.getEnd() > posEnd) { |
| it.moveToNext(); |
| continue; |
| } |
| if (rootType != null) { |
| if (!ts.subsumes(rootType, fs.getType())) { |
| it.moveToNext(); |
| continue; |
| } |
| } |
| if (filterSet == null || !filterSet.contains(fs.getType().getName())) { |
| result.add(fs); |
| } |
| |
| it.moveToNext(); |
| } |
| |
| } catch (Exception e) { |
| TextRulerPlugin.error(e); |
| } |
| return result; |
| } |
| |
| public static List<AnnotationFS> getAnnotationsBeforePosition(CAS aCas, int position, |
| int maxCount, Set<String> filterSet, Type rootType) { |
| List<AnnotationFS> result = getAnnotationWithinBounds(aCas, 0, position, filterSet, rootType); |
| if (maxCount > 0) { |
| while (result.size() > maxCount) |
| result.remove(0); // remove from front of queue ! |
| } |
| return result; |
| } |
| |
| public static List<AnnotationFS> getAnnotationsAfterPosition(CAS aCas, int position, |
| int maxCount, Set<String> filterSet, Type rootType) { |
| int maxPos = aCas.getDocumentText().length() - 1; |
| List<AnnotationFS> result = getAnnotationWithinBounds(aCas, position, maxPos, filterSet, |
| rootType); |
| if (maxCount > 0) { |
| while (result.size() > maxCount) |
| result.remove(result.size() - 1); // remove from end of queue! |
| } |
| return result; |
| } |
| |
| public static List<AnnotationFS> getAnnotationsWithinBounds(CAS aCas, int start, int end, |
| Set<String> filterSet, Type rootType) { |
| return getAnnotationWithinBounds(aCas, start, end, filterSet, rootType); |
| } |
| |
| public static List<AnnotationFS> getOtherAnnotationsOverToken(CAS aCas, |
| AnnotationFS tmTokenAnnotation, Set<String> filterSet) { |
| List<AnnotationFS> result = new ArrayList<AnnotationFS>(); |
| // filter out document annotation!! |
| FSIterator<AnnotationFS> it = aCas.getAnnotationIndex().iterator(); |
| Type tokenType = tmTokenAnnotation.getType(); |
| FSIterator<AnnotationFS> leftIt = null; |
| FSIterator<AnnotationFS> rightIt = null; |
| TypeSystem ts = aCas.getTypeSystem(); |
| Type tmRootType = ts.getType(RUTA_ALL_TYPE_NAME); |
| Set<String> allFilters = new HashSet<String>(); |
| allFilters.add("uima.tcas.DocumentAnnotation"); |
| allFilters.add(RutaEngine.BASIC_TYPE); |
| if (filterSet != null) |
| allFilters.addAll(filterSet); |
| for (; it.isValid(); it.moveToNext()) { |
| AnnotationFS fs = (AnnotationFS) it.get(); |
| if (fs.getBegin() == tmTokenAnnotation.getBegin() |
| && fs.getEnd() == tmTokenAnnotation.getEnd() && fs.getType().equals(tokenType)) { |
| leftIt = it; |
| |
| rightIt = it.copy(); |
| break; |
| } |
| } |
| if (leftIt == null) |
| return null; // the token annotation was not found ! |
| if (leftIt.isValid()) |
| leftIt.moveToPrevious(); // leave our token annotation behind us... |
| // search from the token annotation to the left |
| for (; leftIt.isValid(); leftIt.moveToPrevious()) { |
| AnnotationFS fs = (AnnotationFS) leftIt.get(); |
| if (fs.getEnd() <= tmTokenAnnotation.getBegin()) |
| break; // if that happens we are out of reach and can stop |
| if (fs.getBegin() <= tmTokenAnnotation.getBegin() |
| && fs.getEnd() >= tmTokenAnnotation.getEnd() |
| && !allFilters.contains(fs.getType().getName()) |
| && !ts.subsumes(tmRootType, fs.getType())) |
| result.add(fs); |
| } |
| |
| // search from the token annotation to the right |
| if (rightIt.isValid()) |
| rightIt.moveToNext(); // leave our token annotation behind us... |
| for (; rightIt.isValid(); rightIt.moveToNext()) { |
| AnnotationFS fs = (AnnotationFS) rightIt.get(); |
| if (fs.getBegin() >= tmTokenAnnotation.getEnd()) |
| break; // if that happens we are out of reach and can stop |
| if (fs.getBegin() <= tmTokenAnnotation.getBegin() |
| && fs.getEnd() >= tmTokenAnnotation.getEnd() |
| && !allFilters.contains(fs.getType().getName()) |
| && !ts.subsumes(tmRootType, fs.getType())) |
| result.add(fs); |
| } |
| return result; |
| } |
| |
| public static synchronized Set<String> getFilterSetWithSlotNames(String[] slotNames, |
| Set<String> otherFilters) { |
| Set<String> result = new HashSet<String>(otherFilters); |
| result.add(RutaEngine.BASIC_TYPE); |
| if (slotNames != null) |
| for (String s : slotNames) |
| result.add(s); |
| return result; |
| } |
| |
| public static synchronized Set<String> getFilterSetWithSlotName(String slotName, |
| Set<String> otherFilters) { |
| String[] sn = { slotName }; |
| return getFilterSetWithSlotNames(sn, otherFilters); |
| } |
| |
| public static synchronized String getStandardFilterSetString() { |
| String str = ""; |
| for (String s : getStandardFilterSet(null)) |
| if (str.length() == 0) |
| str += s; |
| else |
| str += ", " + s; |
| return str; |
| } |
| |
| public static synchronized Set<String> getStandardFilterSet(String[] slotNames) { |
| Set<String> filterSet = new HashSet<String>(); |
| if (slotNames != null) { |
| for (String s : slotNames) |
| filterSet.add(s); |
| } |
| filterSet.add(RUTA_SPACE_TYPE_NAME); |
| filterSet.add(RUTA_BREAK_TYPE_NAME); |
| filterSet.add(RUTA_MARKUP_TYPE_NAME); |
| filterSet.add(RUTA_NBSP_TYPE_NAME); |
| return filterSet; |
| } |
| |
| public static synchronized Set<String> getStandardFeatureFilterSet() { |
| Set<String> filterSet = new HashSet<String>(); |
| |
| filterSet.add("uima.cas.AnnotationBase:sofa"); |
| filterSet.add("uima.tcas.Annotation:begin"); |
| filterSet.add("uima.tcas.Annotation:end"); |
| filterSet.add("org.apache.uima.ruta.type.RutaBasic:Replacement"); |
| return filterSet; |
| } |
| |
| // return the example of the list if found, null otherwise |
| public static synchronized TextRulerExample exampleListContainsAnnotation( |
| List<TextRulerExample> list, TextRulerAnnotation ann) { |
| TextRulerExample needle = new TextRulerExample(null, ann, true, null); |
| int index = Collections.binarySearch(list, needle, new Comparator<TextRulerExample>() { |
| public int compare(TextRulerExample o1, TextRulerExample o2) { |
| TextRulerAnnotation afs1 = o1.getAnnotation(); |
| TextRulerAnnotation afs2 = o2.getAnnotation(); |
| if (afs1.getBegin() < afs2.getBegin()) |
| return -1; |
| else if (afs1.getBegin() > afs2.getBegin()) |
| return 1; |
| else if (afs1.getEnd() > afs2.getEnd()) |
| return -1; |
| else if (afs1.getEnd() < afs2.getEnd()) |
| return 1; |
| else |
| return 0; |
| } |
| }); |
| if (index >= 0) |
| return list.get(index); |
| else |
| return null; |
| } |
| |
| public static synchronized String addTrailingSlashToPath(String path) { |
| if (!(path.endsWith("/") || path.endsWith("\\"))) |
| path = path + System.getProperty("file.separator"); |
| return path; |
| } |
| |
| public static synchronized String createTemporaryDirectory() throws IOException { |
| |
| final File temp; |
| |
| temp = File.createTempFile("temp", Long.toString(System.nanoTime())); |
| if (!(temp.delete())) |
| return null; |
| if (!(temp.mkdir())) |
| return null; |
| temp.deleteOnExit(); |
| return addTrailingSlashToPath(temp.getPath()); |
| } |
| |
| public static synchronized String getTypeShortName(String typeName) { |
| if (typeName.indexOf(".") >= 0) { |
| String components[] = typeName.split("\\."); |
| return components[components.length - 1]; |
| } else |
| return typeName; |
| } |
| |
| public static synchronized String getEngineDescriptorFromTMSourceFile(IPath scriptFilePath) { |
| IPath folder = scriptFilePath; |
| |
| while (!folder.lastSegment().equals(RutaProjectUtils.getDefaultScriptLocation())) { |
| folder = folder.removeLastSegments(1); |
| } |
| IPath relativeTo = scriptFilePath.makeRelativeTo(folder); |
| IPath projectPath = folder.removeLastSegments(1); |
| String elementName = scriptFilePath.lastSegment(); |
| int lastIndexOf = elementName.lastIndexOf(RutaEngine.SCRIPT_FILE_EXTENSION); |
| if (lastIndexOf != -1) { |
| elementName = elementName.substring(0, lastIndexOf); |
| } |
| IPath descPath = projectPath.append(RutaProjectUtils.getDefaultDescriptorLocation()); |
| IPath descPackagePath = descPath.append(relativeTo.removeLastSegments(1)); |
| return descPackagePath.append(elementName + "Engine.xml").toString(); |
| } |
| |
| public static synchronized String getTypeSystemDescriptorFromTMSourceFile(IPath scriptFilePath) { |
| IPath folder = scriptFilePath; |
| |
| while (!folder.lastSegment().equals(RutaProjectUtils.getDefaultScriptLocation())) { |
| folder = folder.removeLastSegments(1); |
| } |
| IPath relativeTo = scriptFilePath.makeRelativeTo(folder); |
| IPath projectPath = folder.removeLastSegments(1); |
| String elementName = scriptFilePath.lastSegment(); |
| int lastIndexOf = elementName.lastIndexOf(RutaEngine.SCRIPT_FILE_EXTENSION); |
| if (lastIndexOf != -1) { |
| elementName = elementName.substring(0, lastIndexOf); |
| } |
| IPath descPath = projectPath.append(RutaProjectUtils.getDefaultDescriptorLocation()); |
| IPath descPackagePath = descPath.append(relativeTo.removeLastSegments(1)); |
| return descPackagePath.append(elementName + "TypeSystem.xml").toString(); |
| } |
| |
| public static synchronized String escapeForRegExp(String aRegexFragment) { |
| final StringBuilder result = new StringBuilder(); |
| |
| final StringCharacterIterator iterator = new StringCharacterIterator(aRegexFragment); |
| char character = iterator.current(); |
| while (character != CharacterIterator.DONE) { |
| /* |
| * All literals need to have backslashes doubled. |
| */ |
| if (character == '.') { |
| result.append("\\."); |
| } else if (character == '\\') { |
| result.append("\\\\"); |
| } else if (character == '?') { |
| result.append("\\?"); |
| } else if (character == '*') { |
| result.append("\\*"); |
| } else if (character == '+') { |
| result.append("\\+"); |
| } else if (character == '&') { |
| result.append("\\&"); |
| } else if (character == ':') { |
| result.append("\\:"); |
| } else if (character == '{') { |
| result.append("\\{"); |
| } else if (character == '}') { |
| result.append("\\}"); |
| } else if (character == '[') { |
| result.append("\\["); |
| } else if (character == ']') { |
| result.append("\\]"); |
| } else if (character == '(') { |
| result.append("\\("); |
| } else if (character == ')') { |
| result.append("\\)"); |
| } else if (character == '^') { |
| result.append("\\^"); |
| } else if (character == '$') { |
| result.append("\\$"); |
| } else { |
| // the char is not a special one |
| // add it to the result as is |
| result.append(character); |
| } |
| character = iterator.next(); |
| } |
| return result.toString(); |
| } |
| |
| public static synchronized String escapeForTMStringParameter(String aTMStringFragment) { |
| final StringBuilder result = new StringBuilder(); |
| |
| final StringCharacterIterator iterator = new StringCharacterIterator(aTMStringFragment); |
| char character = iterator.current(); |
| while (character != CharacterIterator.DONE) { |
| if (character == '"') { |
| result.append("\\\""); |
| } else if (character == '\\') { |
| result.append("\\\\"); |
| } else { |
| result.append(character); |
| } |
| character = iterator.next(); |
| } |
| return result.toString(); |
| } |
| |
| public static synchronized void appendStringToFile(String fileName, String str) { |
| try { |
| File f = new File(fileName); |
| BufferedWriter output; |
| if (!f.exists()) |
| output = new BufferedWriter(new FileWriter(fileName)); |
| else |
| output = new BufferedWriter(new FileWriter(fileName, true)); |
| output.append(str); |
| output.close(); |
| } catch (IOException e) { |
| TextRulerPlugin.error(e); |
| } |
| } |
| |
| public static synchronized TextRulerAnnotation convertToTargetAnnotation(AnnotationFS fs, |
| TextRulerExampleDocument doc, TextRulerTarget target, TypeSystem ts) { |
| AnnotationFS theAnnotation; |
| if (target.type == MLTargetType.SINGLE_LEFT_BOUNDARY) |
| theAnnotation = fs.getCAS().createAnnotation(ts.getType(target.getSingleSlotTypeName()), |
| fs.getBegin(), fs.getBegin()); |
| else if (target.type == MLTargetType.SINGLE_RIGHT_BOUNDARY) |
| theAnnotation = fs.getCAS().createAnnotation(ts.getType(target.getSingleSlotTypeName()), |
| fs.getEnd(), fs.getEnd()); |
| else |
| theAnnotation = fs; |
| return new TextRulerAnnotation(theAnnotation, doc); |
| } |
| |
| public static synchronized List<Feature> getFilteredAnnotationFeatures(AnnotationFS afs) { |
| List<Feature> result = new ArrayList<Feature>(); |
| List<Feature> theFeatures = afs.getType().getFeatures(); |
| Set<String> filters = getStandardFeatureFilterSet(); |
| for (Feature f : theFeatures) |
| if (!filters.contains(f.getName())) |
| result.add(f); |
| return result; |
| } |
| } |