uimafit-core/src/main/java/org/apache/uima/fit/component/CasDumpWriter.java - uima-uimafit - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.uima.fit.component;

 import java.io.File;
 import java.io.FileOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStreamWriter;
 import java.io.PrintWriter;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.Comparator;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.List;
 import java.util.Set;
 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 import org.apache.commons.io.IOUtils;
 import org.apache.commons.io.output.CloseShieldOutputStream;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.FeatureStructure;
 import org.apache.uima.cas.Type;
 import org.apache.uima.cas.impl.FeatureStructureImpl;
 import org.apache.uima.cas.text.AnnotationFS;
 import org.apache.uima.fit.descriptor.ConfigurationParameter;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.springframework.util.DigestUtils;

 /**
  * Dumps CAS content to a text file. This is useful when setting up test cases which contain a
  * reference output to which an actually produced CAS is compared. The format produced by this
  * component is more easily comparable than an XCAS or XMI format.
  *
  */
 public class CasDumpWriter extends CasConsumer_ImplBase {
   /**
    * Pattern inclusion prefix.
    */
   public static final String INCLUDE_PREFIX = "+|";

   /**
    * Pattern exclusion prefix.
    */
   public static final String EXCLUDE_PREFIX = "-|";

   /**
    * Output file. If multiple CASes are processed, their contents are concatenated into this file.
    * Mind that a test case using this consumer with multiple CASes requires a reader which
    * produces the CASes always in the same order. When this file is set to "-", the dump goes to
    * {@link System#out} (default).
    */
   public static final String PARAM_OUTPUT_FILE = "outputFile";

   @ConfigurationParameter(name = PARAM_OUTPUT_FILE, mandatory = true, defaultValue = "-")
   private File outputFile;

   /**
    * Whether to dump the content of the {@link CAS#getDocumentAnnotation()}.
    */
   public static final String PARAM_WRITE_DOCUMENT_META_DATA = "writeDocumentMetaData";

   @ConfigurationParameter(name = PARAM_WRITE_DOCUMENT_META_DATA, mandatory = true, defaultValue = "true")
   private boolean writeDocumentMetaData;

   /**
    * Include/exclude features according to the following patterns. Mind that the patterns do not
    * actually match feature names but lines produced by {@code FeatureStructure.toString()}.
    * <p>
    * Every pattern is tested against every line and the <em>last</em> matching pattern decides
    * whether the line is printed. A line matched by no pattern is not printed. A pattern without
    * {@link #INCLUDE_PREFIX} or {@link #EXCLUDE_PREFIX} is treated as an exclusion.
    */
   public static final String PARAM_FEATURE_PATTERNS = "featurePatterns";

   @ConfigurationParameter(name = PARAM_FEATURE_PATTERNS, mandatory = true, defaultValue = { "+|.*",
       "-|^.*documentUri:.*$", "-|^.*collectionId:.*$", "-|^.*documentBaseUri:.*$" })
   private String[] featurePatterns;

   private InExPattern[] cookedFeaturePatterns;

   /**
    * Include/exclude specified UIMA types in the output.
    * <p>
    * Unlike the feature patterns, the <em>first</em> pattern matching a type name decides. A type
    * matched by no pattern is not printed. A pattern without {@link #INCLUDE_PREFIX} or
    * {@link #EXCLUDE_PREFIX} is treated as an exclusion.
    */
   public static final String PARAM_TYPE_PATTERNS = "typePatterns";

   @ConfigurationParameter(name = PARAM_TYPE_PATTERNS, mandatory = true, defaultValue = { "+|.*" })
   private String[] typePatterns;

   /**
    * Sort increasing by begin, decreasing by end, increasing by name instead of relying on index
    * order.
    */
   public static final String PARAM_SORT = "sort";

   @ConfigurationParameter(name = PARAM_SORT, mandatory = true, defaultValue = "false")
   private boolean sort;

   private InExPattern[] cookedTypePatterns;

   private PrintWriter out;

   /** Running number of the CAS being dumped; used to label the begin/end markers. */
   private int iCas;

   /**
    * Opens the dump target, unless a writer is already open, and compiles the type and feature
    * patterns.
    *
    * @param context
    *          the UIMA context handed on to the super class.
    * @throws ResourceInitializationException
    *           if the output file cannot be opened.
    */
   @Override
   public void initialize(UimaContext context) throws ResourceInitializationException {
     super.initialize(context);

     try {
       if (out == null) {
         if ("-".equals(outputFile.getName())) {
           // Wrap System.out in a close shield so that closing the writer at the end of the
           // collection does not close the console stream itself.
           out = new PrintWriter(new CloseShieldOutputStream(System.out));
         } else {
           File parent = outputFile.getParentFile();
           if (parent != null) {
             parent.mkdirs();
           }
           out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));
         }
       }
     } catch (IOException e) {
       throw new ResourceInitializationException(e);
     }

     cookedTypePatterns = compilePatterns(typePatterns);
     cookedFeaturePatterns = compilePatterns(featurePatterns);
   }

   /**
    * Dumps all views of the given CAS, framed by begin/end markers carrying the running CAS
    * number, and flushes the writer afterwards.
    *
    * @param aCAS
    *          the CAS to dump.
    * @throws AnalysisEngineProcessException
    *           if the sofa data of a view cannot be read.
    */
   @Override
   public void process(CAS aCAS) throws AnalysisEngineProcessException {
     out.println("======== CAS " + iCas + " begin ==================================");
     out.println();

     Iterator<CAS> viewIt = aCAS.getViewIterator();
     while (viewIt.hasNext()) {
       CAS view = viewIt.next();
       processView(view);

       // Views without document text may still carry other sofa data. The check whether a data
       // stream exists is done inside processSofaData so that the stream is obtained only once
       // and is always closed again.
       if (view.getDocumentText() == null) {
         processSofaData(view);
       }
     }

     out.println("======== CAS " + iCas + " end ==================================");
     out.println();
     out.println();
     out.flush();

     iCas++;
   }

   /**
    * Closes the writer and drops the reference to it.
    */
   @Override
   public void collectionProcessComplete() {
     IOUtils.closeQuietly(out);
     out = null;
   }

   /**
    * Prints the document annotation of the view if {@link #PARAM_WRITE_DOCUMENT_META_DATA} is
    * enabled.
    */
   private void processDocumentMetadata(CAS aCAS) {
     if (writeDocumentMetaData) {
       processFeatureStructure(aCAS.getDocumentAnnotation());
     }
   }

   /**
    * Prints the document text of the view under a "CAS-Text:" heading.
    */
   private void processDocumentText(CAS aCAS) {
     out.println();
     out.println("CAS-Text:");
     out.println(aCAS.getDocumentText());
   }

   /**
    * Prints the covered text and the filtered feature lines of every annotation in the annotation
    * index whose type is selected by the type patterns. If {@link #PARAM_SORT} is enabled, the
    * annotations are sorted first.
    */
   private void processFeatureStructures(CAS aCAS) {
     Set<String> typesToPrint = getTypes(aCAS);
     Iterator<AnnotationFS> annotationIterator = aCAS.getAnnotationIndex().iterator();

     if (sort) {
       List<AnnotationFS> sortedFS = new ArrayList<AnnotationFS>();
       while (annotationIterator.hasNext()) {
         sortedFS.add(annotationIterator.next());
       }

       Collections.sort(sortedFS, new Comparator<AnnotationFS>() {
         @Override
         public int compare(AnnotationFS aO1, AnnotationFS aO2) {
           // Increasing by begin offset.
           int begin = compareInts(aO1.getBegin(), aO2.getBegin());
           if (begin != 0) {
             return begin;
           }

           // Decreasing by end offset, hence the swapped operands.
           int end = compareInts(aO2.getEnd(), aO1.getEnd());
           if (end != 0) {
             return end;
           }

           // Increasing by type name.
           int name = aO1.getType().getName().compareTo(aO2.getType().getName());
           if (name != 0) {
             return name;
           }

           // Last resort: try the address.
           if (aO1 instanceof FeatureStructureImpl && aO2 instanceof FeatureStructureImpl) {
             return compareInts(((FeatureStructureImpl) aO1).getAddress(),
                     ((FeatureStructureImpl) aO2).getAddress());
           }

           // Nothing left to tell the two annotations apart.
           return 0;
         }
       });

       annotationIterator = sortedFS.iterator();
     }

     while (annotationIterator.hasNext()) {
       AnnotationFS annotation = annotationIterator.next();
       if (!typesToPrint.contains(annotation.getType().getName())) {
         continue;
       }
       try {
         out.println("[" + annotation.getCoveredText() + "]");
       } catch (IndexOutOfBoundsException e) {
         // The offsets do not fit the document text; print a marker instead of failing.
         out.println("<OFFSETS OUT OF BOUNDS>");
       }
       processFeatureStructure(annotation);
     }
   }

   /**
    * Compares two {@code int} values without the overflow risk of a plain subtraction. Written
    * out by hand because nothing else in this class relies on library methods newer than Java 6.
    *
    * @return a negative value, zero or a positive value if the first argument is less than, equal
    *         to or greater than the second.
    */
   private static int compareInts(int aLeft, int aRight) {
     if (aLeft < aRight) {
       return -1;
     }
     return (aLeft == aRight) ? 0 : 1;
   }

   /**
    * Prints those lines of {@code aFS.toString()} that are selected by the feature patterns. All
    * patterns are evaluated for each line and the last matching one decides.
    */
   private void processFeatureStructure(FeatureStructure aFS) {
     String meta = aFS.toString();
     for (String line : meta.split("\n")) {
       boolean print = false;
       for (InExPattern p : cookedFeaturePatterns) {
         p.matcher.reset(line);
         if (p.matcher.matches()) {
           print = p.includeInOutput;
         }
       }
       if (print) {
         out.println(line);
       }
     }
   }

   /**
    * Prints document meta data, document text and annotations of one view, framed by begin/end
    * markers carrying the view name.
    */
   private void processView(CAS aCAS) {
     out.println("-------- View " + aCAS.getViewName() + " begin ----------------------------------");
     out.println();

     processDocumentMetadata(aCAS);
     processDocumentText(aCAS);
     processFeatureStructures(aCAS);

     out.println("-------- View " + aCAS.getViewName() + " end ----------------------------------");
     out.println();
   }

   /**
    * Prints mime type, size and MD5 hash of the sofa data of the view. Does nothing if the view
    * provides no sofa data stream.
    *
    * @throws AnalysisEngineProcessException
    *           if reading the sofa data stream fails.
    */
   private void processSofaData(CAS aCAS) throws AnalysisEngineProcessException {
     InputStream in = aCAS.getSofaDataStream();
     if (in == null) {
       return;
     }

     byte[] bytes;
     try {
       out.println("Sofa data:");

       // Mime type
       String mimeType = aCAS.getSofaMimeType();
       if (mimeType != null) {
         out.println("   mime type:\t" + mimeType);
       }

       // Data
       bytes = IOUtils.toByteArray(in);
     } catch (IOException e) {
       throw new AnalysisEngineProcessException(e);
     } finally {
       IOUtils.closeQuietly(in);
     }

     // Data size
     out.println("   size:\t" + bytes.length + " byte(s)");
     // Hash value of the bytes
     out.println("   hash value:\t" + DigestUtils.md5DigestAsHex(bytes));

     out.println();
   }

   /**
    * Turns pattern strings into compiled patterns. The prefix ({@link #INCLUDE_PREFIX} or
    * {@link #EXCLUDE_PREFIX}) is stripped before compiling; a pattern without prefix is compiled
    * as is and treated as an exclusion.
    */
   private static InExPattern[] compilePatterns(String[] aPatterns) {
     InExPattern[] patterns = new InExPattern[aPatterns.length];
     for (int i = 0; i < aPatterns.length; i++) {
       String raw = aPatterns[i];
       if (raw.startsWith(INCLUDE_PREFIX)) {
         patterns[i] = new InExPattern(raw.substring(INCLUDE_PREFIX.length()), true);
       } else if (raw.startsWith(EXCLUDE_PREFIX)) {
         patterns[i] = new InExPattern(raw.substring(EXCLUDE_PREFIX.length()), false);
       } else {
         patterns[i] = new InExPattern(raw, false);
       }
     }
     return patterns;
   }

   /**
    * Collects the names of all types of the type system that are selected by the type patterns.
    * The type of the document annotation is never included. For every type the first matching
    * pattern decides.
    */
   private Set<String> getTypes(CAS cas) {
     Set<String> types = new HashSet<String>();
     // Does not change while iterating, so resolve it once up front.
     String documentAnnotationTypeName = cas.getDocumentAnnotation().getType().getName();

     Iterator<Type> typeIt = cas.getTypeSystem().getTypeIterator();
     while (typeIt.hasNext()) {
       String typeName = typeIt.next().getName();

       if (typeName.equals(documentAnnotationTypeName)) {
         continue;
       }

       for (InExPattern p : cookedTypePatterns) {
         p.matcher.reset(typeName);
         if (p.matcher.matches()) {
           // Each type name is visited once and starts out absent from the set, so an excluding
           // match simply leaves it out.
           if (p.includeInOutput) {
             types.add(typeName);
           }
           break;
         }
       }
     }
     return types;
   }

   /**
    * A compiled pattern together with the decision it stands for. The matcher is created once and
    * re-used via {@link Matcher#reset(CharSequence)}.
    */
   private static class InExPattern {
     final boolean includeInOutput;

     final Matcher matcher;

     public InExPattern(String aPattern, boolean aInclude) {
       includeInOutput = aInclude;
       matcher = Pattern.compile(aPattern).matcher("");
     }
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.uima.fit.component;

	import java.io.File;
	import java.io.FileOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStreamWriter;
	import java.io.PrintWriter;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.Comparator;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.List;
	import java.util.Set;
	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	import org.apache.commons.io.IOUtils;
	import org.apache.commons.io.output.CloseShieldOutputStream;
	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
	import org.apache.uima.cas.CAS;
	import org.apache.uima.cas.FeatureStructure;
	import org.apache.uima.cas.Type;
	import org.apache.uima.cas.impl.FeatureStructureImpl;
	import org.apache.uima.cas.text.AnnotationFS;
	import org.apache.uima.fit.descriptor.ConfigurationParameter;
	import org.apache.uima.resource.ResourceInitializationException;
	import org.springframework.util.DigestUtils;

	/**
	* Dumps CAS content to a text file. This is useful when setting up test cases which contain a
	* reference output to which an actually produced CAS is compared. The format produced by this
	* component is more easily comparable than an XCAS or XMI format.
	*
	*/
	public class CasDumpWriter extends CasConsumer_ImplBase {
	/**
	* Pattern inclusion prefix.
	*/
	public static final String INCLUDE_PREFIX = "+\|";

	/**
	* Pattern exclusion prefix.
	*/
	public static final String EXCLUDE_PREFIX = "-\|";

	/**
	* Output file. If multiple CASes as processed, their contents are concatenated into this file.
	* Mind that a test case using this consumer with multiple CASes requires a reader which produced
	* the CASes always in the same order. When this file is set to "-", the dump does to
	* {@link System#out} (default).
	*/
	public static final String PARAM_OUTPUT_FILE = "outputFile";

	@ConfigurationParameter(name = PARAM_OUTPUT_FILE, mandatory = true, defaultValue = "-")
	private File outputFile;

	/**
	* Whether to dump the content of the {@link CAS#getDocumentAnnotation()}.
	*/
	public static final String PARAM_WRITE_DOCUMENT_META_DATA = "writeDocumentMetaData";

	@ConfigurationParameter(name = PARAM_WRITE_DOCUMENT_META_DATA, mandatory = true, defaultValue = "true")
	private boolean writeDocumentMetaData;

	/**
	* Include/exclude features according to the following patterns. Mind that the patterns do not
	* actually match feature names but lines produced by {@code FeatureStructure.toString()}.
	*/
	public static final String PARAM_FEATURE_PATTERNS = "featurePatterns";

	@ConfigurationParameter(name = PARAM_FEATURE_PATTERNS, mandatory = true, defaultValue = { "+\|.*",
	"-\|^.documentUri:.$", "-\|^.collectionId:.$", "-\|^.documentBaseUri:.$" })
	private String[] featurePatterns;

	private InExPattern[] cookedFeaturePatterns;

	/**
	* Include/exclude specified UIMA types in the output.
	*/
	public static final String PARAM_TYPE_PATTERNS = "typePatterns";

	@ConfigurationParameter(name = PARAM_TYPE_PATTERNS, mandatory = true, defaultValue = { "+\|.*" })
	private String[] typePatterns;

	/**
	* Sort increasing by begin, decreasing by end, increasing by name instead of relying on index
	* order.
	*/
	public static final String PARAM_SORT = "sort";

	@ConfigurationParameter(name = PARAM_SORT, mandatory = true, defaultValue = "false")
	private boolean sort;

	private InExPattern[] cookedTypePatterns;

	private PrintWriter out;

	private int iCas;

	@Override
	public void initialize(UimaContext context) throws ResourceInitializationException {
	super.initialize(context);

	try {
	if (out == null) {
	if ("-".equals(outputFile.getName())) {
	out = new PrintWriter(new CloseShieldOutputStream(System.out));
	} else {
	if (outputFile.getParentFile() != null) {
	outputFile.getParentFile().mkdirs();
	}
	out = new PrintWriter(new OutputStreamWriter(new FileOutputStream(outputFile), "UTF-8"));
	}
	}
	} catch (IOException e) {
	throw new ResourceInitializationException(e);
	}

	cookedTypePatterns = compilePatterns(typePatterns);
	cookedFeaturePatterns = compilePatterns(featurePatterns);
	}

	@Override
	public void process(CAS aCAS) throws AnalysisEngineProcessException {
	out.println("======== CAS " + iCas + " begin ==================================");
	out.println();

	Iterator<CAS> viewIt = aCAS.getViewIterator();
	while (viewIt.hasNext()) {
	CAS view = viewIt.next();
	processView(view);

	if (view.getDocumentText() == null && view.getSofaDataStream() != null) {
	processSofaData(view);
	}
	}

	out.println("======== CAS " + iCas + " end ==================================");
	out.println();
	out.println();
	out.flush();

	iCas++;
	}

	@Override
	public void collectionProcessComplete() {
	IOUtils.closeQuietly(out);
	out = null;
	}

	private void processDocumentMetadata(CAS aCAS) {
	if (!writeDocumentMetaData) {
	return;
	}

	processFeatureStructure(aCAS.getDocumentAnnotation());
	}

	private void processDocumentText(CAS aCAS) {
	out.println();
	out.println("CAS-Text:");
	out.println(aCAS.getDocumentText());
	}

	private void processFeatureStructures(CAS aCAS) {
	Set<String> typesToPrint = getTypes(aCAS);
	Iterator<AnnotationFS> annotationIterator = aCAS.getAnnotationIndex().iterator();

	if (sort) {
	List<AnnotationFS> sortedFS = new ArrayList<AnnotationFS>();
	while (annotationIterator.hasNext()) {
	sortedFS.add(annotationIterator.next());
	}

	Collections.sort(sortedFS, new Comparator<AnnotationFS>() {
	@Override
	public int compare(AnnotationFS aO1, AnnotationFS aO2) {
	int begin = aO1.getBegin() - aO2.getBegin();
	if (begin != 0) {
	return begin;
	}

	int end = aO2.getEnd() - aO1.getEnd();
	if (end != 0) {
	return end;
	}

	int name = aO1.getType().getName().compareTo(aO2.getType().getName());
	if (name != 0) {
	return name;
	}

	// Last resort: try the address.
	if (aO1 instanceof FeatureStructureImpl && aO2 instanceof FeatureStructureImpl) {
	return ((FeatureStructureImpl) aO1).getAddress()
	- ((FeatureStructureImpl) aO2).getAddress();
	}

	// Fall back to name.
	return name;
	}
	});

	annotationIterator = sortedFS.iterator();
	}

	while (annotationIterator.hasNext()) {
	AnnotationFS annotation = annotationIterator.next();
	if (!typesToPrint.contains(annotation.getType().getName())) {
	continue;
	}
	try {
	out.println("[" + annotation.getCoveredText() + "]");
	} catch (IndexOutOfBoundsException e) {
	out.println("<OFFSETS OUT OF BOUNDS>");
	}
	processFeatureStructure(annotation);
	}
	}

	private void processFeatureStructure(FeatureStructure aFS) {
	String meta = aFS.toString();
	for (String line : meta.split("\n")) {
	boolean print = false;
	for (InExPattern p : cookedFeaturePatterns) {
	p.matchter.reset(line);
	if (p.matchter.matches()) {
	print = p.includeInOutput;
	}
	}
	if (print) {
	out.println(line);
	}
	}
	}

	private void processView(CAS aCAS) {
	out.println("-------- View " + aCAS.getViewName() + " begin ----------------------------------");
	out.println();

	processDocumentMetadata(aCAS);
	processDocumentText(aCAS);
	processFeatureStructures(aCAS);

	out.println("-------- View " + aCAS.getViewName() + " end ----------------------------------");
	out.println();
	}

	private void processSofaData(CAS aCAS) throws AnalysisEngineProcessException {
	out.println("Sofa data:");

	//

	// Mime type
	String mimeType = aCAS.getSofaMimeType();
	if (mimeType != null) {
	out.println(" mime type:\t" + mimeType);
	}
	// Data
	byte[] bytes = null;
	InputStream in = null;
	try {
	in = aCAS.getSofaDataStream();
	bytes = IOUtils.toByteArray(in);
	} catch (IOException e) {
	throw new AnalysisEngineProcessException(e);
	} finally {
	IOUtils.closeQuietly(in);
	}
	if (bytes != null) {
	// Data size
	out.println(" size:\t" + bytes.length + " byte(s)");
	// Hash value of the bytes
	String hash = DigestUtils.md5DigestAsHex(bytes);
	out.println(" hash value:\t" + hash);
	}

	out.println();
	}

	private static InExPattern[] compilePatterns(String[] aPatterns) {
	InExPattern[] patterns = new InExPattern[aPatterns.length];
	for (int i = 0; i < aPatterns.length; i++) {
	if (aPatterns[i].startsWith(INCLUDE_PREFIX)) {
	patterns[i] = new InExPattern(aPatterns[i].substring(INCLUDE_PREFIX.length()), true);
	} else if (aPatterns[i].startsWith(EXCLUDE_PREFIX)) {
	patterns[i] = new InExPattern(aPatterns[i].substring(EXCLUDE_PREFIX.length()), false);
	} else {
	patterns[i] = new InExPattern(aPatterns[i], false);
	}
	}
	return patterns;
	}

	private Set<String> getTypes(CAS cas) {
	Set<String> types = new HashSet<String>();
	Iterator<Type> typeIt = cas.getTypeSystem().getTypeIterator();
	nextType: while (typeIt.hasNext()) {
	Type type = typeIt.next();

	if (type.getName().equals(cas.getDocumentAnnotation().getType().getName())) {
	continue;
	}

	for (InExPattern p : cookedTypePatterns) {
	p.matchter.reset(type.getName());
	if (p.matchter.matches()) {
	if (p.includeInOutput) {
	types.add(type.getName());
	} else {
	types.remove(type.getName());
	}
	continue nextType;
	}
	}
	}
	return types;
	}

	private static class InExPattern {
	final boolean includeInOutput;

	final Matcher matchter;

	public InExPattern(String aPattern, boolean aInclude) {
	includeInOutput = aInclude;
	matchter = Pattern.compile(aPattern).matcher("");
	}
	}
	}