ctakes-core/src/main/java/org/apache/ctakes/core/cc/NormalizedFilesInDirectoryCasConsumer.java - ctakes - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.ctakes.core.cc;

 import org.apache.ctakes.core.pipeline.PipeBitInfo;
 import org.apache.ctakes.core.util.annotation.WordTokenUtil;
 import org.apache.ctakes.core.util.doc.DocIdUtil;
 import org.apache.ctakes.typesystem.type.syntax.WordToken;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
 import org.apache.uima.collection.CasConsumer_ImplBase;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.JFSIndexRepository;
 import org.apache.uima.jcas.tcas.Annotation;
 import org.apache.uima.resource.ResourceInitializationException;
 import org.apache.uima.resource.ResourceProcessException;

 import java.io.*;

 /**
  * CAS consumer that writes one text file per CAS into a configured output directory.
  * <p>
  * The file content is the canonical form of every {@link WordToken} found in the CAS, as returned by
  * {@link WordTokenUtil#getCanonicalForm(WordToken)}, each followed by a single space.  The tokens are
  * written in the order in which the annotation index iterates them.  The file is named after the
  * document id returned by {@link DocIdUtil#getDocumentID(JCas)}.
  * <p>
  * The output directory is taken from the {@value #PARAM_OUTPUTDIR} configuration parameter and must
  * already exist when {@link #initialize()} runs.
  * <p>
  * Note: the {@code PipeBitInfo} description below speaks of "original text"; what this class actually
  * writes is the canonical token forms described above.
  */

 @PipeBitInfo(
       name = "Document Text Writer (Dir)",
       description = "Writes Text files with original text from the document in a specified directory.",
       role = PipeBitInfo.Role.WRITER,
       dependencies = { PipeBitInfo.TypeProduct.DOCUMENT_ID, PipeBitInfo.TypeProduct.BASE_TOKEN }
 )
 public class NormalizedFilesInDirectoryCasConsumer extends CasConsumer_ImplBase {

    /** Name of the configuration parameter holding the path of the output directory. */
    public static final String PARAM_OUTPUTDIR = "OutputDirectory";

    /** Directory that receives one file per processed CAS; set by {@link #initialize()}. */
    File iv_outputDirectory;

    /**
     * Reads the {@value #PARAM_OUTPUTDIR} parameter and verifies that it names an existing directory.
     *
     * @throws ResourceInitializationException if the parameter is not set or does not point to an
     *                                         existing directory
     */
    @Override
    public void initialize() throws ResourceInitializationException {
       final String outputDirectoryName = (String)getConfigParameterValue( PARAM_OUTPUTDIR );
       // new File( (String)null ) would throw a bare NullPointerException; report a configuration error instead.
       if ( outputDirectoryName == null ) {
          throw new ResourceInitializationException(
                new Exception( "Parameter setting 'OutputDirectory' is not set." ) );
       }
       iv_outputDirectory = new File( outputDirectoryName );
       if ( !iv_outputDirectory.exists() || !iv_outputDirectory.isDirectory() ) {
          throw new ResourceInitializationException(
                new Exception( "Parameter setting 'OutputDirectory' does not point to an existing directory." ) );
       }
    }

    /**
     * Concatenates the canonical form of each {@link WordToken} in the CAS, space-terminated, and writes
     * the result to a file named after the document id.
     *
     * @param cas the CAS to process
     * @throws ResourceProcessException if the JCas cannot be obtained or the file cannot be written
     */
    @Override
    public void processCas( final CAS cas ) throws ResourceProcessException {
       try {
          final JCas jcas = cas.getJCas();
          final StringBuilder normalizedText = new StringBuilder();
          final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
          // Collect the canonical form of every WordToken; each form is followed by one space.
          for ( Annotation annotation : indexes.getAnnotationIndex( WordToken.type ) ) {
             if ( annotation instanceof WordToken ) {
                normalizedText.append( WordTokenUtil.getCanonicalForm( (WordToken)annotation ) );
                normalizedText.append( " " );
             }
          }
          final String documentID = DocIdUtil.getDocumentID( jcas );
          writeToFile( documentID, normalizedText.toString() );
       } catch ( CASException | IOException multE ) {
          throw new ResourceProcessException( multE );
       }
    }

    /**
     * Writes the given text to a file in the output directory, replacing any existing content.
     *
     * @param documentID   name of the file to write, relative to the output directory
     * @param documentText text to write; encoded as UTF-8
     * @throws IOException if the file cannot be opened or written
     */
    private void writeToFile( final String documentID, final String documentText ) throws IOException {
       final File outputFile = new File( iv_outputDirectory, documentID );
       // FileOutputStream creates the file if needed, so no separate createNewFile() call is required.
       // try-with-resources flushes and closes the stream even when write() fails.
       try ( OutputStream out = new BufferedOutputStream( new FileOutputStream( outputFile ) ) ) {
          // Explicit charset so the bytes written do not depend on the platform default encoding.
          out.write( documentText.getBytes( java.nio.charset.StandardCharsets.UTF_8 ) );
       }
    }
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.ctakes.core.cc;

	import org.apache.ctakes.core.pipeline.PipeBitInfo;
	import org.apache.ctakes.core.util.annotation.WordTokenUtil;
	import org.apache.ctakes.core.util.doc.DocIdUtil;
	import org.apache.ctakes.typesystem.type.syntax.WordToken;
	import org.apache.uima.cas.CAS;
	import org.apache.uima.cas.CASException;
	import org.apache.uima.collection.CasConsumer_ImplBase;
	import org.apache.uima.jcas.JCas;
	import org.apache.uima.jcas.JFSIndexRepository;
	import org.apache.uima.jcas.tcas.Annotation;
	import org.apache.uima.resource.ResourceInitializationException;
	import org.apache.uima.resource.ResourceProcessException;

	import java.io.*;

	/**
	* For each CAS a local file with the document text is written to a directory specifed by a parameter.
	* This CAS consumer does not make use of any annotation information in the cas except for the document
	* id specified the CommonTypeSystem.xml descriptor. The document id will be the name of the file written
	* for each CAS.
	* <p/>
	* This CAS consumer may be useful if you want to write the results of a collection reader and/or CAS
	* initializer to the local file system. For example, a JDBC Collection Reader may read XML documents
	* from a database and a specialized cas initializer may convert the XML to plain text. The
	* FilesInDirectoryCasConsumer can now be used to write the plain text to local plain text files.
	*/

	@PipeBitInfo(
	name = "Document Text Writer (Dir)",
	description = "Writes Text files with original text from the document in a specified directory.",
	role = PipeBitInfo.Role.WRITER,
	dependencies = { PipeBitInfo.TypeProduct.DOCUMENT_ID, PipeBitInfo.TypeProduct.BASE_TOKEN }
	)
	public class NormalizedFilesInDirectoryCasConsumer extends CasConsumer_ImplBase {

	public static final String PARAM_OUTPUTDIR = "OutputDirectory";

	File iv_outputDirectory;

	@Override
	public void initialize() throws ResourceInitializationException {
	String outputDirectoryName = (String)getConfigParameterValue( PARAM_OUTPUTDIR );
	iv_outputDirectory = new File( outputDirectoryName );
	if ( !iv_outputDirectory.exists() \|\| !iv_outputDirectory.isDirectory() ) {
	throw new ResourceInitializationException(
	new Exception( "Parameter setting 'OutputDirectory' does not point to an existing directory." ) );
	}
	}

	/**
	* {@inheritDoc}
	*/
	@Override
	public void processCas( final CAS cas ) throws ResourceProcessException {
	try {
	final JCas jcas = cas.getJCas();
	final StringBuilder normalizedText = new StringBuilder();
	final JFSIndexRepository indexes = jcas.getJFSIndexRepository();
	// Determine and set the normalized form for each <code>BaseToken</code>
	for ( Annotation annotation : indexes.getAnnotationIndex( WordToken.type ) ) {
	if ( annotation instanceof WordToken ) {
	normalizedText.append( WordTokenUtil.getCanonicalForm( (WordToken)annotation ) );
	normalizedText.append( " " );
	}
	}
	final String documentID = DocIdUtil.getDocumentID( jcas );
	writeToFile( documentID, normalizedText.toString() );
	} catch ( CASException \| IOException multE ) {
	throw new ResourceProcessException( multE );
	}
	}

	private void writeToFile( String documentID, String documentText ) throws IOException {
	File outputFile = new File( iv_outputDirectory, documentID );
	outputFile.createNewFile();
	OutputStream out = new BufferedOutputStream( new FileOutputStream( outputFile ) );
	out.write( documentText.getBytes() );
	out.flush();
	out.close();
	}
	}