// blob: de155852c88d4d73bdb05bf453ea0bf2adbf7cdf [file] [log] [blame]
package org.apache.ctakes.examples.cc;
import org.apache.ctakes.core.cc.AbstractJCasFileWriter;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
import org.apache.ctakes.core.util.log.DotLogger;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.ResourceInitializationException;
import java.io.*;
import java.nio.charset.StandardCharsets;
import static org.apache.ctakes.core.pipeline.PipeBitInfo.TypeProduct.*;
/**
 * Writes a text file listing the base tokens of a document together with their character spans.
 * <p>
 * Each output line has the form {@code text|begin,end}, terminated by {@code \n}.
 * Newline tokens are written with the placeholder text {@code <EOL>} instead of their covered text.
 * </p>
 *
 * @author SPF , chip-nlp
 * @since {3/3/2023}
 */
@PipeBitInfo(
      name = "TokenSpanWriter",
      description = "Writes files listing base tokens and their spans in a directory tree.",
      role = PipeBitInfo.Role.WRITER,
      usables = { DOCUMENT_ID_PREFIX, BASE_TOKEN }
)
public class TokenSpanWriter extends AbstractJCasFileWriter {
   // If you do not need to utilize the entire cas, or need more than the doc cas, consider AbstractFileWriter<T>.

   static private final Logger LOGGER = Logger.getLogger( "TokenSpanWriter" );

   // to add a configuration parameter, type "param" and hit tab.

   /**
    * {@inheritDoc}
    * <p>
    * The output file is {@code <outputDir>/<documentId>_tokenSpans.txt}; the {@code fileName}
    * argument is not used. Output is always encoded as UTF-8.
    * </p>
    *
    * @param jCas       cas holding the document text and its {@link BaseToken} annotations.
    * @param outputDir  directory in which the token span file is created.
    * @param documentId document identifier, used as the prefix of the output file name.
    * @param fileName   unused by this writer.
    * @throws IOException if the output file cannot be opened or written.
    */
   @Override
   public void writeFile( JCas jCas,
                          String outputDir,
                          String documentId,
                          String fileName ) throws IOException {
      final File file = new File( outputDir, documentId + "_tokenSpans.txt" );
      final String docText = jCas.getDocumentText();
      // FileWriter( File ) encodes with the platform default charset, which makes the output differ
      // between hosts and can mangle non-ASCII token text.  Use an explicit UTF-8 encoder instead.
      try ( Writer writer = new BufferedWriter(
            new OutputStreamWriter( new FileOutputStream( file ), StandardCharsets.UTF_8 ) ) ) {
         for ( BaseToken token : JCasUtil.select( jCas, BaseToken.class ) ) {
            final int begin = token.getBegin();
            final int end = token.getEnd();
            // A newline token's covered text would break the one-token-per-line layout, so use a placeholder.
            final String text = token instanceof NewlineToken ? "<EOL>" : docText.substring( begin, end );
            writer.write( text + "|" + begin + "," + end + "\n" );
         }
      }
   }

}