blob: 4f2c49795cedd7fc6d447041c947b95a02ebeb36 [file] [log] [blame]
package org.apache.ctakes.core.cc;
import org.apache.ctakes.core.pipeline.PipeBitInfo;
import org.apache.ctakes.typesystem.type.syntax.BaseToken;
import org.apache.ctakes.typesystem.type.syntax.NewlineToken;
import org.apache.uima.fit.util.JCasUtil;
import org.apache.uima.jcas.JCas;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import static org.apache.ctakes.core.pipeline.PipeBitInfo.TypeProduct.*;
/**
* @author SPF , chip-nlp
* @since {3/3/2023}
*/
@PipeBitInfo(
name = "Token Table Writer",
description = "Writes a table of base tokens and their spans in a directory tree.",
role = PipeBitInfo.Role.WRITER,
usables = { DOCUMENT_ID_PREFIX, BASE_TOKEN }
)
final public class TokenTableFileWriter extends AbstractTableFileWriter {
/**
* {@inheritDoc}
*/
@Override
protected List<String> createHeaderRow( final JCas jCas ) {
return Arrays.asList( " Token Text ", " Text Span " );
}
/**
* {@inheritDoc}
*/
@Override
protected List<List<String>> createDataRows( final JCas jCas ) {
final List<List<String>> dataRows = new ArrayList<>();
final String docText = jCas.getDocumentText();
for ( BaseToken token : JCasUtil.select( jCas, BaseToken.class ) ) {
final int begin = token.getBegin();
final int end = token.getEnd();
final String text = token instanceof NewlineToken ? "<EOL>" : docText.substring( begin, end );
dataRows.add( Arrays.asList( text, begin + "," + end ) );
}
return dataRows;
}
}