blob: 08464a874342dcbc03a2f8c315ce8acf833302db [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.ctakes.smokingstatus.ae;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Locale;
import java.util.Map;
import java.util.Set;
import java.util.TreeMap;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.log4j.Logger;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.resource.ResourceInitializationException;
/**
 * UIMA annotator that uses some patterns and some rules about those patterns to
 * adjust certain annotations.
 *
 * <p>This annotator was extended to handle sentence boundaries for the Smoking
 * status classification. Example: "Tobacco: none" has two sentences as detected
 * by the sentence boundary detector. This annotator merges them into one
 * sentence to enable correct negation detection.
 *
 * <p>Merge rule implemented here: a sentence whose covered text ends with ":"
 * absorbs the immediately following sentence (by sentence number) when that
 * following sentence starts, case-insensitively, with one of the configured
 * pattern words. The absorbed sentence is removed from the CAS indexes.
 */
public class SentenceAdjuster extends JCasAnnotator_ImplBase {
  // TODO @ConfigurationParam
  /**
   * The list of words ("and" "&") to ignore in pattern matching.
   *
   * <p>NOTE(review): the values are loaded and logged, but nothing in this class
   * consults them while matching.
   */
  public static final String PARAM_IGNORE_WORDS = "WordsToIgnore";

  /**
   * The list of words ("none", "no", etc) used in the pattern.
   */
  public static final String PARAM_WORDS_IN_PATTERN = "WordsInPattern";

  // LOG4J logger based on class name
  public Logger iv_logger = Logger.getLogger(getClass().getName());

  private UimaContext context;
  private Set<String> wordsToIgnore;
  private Set<String> wordsInPattern;
  private boolean useSegments;
  private Set<String> skipSegmentsSet;

  /**
   * Performs initialization logic. This implementation just reads values for
   * the configuration parameters. This method is not invoked for every
   * document processed.
   *
   * @param aContext the UIMA context holding the configuration parameters
   * @throws ResourceInitializationException if the superclass initialization
   *         fails or the configuration cannot be read
   */
  @Override
  public void initialize(final UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    context = aContext;
    try {
      configInit();
    } catch (AnnotatorContextException e) {
      // Surface configuration failures instead of continuing half-initialized.
      throw new ResourceInitializationException(e);
    }
  }

  /**
   * Sets configuration parameters with values from the descriptor.
   *
   * <p>Array parameters without a value are treated as empty lists and a
   * missing "UseSegments" value is treated as {@code false}.
   *
   * @throws AnnotatorContextException declared for callers; not thrown by the
   *         code visible in this method
   */
  private void configInit() throws AnnotatorContextException {
    // populate the set of words that we will ignore when pattern matching
    // TODO @ConfigurationParam
    String[] ignoreWords = readStringArrayParam(PARAM_IGNORE_WORDS);
    wordsToIgnore = toSet(ignoreWords);
    if (iv_logger.isInfoEnabled()) {
      iv_logger.info("Loaded list of " + ignoreWords.length
          + " words to ignore during adjustment.");
    }

    // populate the set of the laterality or tobacco-related words
    String[] patternWords = readStringArrayParam(PARAM_WORDS_IN_PATTERN);
    wordsInPattern = toSet(patternWords);
    if (iv_logger.isInfoEnabled()) {
      iv_logger.info("Loaded list of " + patternWords.length
          + " pattern words for adjustment.");
    }

    Boolean useSegmentsValue = (Boolean) context.getConfigParameterValue("UseSegments");
    useSegments = useSegmentsValue != null && useSegmentsValue.booleanValue();

    skipSegmentsSet = toSet(readStringArrayParam("SegmentsToSkip"));

    if (iv_logger.isInfoEnabled()) {
      iv_logger.info("List of words to ignore during adjustment:");
      // Iterate the set itself: duplicates in the configured array make the set
      // smaller than the array, so indexing by the array length would overrun.
      for (String word : wordsToIgnore) {
        iv_logger.info(" " + word);
      }
    }
  }

  /**
   * Reads a String-array configuration parameter from the context.
   *
   * @param paramName name of the configuration parameter
   * @return the configured values, or an empty array when the parameter has no value
   */
  private String[] readStringArrayParam(String paramName) {
    String[] values = (String[]) context.getConfigParameterValue(paramName);
    if (values == null) {
      iv_logger.warn("Configuration parameter '" + paramName
          + "' has no value; using an empty list.");
      return new String[0];
    }
    return values;
  }

  /**
   * Copies the given values into a new mutable set.
   */
  private static Set<String> toSet(String[] values) {
    Set<String> result = new HashSet<String>();
    for (String value : values) {
      result.add(value);
    }
    return result;
  }

  /**
   * Invokes this annotator's analysis logic. Invoked for each document
   * processed.
   *
   * <p>When segments are not used, the full document text is one range.
   * Otherwise each {@link Segment} whose id is not in the skip set is processed
   * as its own range.
   *
   * @param jcas the CAS view to adjust
   * @throws AnalysisEngineProcessException if adjusting a range fails
   */
  @Override
  public void process(final JCas jcas) throws AnalysisEngineProcessException {
    String text = jcas.getDocumentText();
    try {
      iv_logger.info(" jcas " + jcas.getViewName());
      if (!useSegments) {
        // annotate over full doc text; a view without text has length 0
        int docLength = (text == null) ? 0 : text.length();
        annotateRange(jcas, text, 0, docLength);
        return;
      }
      JFSIndexRepository indexes = jcas.getJFSIndexRepository();
      Iterator<?> segmentItr = indexes.getAnnotationIndex(Segment.type).iterator();
      while (segmentItr.hasNext()) {
        Segment segmentAnnotation = (Segment) segmentItr.next();
        if (skipSegmentsSet.contains(segmentAnnotation.getId())) {
          continue;
        }
        annotateRange(jcas, text, segmentAnnotation.getBegin(), segmentAnnotation.getEnd());
      }
    } catch (AnnotatorContextException e) {
      // Propagate instead of printing the stack trace and carrying on.
      throw new AnalysisEngineProcessException(e);
    }
  }

  /**
   * A utility method that annotates a given range: merges qualifying sentence
   * pairs that lie inside {@code [rangeBegin, rangeEnd]}.
   *
   * @param jcas the CAS view whose Sentence annotations are adjusted
   * @param text the document text (not consulted by this implementation)
   * @param rangeBegin begin offset of the range, inclusive
   * @param rangeEnd end offset of the range
   * @throws AnnotatorContextException declared for overriding subclasses; not
   *         thrown by this implementation
   */
  protected void annotateRange(JCas jcas, String text, int rangeBegin,
      int rangeEnd) throws AnnotatorContextException {
    if (iv_logger.isInfoEnabled()) {
      iv_logger.info("started Sentence merging process.");
    }
    JFSIndexRepository indexes = jcas.getJFSIndexRepository();
    Iterator<?> sentItr = indexes.getAnnotationIndex(Sentence.type).iterator();
    // Snapshot first so the indexes can be modified safely while we walk the pairs.
    Map<Integer, Sentence> sentences = getSentencesOrderById(sentItr);

    Sentence prevSent = null;
    for (Sentence currSent : sentences.values()) {
      if (prevSent != null && shouldMerge(prevSent, currSent, rangeBegin, rangeEnd)) {
        mergeInto(prevSent, currSent);
        // currSent is no longer indexed; never let it act as the "previous"
        // sentence, or the next sentence could be absorbed into a removed one.
        prevSent = null;
      } else {
        prevSent = currSent;
      }
    }
  }

  /**
   * Decides whether {@code currSent} should be merged into {@code prevSent}.
   */
  private boolean shouldMerge(Sentence prevSent, Sentence currSent,
      int rangeBegin, int rangeEnd) {
    // Both sentences must lie inside the range being processed.
    if (prevSent.getBegin() < rangeBegin || currSent.getEnd() > rangeEnd) {
      return false;
    }
    // Only directly consecutive sentences are candidates.
    if (currSent.getSentenceNumber() - 1 != prevSent.getSentenceNumber()) {
      return false;
    }
    String prevText = prevSent.getCoveredText();
    String currText = currSent.getCoveredText();
    if (prevText == null || currText == null || !prevText.endsWith(":")) {
      return false;
    }
    // Locale.ROOT keeps the lower-casing independent of the JVM default locale.
    String loweredCurrText = currText.toLowerCase(Locale.ROOT);
    for (String word : wordsInPattern) {
      if (loweredCurrText.startsWith(word)) {
        return true; // first match is enough
      }
    }
    return false;
  }

  /**
   * Extends {@code survivor} to the end of {@code absorbed} and removes
   * {@code absorbed} from the indexes.
   */
  private void mergeInto(Sentence survivor, Sentence absorbed) {
    // The end offset is changed while the annotation is out of the indexes,
    // then the annotation is re-added.
    survivor.removeFromIndexes();
    survivor.setEnd(absorbed.getEnd());
    survivor.addToIndexes();
    absorbed.removeFromIndexes();
  }

  /**
   * Collects the sentences keyed by sentence number, in ascending key order.
   * If two sentences share a number, the one returned later by the iterator wins.
   *
   * @param sentItr iterator over Sentence annotations
   * @return sentences sorted by sentence number
   */
  private Map<Integer, Sentence> getSentencesOrderById(Iterator<?> sentItr) {
    Map<Integer, Sentence> sentences = new TreeMap<Integer, Sentence>();
    while (sentItr.hasNext()) {
      Sentence sa = (Sentence) sentItr.next();
      sentences.put(sa.getSentenceNumber(), sa);
    }
    return sentences;
  }
}