ctakes-smoking-status/src/main/java/org/apache/ctakes/smokingstatus/ae/SentenceAdjuster.java - ctakes - Git at Google

 /**
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */
 package org.apache.ctakes.smokingstatus.ae;

 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.Iterator;
 import java.util.Map;
 import java.util.Set;

 import org.apache.log4j.Logger;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
 import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
 import org.apache.uima.jcas.JFSIndexRepository;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.resource.ResourceInitializationException;

 import org.apache.ctakes.typesystem.type.textspan.Segment;
 import org.apache.ctakes.typesystem.type.textspan.Sentence;

 /**
  * UIMA annotator that uses some patterns and some rules about those patterns to
  * adjust certain annotations.
  *
  * This annotator was extended to handle sentence
  * boundaries for the Smoking status classification: Example: "Tobacco: none"
  * has two sentences as detected by the sentence boundary detector. This
  * annotator merges them into one sentence to enable correct negation detection.
  *
  */
 public class SentenceAdjuster extends JCasAnnotator_ImplBase {
 	// TODO @ConfigurationParam
 	/**
 	 * Name of the configuration parameter holding the list of words
 	 * ("and" "&") to ignore in pattern matching.
 	 */
 	public static final String PARAM_IGNORE_WORDS = "WordsToIgnore";

 	/**
 	 * Name of the configuration parameter holding the list of words
 	 * ("none", "no", etc) used in the pattern.
 	 */
 	public static final String PARAM_WORDS_IN_PATTERN = "WordsInPattern";

 	// LOG4J logger based on class name
 	public Logger iv_logger = Logger.getLogger(getClass().getName());


 	/**
 	 * Performs initialization logic. This implementation just reads values for
 	 * the configuration parameters. This method is not invoked for every
 	 * document processed.
 	 *
 	 * @param aContext
 	 *            the UIMA context the configuration parameters are read from
 	 * @throws ResourceInitializationException
 	 *             if the superclass fails to initialize or the configuration
 	 *             cannot be read
 	 */
 	@Override
 	public void initialize( final UimaContext aContext ) throws ResourceInitializationException {
 		super.initialize( aContext );

 		context = aContext;
 		try {
 			configInit();
 		} catch (AnnotatorContextException e) {
 			// Report the failure to the framework instead of printing it and
 			// carrying on with a partially configured annotator.
 			throw new ResourceInitializationException(e);
 		}
 	}

 	/**
 	 * Sets configuration parameters with values from the descriptor. A
 	 * parameter that has no value is treated as an empty list (or as
 	 * {@code false} for "UseSegments").
 	 */
 	private void configInit() throws AnnotatorContextException {
 		// populate the set of words that we will ignore when pattern matching
 		// TODO @ConfigurationParam
 		String[] ignoreWords = getStringArrayParameter(PARAM_IGNORE_WORDS);
 		wordsToIgnore = new HashSet<String>();
 		for (String word : ignoreWords) {
 			wordsToIgnore.add(word);
 		}

 		if (iv_logger.isInfoEnabled()) {
 			iv_logger.info("Loaded list of " + ignoreWords.length
 					+ " words to ignore during adjustment.");
 		}

 		// populate the set of the laterality or tobacco-related words
 		String[] patternWords = getStringArrayParameter(PARAM_WORDS_IN_PATTERN);
 		wordsInPattern = new HashSet<String>();
 		for (String word : patternWords) {
 			wordsInPattern.add(word);
 		}

 		if (iv_logger.isInfoEnabled()) {
 			iv_logger.info("Loaded list of " + patternWords.length
 					+ " pattern words for adjustment.");
 		}

 		Boolean useSegmentsValue = (Boolean) context
 				.getConfigParameterValue("UseSegments");
 		useSegments = useSegmentsValue != null && useSegmentsValue.booleanValue();

 		String[] skipSegmentIDs = getStringArrayParameter("SegmentsToSkip");
 		skipSegmentsSet = new HashSet<String>();
 		for (String segmentID : skipSegmentIDs) {
 			skipSegmentsSet.add(segmentID);
 		}

 		if (iv_logger.isInfoEnabled()) {
 			iv_logger.info("List of words to ignore during adjustment:");
 			// Iterate the set itself: it can be smaller than the configured
 			// array when the descriptor lists a word more than once.
 			for (String word : wordsToIgnore) {
 				iv_logger.info("  " + word);
 			}
 		}
 	}

 	/**
 	 * Reads a multi-valued String configuration parameter.
 	 *
 	 * @param name
 	 *            the parameter name
 	 * @return the configured values, or an empty array when the parameter has
 	 *         no value
 	 */
 	private String[] getStringArrayParameter(String name) {
 		String[] values = (String[]) context.getConfigParameterValue(name);
 		return values == null ? new String[0] : values;
 	}

 	/**
 	 * Invokes this annotator's analysis logic. Invoked for each document
 	 * processed. Depending on the "UseSegments" setting, either the whole
 	 * document text or every segment whose id is not in the skip list is
 	 * handed to {@link #annotateRange(JCas, String, int, int)}.
 	 *
 	 * @param jcas
 	 *            the CAS view to adjust
 	 * @throws AnalysisEngineProcessException
 	 *             if adjusting a range fails
 	 */
 	@Override
 	public void process( final JCas jcas ) throws AnalysisEngineProcessException {
 		String text = jcas.getDocumentText();
 		try {
 			iv_logger.info(" jcas "+ jcas.getViewName());
 			if (!useSegments) {
 				// annotate over full doc text; a view without text has
 				// nothing to adjust
 				if (text != null) {
 					annotateRange(jcas, text, 0, text.length());
 				}
 			} else {
 				JFSIndexRepository indexes = jcas.getJFSIndexRepository();
 				Iterator<?> segmentItr = indexes.getAnnotationIndex(
 						Segment.type).iterator();
 				while (segmentItr.hasNext()) {
 					Segment segmentAnnotation = (Segment) segmentItr.next();
 					String segmentID = segmentAnnotation.getId();

 					if (!skipSegmentsSet.contains(segmentID)) {
 						int start = segmentAnnotation.getBegin();
 						int end = segmentAnnotation.getEnd();
 						annotateRange(jcas, text, start, end);
 					}
 				}
 			}
 		} catch (AnnotatorContextException e) {
 			// Surface the failure to the pipeline instead of printing it.
 			throw new AnalysisEngineProcessException(e);
 		}
 	}

 	/**
 	 * Merges sentence pairs that lie inside the given range: when a sentence
 	 * ends with ":" and the sentence with the next sentence number starts
 	 * (case-insensitively) with one of the configured pattern words, the first
 	 * sentence is extended to the end of the second and the second is removed
 	 * from the indexes.
 	 *
 	 * @param jcas
 	 *            the CAS view holding the Sentence annotations
 	 * @param text
 	 *            the document text (not used by this implementation)
 	 * @param rangeBegin
 	 *            begin offset of the range; sentences starting before it are
 	 *            left alone
 	 * @param rangeEnd
 	 *            end offset of the range; sentences ending after it are left
 	 *            alone
 	 */
 	protected void annotateRange(JCas jcas, String text, int rangeBegin,
 			int rangeEnd) throws AnnotatorContextException {
 		if (iv_logger.isInfoEnabled()) {
 			iv_logger.info("started Sentence merging process.");
 		}

 		JFSIndexRepository indexes = jcas.getJFSIndexRepository();
 		Iterator<?> sentItr = indexes.getAnnotationIndex(Sentence.type)
 				.iterator();

 		// The index iterator is fully consumed here, so the indexes can be
 		// modified safely in the loop below.
 		Map<Integer, Sentence> sentences = getSentencesOrderById(sentItr);

 		Sentence prevSent = null;
 		for (Sentence currSent : sentences.values()) {
 			// Only sentences that lie completely inside the requested range
 			// take part; anything else also breaks the "previous" chain.
 			if (currSent.getBegin() < rangeBegin || currSent.getEnd() > rangeEnd) {
 				prevSent = null;
 				continue;
 			}

 			if (prevSent != null && shouldMerge(prevSent, currSent)) {
 				mergeInto(prevSent, currSent);
 				// prevSent stays the surviving (merged) sentence so that the
 				// removed annotation is never touched again.
 				continue;
 			}

 			prevSent = currSent; // got to have 2 sentences
 		}
 	}

 	/**
 	 * Tells whether currSent has to be appended to prevSent: prevSent ends
 	 * with ":", currSent carries the next sentence number and its lower-cased
 	 * text starts with one of the pattern words.
 	 */
 	private boolean shouldMerge(Sentence prevSent, Sentence currSent) {
 		if (!prevSent.getCoveredText().endsWith(":")) {
 			return false;
 		}
 		if ((currSent.getSentenceNumber() - 1) != prevSent.getSentenceNumber()) {
 			return false;
 		}

 		String textSent2 = currSent.getCoveredText().toLowerCase();
 		for (String word : wordsInPattern) {
 			if (textSent2.startsWith(word)) {
 				return true; // one matching word is enough
 			}
 		}
 		return false;
 	}

 	/**
 	 * Extends prevSent to the end of currSent and removes currSent from the
 	 * indexes.
 	 */
 	private void mergeInto(Sentence prevSent, Sentence currSent) {
 		int newEnd = currSent.getEnd();

 		// The end offset is part of the annotation index sort key, so the
 		// sentence is taken out of the indexes while it is changed.
 		prevSent.removeFromIndexes();
 		prevSent.setEnd(newEnd);
 		prevSent.addToIndexes();

 		currSent.removeFromIndexes();
 	}

 	/**
 	 * Collects the sentences delivered by the iterator into a map keyed by
 	 * sentence number.
 	 *
 	 * @param sentItr
 	 *            iterator over Sentence annotations
 	 * @return the sentences, iterated in ascending sentence-number order; when
 	 *         two sentences share a number the one seen last wins
 	 */
 	private Map<Integer, Sentence> getSentencesOrderById(Iterator<?> sentItr) {
 		// A sorted map keeps the order correct even when sentence numbers
 		// have gaps (for example after an earlier merge removed a sentence).
 		Map<Integer, Sentence> sentences = new java.util.TreeMap<Integer, Sentence>();
 		while (sentItr.hasNext()) {
 			Sentence sa = (Sentence) sentItr.next();
 			sentences.put(Integer.valueOf(sa.getSentenceNumber()), sa);
 		}

 		return sentences;
 	}

 	// UIMA context captured in initialize(); source of all parameters
 	private UimaContext context;

 	// words read from PARAM_IGNORE_WORDS (currently only logged)
 	private HashSet<String> wordsToIgnore;
 	// words read from PARAM_WORDS_IN_PATTERN; a following sentence must start
 	// with one of them to be merged
 	private HashSet<String> wordsInPattern;

 	// true: process segment by segment; false: process the whole document
 	private boolean useSegments;
 	// ids of the segments that are not processed when useSegments is true
 	private Set<String> skipSegmentsSet;
 }
	/**
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/
	package org.apache.ctakes.smokingstatus.ae;

	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.Iterator;
	import java.util.Map;
	import java.util.Set;

	import org.apache.log4j.Logger;
	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
	import org.apache.uima.analysis_engine.annotator.AnnotatorContextException;
	import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
	import org.apache.uima.jcas.JFSIndexRepository;
	import org.apache.uima.jcas.JCas;
	import org.apache.uima.resource.ResourceInitializationException;

	import org.apache.ctakes.typesystem.type.textspan.Segment;
	import org.apache.ctakes.typesystem.type.textspan.Sentence;

	/**
	* UIMA annotator that uses some patterns and some rules about those patterns to
	* adjust certain annotations.
	*
	* This annotator was extended to handle sentence
	* boundaries for the Smoking status classification: Example: "Tobacco: none"
	* has two sentences as detected by the sentence boundary detector. This
	* annotator merges them into one sentence to enable correct negation detection.
	*
	*/
	public class SentenceAdjuster extends JCasAnnotator_ImplBase {
		// TODO @ConfigurationParam
		/**
		 * Name of the configuration parameter holding the list of words
		 * ("and" "&") to ignore in pattern matching.
		 */
		public static final String PARAM_IGNORE_WORDS = "WordsToIgnore";

		/**
		 * Name of the configuration parameter holding the list of words
		 * ("none", "no", etc) used in the pattern.
		 */
		public static final String PARAM_WORDS_IN_PATTERN = "WordsInPattern";

		// LOG4J logger based on class name
		public Logger iv_logger = Logger.getLogger(getClass().getName());

		/**
		 * Performs initialization logic. This implementation just reads values
		 * for the configuration parameters. This method is not invoked for
		 * every document processed.
		 *
		 * @param aContext
		 *            the UIMA context the configuration parameters are read from
		 * @throws ResourceInitializationException
		 *             if the superclass fails to initialize or the
		 *             configuration cannot be read
		 */
		@Override
		public void initialize( final UimaContext aContext ) throws ResourceInitializationException {
			super.initialize( aContext );

			context = aContext;
			try {
				configInit();
			} catch (AnnotatorContextException e) {
				// Report the failure to the framework instead of printing it
				// and carrying on with a partially configured annotator.
				throw new ResourceInitializationException(e);
			}
		}

		/**
		 * Sets configuration parameters with values from the descriptor. A
		 * parameter that has no value is treated as an empty list (or as
		 * {@code false} for "UseSegments").
		 */
		private void configInit() throws AnnotatorContextException {
			// populate the set of words that we will ignore when pattern
			// matching
			// TODO @ConfigurationParam
			String[] ignoreWords = getStringArrayParameter(PARAM_IGNORE_WORDS);
			wordsToIgnore = new HashSet<String>();
			for (String word : ignoreWords) {
				wordsToIgnore.add(word);
			}

			if (iv_logger.isInfoEnabled()) {
				iv_logger.info("Loaded list of " + ignoreWords.length
						+ " words to ignore during adjustment.");
			}

			// populate the set of the laterality or tobacco-related words
			String[] patternWords = getStringArrayParameter(PARAM_WORDS_IN_PATTERN);
			wordsInPattern = new HashSet<String>();
			for (String word : patternWords) {
				wordsInPattern.add(word);
			}

			if (iv_logger.isInfoEnabled()) {
				iv_logger.info("Loaded list of " + patternWords.length
						+ " pattern words for adjustment.");
			}

			Boolean useSegmentsValue = (Boolean) context
					.getConfigParameterValue("UseSegments");
			useSegments = useSegmentsValue != null && useSegmentsValue.booleanValue();

			String[] skipSegmentIDs = getStringArrayParameter("SegmentsToSkip");
			skipSegmentsSet = new HashSet<String>();
			for (String segmentID : skipSegmentIDs) {
				skipSegmentsSet.add(segmentID);
			}

			if (iv_logger.isInfoEnabled()) {
				iv_logger.info("List of words to ignore during adjustment:");
				// Iterate the set itself: it can be smaller than the
				// configured array when the descriptor lists a word more
				// than once.
				for (String word : wordsToIgnore) {
					iv_logger.info(" " + word);
				}
			}
		}

		/**
		 * Reads a multi-valued String configuration parameter.
		 *
		 * @param name
		 *            the parameter name
		 * @return the configured values, or an empty array when the parameter
		 *         has no value
		 */
		private String[] getStringArrayParameter(String name) {
			String[] values = (String[]) context.getConfigParameterValue(name);
			return values == null ? new String[0] : values;
		}

		/**
		 * Invokes this annotator's analysis logic. Invoked for each document
		 * processed. Depending on the "UseSegments" setting, either the whole
		 * document text or every segment whose id is not in the skip list is
		 * handed to {@link #annotateRange(JCas, String, int, int)}.
		 *
		 * @param jcas
		 *            the CAS view to adjust
		 * @throws AnalysisEngineProcessException
		 *             if adjusting a range fails
		 */
		@Override
		public void process( final JCas jcas ) throws AnalysisEngineProcessException {
			String text = jcas.getDocumentText();
			try {
				iv_logger.info(" jcas "+ jcas.getViewName());
				if (!useSegments) {
					// annotate over full doc text; a view without text has
					// nothing to adjust
					if (text != null) {
						annotateRange(jcas, text, 0, text.length());
					}
				} else {
					JFSIndexRepository indexes = jcas.getJFSIndexRepository();
					Iterator<?> segmentItr = indexes.getAnnotationIndex(
							Segment.type).iterator();
					while (segmentItr.hasNext()) {
						Segment segmentAnnotation = (Segment) segmentItr.next();
						String segmentID = segmentAnnotation.getId();

						if (!skipSegmentsSet.contains(segmentID)) {
							int start = segmentAnnotation.getBegin();
							int end = segmentAnnotation.getEnd();
							annotateRange(jcas, text, start, end);
						}
					}
				}
			} catch (AnnotatorContextException e) {
				// Surface the failure to the pipeline instead of printing it.
				throw new AnalysisEngineProcessException(e);
			}
		}

		/**
		 * Merges sentence pairs that lie inside the given range: when a
		 * sentence ends with ":" and the sentence with the next sentence
		 * number starts (case-insensitively) with one of the configured
		 * pattern words, the first sentence is extended to the end of the
		 * second and the second is removed from the indexes.
		 *
		 * @param jcas
		 *            the CAS view holding the Sentence annotations
		 * @param text
		 *            the document text (not used by this implementation)
		 * @param rangeBegin
		 *            begin offset of the range; sentences starting before it
		 *            are left alone
		 * @param rangeEnd
		 *            end offset of the range; sentences ending after it are
		 *            left alone
		 */
		protected void annotateRange(JCas jcas, String text, int rangeBegin,
				int rangeEnd) throws AnnotatorContextException {
			if (iv_logger.isInfoEnabled()) {
				iv_logger.info("started Sentence merging process.");
			}

			JFSIndexRepository indexes = jcas.getJFSIndexRepository();
			Iterator<?> sentItr = indexes.getAnnotationIndex(Sentence.type)
					.iterator();

			// The index iterator is fully consumed here, so the indexes can
			// be modified safely in the loop below.
			Map<Integer, Sentence> sentences = getSentencesOrderById(sentItr);

			Sentence prevSent = null;
			for (Sentence currSent : sentences.values()) {
				// Only sentences that lie completely inside the requested
				// range take part; anything else also breaks the "previous"
				// chain.
				if (currSent.getBegin() < rangeBegin || currSent.getEnd() > rangeEnd) {
					prevSent = null;
					continue;
				}

				if (prevSent != null && shouldMerge(prevSent, currSent)) {
					mergeInto(prevSent, currSent);
					// prevSent stays the surviving (merged) sentence so that
					// the removed annotation is never touched again.
					continue;
				}

				prevSent = currSent; // got to have 2 sentences
			}
		}

		/**
		 * Tells whether currSent has to be appended to prevSent: prevSent ends
		 * with ":", currSent carries the next sentence number and its
		 * lower-cased text starts with one of the pattern words.
		 */
		private boolean shouldMerge(Sentence prevSent, Sentence currSent) {
			if (!prevSent.getCoveredText().endsWith(":")) {
				return false;
			}
			if ((currSent.getSentenceNumber() - 1) != prevSent.getSentenceNumber()) {
				return false;
			}

			String textSent2 = currSent.getCoveredText().toLowerCase();
			for (String word : wordsInPattern) {
				if (textSent2.startsWith(word)) {
					return true; // one matching word is enough
				}
			}
			return false;
		}

		/**
		 * Extends prevSent to the end of currSent and removes currSent from
		 * the indexes.
		 */
		private void mergeInto(Sentence prevSent, Sentence currSent) {
			int newEnd = currSent.getEnd();

			// The end offset is part of the annotation index sort key, so the
			// sentence is taken out of the indexes while it is changed.
			prevSent.removeFromIndexes();
			prevSent.setEnd(newEnd);
			prevSent.addToIndexes();

			currSent.removeFromIndexes();
		}

		/**
		 * Collects the sentences delivered by the iterator into a map keyed by
		 * sentence number.
		 *
		 * @param sentItr
		 *            iterator over Sentence annotations
		 * @return the sentences, iterated in ascending sentence-number order;
		 *         when two sentences share a number the one seen last wins
		 */
		private Map<Integer, Sentence> getSentencesOrderById(Iterator<?> sentItr) {
			// A sorted map keeps the order correct even when sentence numbers
			// have gaps (for example after an earlier merge removed a
			// sentence).
			Map<Integer, Sentence> sentences = new java.util.TreeMap<Integer, Sentence>();
			while (sentItr.hasNext()) {
				Sentence sa = (Sentence) sentItr.next();
				sentences.put(Integer.valueOf(sa.getSentenceNumber()), sa);
			}

			return sentences;
		}

		// UIMA context captured in initialize(); source of all parameters
		private UimaContext context;

		// words read from PARAM_IGNORE_WORDS (currently only logged)
		private HashSet<String> wordsToIgnore;
		// words read from PARAM_WORDS_IN_PATTERN; a following sentence must
		// start with one of them to be merged
		private HashSet<String> wordsInPattern;

		// true: process segment by segment; false: process the whole document
		private boolean useSegments;
		// ids of the segments that are not processed when useSegments is true
		private Set<String> skipSegmentsSet;
	}