blob: 7cbf809d9c524576baf8fcb87a304ebbacfa47a1 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
/**
*
*/
package org.apache.ctakes.smokingstatus.ae;
import java.io.BufferedReader;
import java.io.File;
import java.io.FileReader;
import java.io.IOException;
import java.util.ArrayList;
import java.util.HashMap;
import java.util.HashSet;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.StringTokenizer;
import org.apache.log4j.Logger;
import org.apache.uima.UIMAFramework;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.analysis_engine.annotator.AnnotatorProcessException;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.fit.component.JCasAnnotator_ImplBase;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.JFSIndexRepository;
import org.apache.uima.jcas.cas.TOP;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;
import org.apache.uima.resource.ResourceProcessException;
import org.apache.uima.resource.ResourceSpecifier;
import org.apache.uima.util.CasCreationUtils;
import org.apache.uima.util.XMLInputSource;
import org.apache.ctakes.smokingstatus.i2b2.type.RecordSentence;
import org.apache.ctakes.smokingstatus.type.SmokingDocumentClassification;
import org.apache.ctakes.core.resource.FileResource;
import org.apache.ctakes.smokingstatus.Const;
import org.apache.ctakes.smokingstatus.util.ClassifiableEntry;
import org.apache.ctakes.smokingstatus.util.TruthValue;
import org.apache.ctakes.typesystem.type.structured.DocumentID;
import org.apache.ctakes.typesystem.type.textspan.Segment;
import org.apache.ctakes.typesystem.type.textspan.Sentence;
import org.apache.ctakes.smokingstatus.type.libsvm.NominalAttributeValue;
/**
 * Annotator that generates ClassifiableEntry objects (one per non-ignored
 * sentence) and runs them through the two-step SmokingStatus classification
 * pipeline, producing a single document-level classification.
 *
 * @author Mayo Clinic
 */
public class ClassifiableEntries extends JCasAnnotator_ImplBase {

    // TODO @ConfigurationParameter all of these parameters.

    /**
     * Name of configuration parameter that must be set to the filepath of the
     * UIMA descriptor ProductionPostSentenceAggregate.xml
     */
    public static final String PARAM_SMOKING_STATUS_DESC_STEP1KEY = "UimaDescriptorStep1Key";
    public static final String PARAM_SMOKING_STATUS_DESC_STEP2KEY = "UimaDescriptorStep2Key";

    /**
     * Name of configuration parameter that must be set to the filepath of the
     * delimited truth file. This is optional.
     */
    public static final String PARAM_TRUTH_FILE = "TruthFile";

    /**
     * Name of configuration parameter that describes the character delimiter
     * used in the delimited truth file. This is optional; TAB is the default.
     */
    public static final String PARAM_TRUTH_FILE_DELIMITER = "TruthFileDelimiter";

    /**
     * Name of configuration parameter that determines the allowed
     * Classification values. This is optional and only gets used if a Truth
     * file is specified.
     */
    public static final String PARAM_ALLOWED_CLASSES = "AllowedClassifications";

    /**
     * Name of configuration parameter that determines whether section headers
     * will be parsed out and Segments made to cover the section text.
     */
    public static final String PARAM_PARSE_SECTIONS = "ParseSections";

    /**
     * Sections NOT to be entered in ClassifiableEntries
     */
    public static final String PARAM_IGNORE_SECTIONS = "SectionsToIgnore";

    /**
     * Loads the optional truth data, parses the two delegate analysis-engine
     * descriptors, produces the step-1/step-2 engines plus a reusable local
     * CAS, and reads the set of section ids to ignore.
     *
     * @param aContext the UIMA context supplying resources and parameters
     * @throws ResourceInitializationException wrapping any failure during setup
     */
    public void initialize( final UimaContext aContext ) throws ResourceInitializationException {
        super.initialize( aContext );
        try {
            ResMgr = UIMAFramework.newDefaultResourceManager();
            iv_procEntryList = new ArrayList<ClassifiableEntry>();
            iv_entryIndexMap = new HashMap<String, List<ClassifiableEntry>>();
            iv_segList = new ArrayList<Segment>();

            // load (optional) truth data
            initTruthData();

            // load the two delegate TAEs from their XML descriptors
            FileResource fResrc = (FileResource) aContext.getResourceObject(PARAM_SMOKING_STATUS_DESC_STEP1KEY);
            File descFile = fResrc.getFile();
            taeSpecifierStep1 = UIMAFramework.getXMLParser().parseResourceSpecifier(
                    new XMLInputSource(descFile));

            fResrc = (FileResource) aContext.getResourceObject(PARAM_SMOKING_STATUS_DESC_STEP2KEY);
            descFile = fResrc.getFile();
            taeSpecifierStep2 = UIMAFramework.getXMLParser().parseResourceSpecifier(
                    new XMLInputSource(descFile));

            ra = new ResolutionAnnotator();
            ra.initialize(aContext);

            if (iv_logger.isInfoEnabled())
                iv_logger.info("descFile " + descFile.getAbsolutePath());

            taeStep1 = UIMAFramework.produceAnalysisEngine(taeSpecifierStep1, ResMgr, null);
            taeStep2 = UIMAFramework.produceAnalysisEngine(taeSpecifierStep2, ResMgr, null);
            // Local CAS reused (reset) for each sentence processed.
            jcas_local = CasCreationUtils.createCas(taeStep1.getAnalysisEngineMetaData()).getJCas();

            // get sections to ignore
            String[] sections = (String[]) getContext().getConfigParameterValue(PARAM_IGNORE_SECTIONS);
            sectionsToIgnore = new HashSet<String>();
            for (int i = 0; i < sections.length; i++)
                sectionsToIgnore.add(sections[i]);
        } catch (Exception e) {
            throw new ResourceInitializationException(e);
        }
    }

    /**
     * If a truth file is configured, loads it and validates the configured
     * allowed-classification values against the known Const classes.
     *
     * @throws Exception if an allowed-classification value is not recognized
     */
    private void initTruthData() throws Exception {
        String truthFilePath = (String) getContext().getConfigParameterValue(
                PARAM_TRUTH_FILE);
        if (truthFilePath != null && truthFilePath.length() > 0) {
            // Honor the configured delimiter; fall back to TAB when unset.
            String delimiter = (String) getContext().getConfigParameterValue(
                    PARAM_TRUTH_FILE_DELIMITER);
            if (delimiter == null || delimiter.length() == 0) {
                delimiter = "\t";
            }
            File truthFile = new File(truthFilePath);
            loadTruthData(truthFile, delimiter);

            String[] allowedArr = (String[]) getContext()
                    .getConfigParameterValue(PARAM_ALLOWED_CLASSES);
            iv_allowedClassifications = new HashSet<String>();
            for (int i = 0; i < allowedArr.length; i++) {
                String classification = allowedArr[i];
                if (classification.equals(Const.CLASS_CURR_SMOKER)
                        || classification.equals(Const.CLASS_NON_SMOKER)
                        || classification.equals(Const.CLASS_PAST_SMOKER)
                        || classification.equals(Const.CLASS_SMOKER)
                        || classification.equals(Const.CLASS_UNKNOWN)) {
                    iv_allowedClassifications.add(classification);
                } else {
                    throw new Exception(
                            "Invalid classification value for param "
                                    + PARAM_ALLOWED_CLASSES + ":"
                                    + classification);
                }
            }
        }
    }

    /**
     * Parses the TRUTH file in delimited format. Stores data in maps.
     * Each valid line has four tokens: recordID, truth value, sentence, and a
     * section id (the section token is currently unused).
     *
     * @param truthFile delimited truth data file
     * @param delimiter token delimiter used within each line
     * @throws Exception if a truth value is not one of the recognized labels
     */
    private void loadTruthData(File truthFile, String delimiter)
            throws Exception {
        iv_truthMap = new HashMap<Integer, TruthValue>();
        BufferedReader br = new BufferedReader(new FileReader(truthFile));
        try {
            int lineNum = 1;
            String line = br.readLine();
            while (line != null) {
                StringTokenizer st = new StringTokenizer(line, delimiter);
                if (st.countTokens() == 4) {
                    Integer recordID = Integer.valueOf(st.nextToken().trim());
                    String truthVal = st.nextToken().trim();
                    String sentence = st.nextToken().trim();
                    // fourth token (section) intentionally not consumed
                    String ssClass = null;
                    if (truthVal.equals("CURRENT SMOKER")) {
                        ssClass = Const.CLASS_CURR_SMOKER;
                    } else if (truthVal.equals("PAST SMOKER")) {
                        ssClass = Const.CLASS_PAST_SMOKER;
                    } else if (truthVal.equals("SMOKER")) {
                        ssClass = Const.CLASS_SMOKER;
                    } else if (truthVal.equals("NON-SMOKER")) {
                        ssClass = Const.CLASS_NON_SMOKER;
                    } else if (truthVal.equals("UNKNOWN")) {
                        ssClass = Const.CLASS_UNKNOWN;
                    } else {
                        throw new Exception("Invalid truth value for line:" + line);
                    }
                    TruthValue tVal = iv_truthMap.get(recordID);
                    if (tVal == null) {
                        tVal = new TruthValue();
                        tVal.iv_sentenceList = new ArrayList<String>();
                        // classification is taken from the record's first line
                        tVal.iv_classification = ssClass;
                    }
                    tVal.iv_sentenceList.add(sentence);
                    iv_truthMap.put(recordID, tVal);
                } else {
                    iv_logger.warn("Malformed line " + lineNum + ": " + line);
                }
                line = br.readLine();
                lineNum++;
            }
        } finally {
            // close even when a malformed truth value aborts the load
            br.close();
        }
        if (iv_logger.isInfoEnabled())
            iv_logger.info("Truth data loaded for "
                    + iv_truthMap.size() + " records");
    }

    /**
     * Builds ClassifiableEntry objects for every sentence not contained in an
     * ignored section, then runs each entry through the step-1 engine, the
     * step-2 engine (only when step 1 did not classify it UNKNOWN), the
     * resolution annotator, and finally record-level resolution. The final
     * document classification is written back into the original CAS.
     *
     * @param jcas the CAS for the current document
     * @throws AnalysisEngineProcessException if any delegate engine fails
     */
    public void process( final JCas jcas ) throws AnalysisEngineProcessException {
        // cleanup per-document state
        iv_entryIndexMap.clear();
        iv_procEntryList.clear();
        iv_segList.clear();

        List<ClassifiableEntry> entryList = new ArrayList<ClassifiableEntry>();
        String recordID = null;
        // Look up the record ID unconditionally: entries need it regardless of
        // the logging level (previously this was gated on isInfoEnabled()).
        JFSIndexRepository indexes = jcas.getJFSIndexRepository();
        FSIterator<TOP> documentIDIterator = indexes.getAllIndexedFS(DocumentID.type);
        if (documentIDIterator.hasNext()) {
            DocumentID didAnn = (DocumentID) documentIDIterator.next();
            recordID = didAnn.getDocumentID();
            if (iv_logger.isInfoEnabled())
                iv_logger.info("Processing record [" + recordID + "]");
        }

        Iterator<?> sentItr = jcas.getJFSIndexRepository().getAnnotationIndex(
                Sentence.type).iterator();
        while (sentItr.hasNext()) {
            Sentence sentAnn = (Sentence) sentItr.next();
            // Skip any sentence fully covered by a segment whose id is listed
            // in sectionsToIgnore. (The sentence adjuster has a separate means
            // to check for skipped segments, so this is tested here as well;
            // see notes from 2-23-2009 and 9-8-2011.)
            boolean skip = false;
            Iterator<?> coveringSegItr = jcas.getJFSIndexRepository()
                    .getAnnotationIndex(Segment.type).iterator();
            while (coveringSegItr.hasNext() && !skip) {
                Segment segment = (Segment) coveringSegItr.next();
                if (segment.getBegin() <= sentAnn.getBegin()
                        && segment.getEnd() >= sentAnn.getEnd()
                        && sectionsToIgnore.contains(segment.getId()))
                    skip = true;
            }
            if (!skip) {
                ClassifiableEntry entry = new ClassifiableEntry();
                entry.iv_recordID = recordID;
                entry.iv_begin = sentAnn.getBegin();
                entry.iv_end = sentAnn.getEnd();
                entry.iv_text = sentAnn.getCoveredText();
                entryList.add(entry);
            }
        }

        // collect segment annotations for later sentence->segment lookup
        Iterator<?> segItr = jcas.getJFSIndexRepository().getAnnotationIndex(
                Segment.type).iterator();
        while (segItr.hasNext()) {
            iv_segList.add((Segment) segItr.next());
        }

        iv_entryIndexMap.put(recordID, entryList);
        buildProcEntryList();

        // cycle through the procEntryList to process one sentence at a time
        try {
            for (iv_classifiableIdx = 0; iv_classifiableIdx < iv_procEntryList
                    .size(); iv_classifiableIdx++) {
                jcas_local.reset();
                ClassifiableEntry entry = iv_procEntryList.get(iv_classifiableIdx);

                // add object to CAS that captures entry data
                RecordSentence rs = new RecordSentence(jcas_local);
                rs.setRecordID(entry.iv_recordID);
                /*
                 * To optimize processing, the pcs classifier step processes
                 * just the sentence that was classified as "Known" by
                 * KURuleBasedClassifier. Document text is thus restricted to
                 * the sentence text and NOT the complete document text, so
                 * begin/end cannot be copied from entry.iv_begin/iv_end.
                 */
                rs.setBegin(0);
                rs.setRecordTextBegin(0);
                rs.setEnd(entry.iv_text.length());
                rs.setRecordTextEnd(entry.iv_text.length());
                jcas_local.setDocumentText(entry.iv_text);

                // get segment for the sentence; assume its boundaries to be
                // those of the sentence in the local CAS
                Segment sa = getSegment(entry);
                if (sa != null) {
                    Segment copy_sa = new Segment(jcas_local);
                    copy_sa.setBegin(rs.getBegin());
                    copy_sa.setEnd(rs.getEnd());
                    copy_sa.setId(sa.getId());
                    copy_sa.addToIndexes();
                } else {
                    // log unconditionally: an error must not depend on the
                    // debug level being enabled
                    iv_logger.error("Invalid Segment for sentence ["
                            + rs.getCoveredText() + "]");
                }

                // assign classification value if applicable
                // (only happens when a Truth data file was specified)
                if (entry.iv_classification != null) {
                    rs.setClassification(entry.iv_classification);
                }
                rs.addToIndexes();

                taeStep1.process(jcas_local);
                if (isSmokingStatusKnown(jcas_local))
                    taeStep2.process(jcas_local);
                ra.process(jcas_local);
                performRecordResolution(jcas_local);
            }
            // final doc classification needs to be added to the original cas
            collectionProcessComplete(jcas);
        } catch (Exception aep) {
            // Propagate the failure to the framework instead of swallowing it
            // (the previous code caught its own rethrow and only printed it).
            throw new AnalysisEngineProcessException(aep);
        }
    }

    /** Releases the delegate analysis engines. */
    public void destroy() {
        super.destroy();
        taeStep1.destroy();
        taeStep2.destroy();
    }

    /**
     * Determines whether the smoking status assigned by KUClassifier is known.
     *
     * @param jcas_local CAS holding the NominalAttributeValue annotations
     * @return false when a "smoking_status" attribute has value "UNKNOWN"
     */
    private boolean isSmokingStatusKnown(JCas jcas_local) {
        boolean known = true;
        Iterator<?> nominalAttrItr = jcas_local.getJFSIndexRepository()
                .getAnnotationIndex(NominalAttributeValue.type).iterator();
        while (nominalAttrItr.hasNext()) {
            NominalAttributeValue nav = (NominalAttributeValue) nominalAttrItr
                    .next();
            if (nav.getAttributeName().equalsIgnoreCase("smoking_status")
                    && nav.getNominalValue().equalsIgnoreCase("UNKNOWN"))
                known = false;
        }
        return known;
    }

    /**
     * Finds the first collected segment that fully contains the entry's span.
     *
     * @param rs entry whose span is matched against the segment list
     * @return the containing segment, or null if none contains the span
     */
    private Segment getSegment(ClassifiableEntry rs) {
        for (int i = 0; i < iv_segList.size(); i++) {
            Segment sa = iv_segList.get(i);
            if (rs.iv_begin >= sa.getBegin() && rs.iv_end <= sa.getEnd())
                return sa;
        }
        return null;
    }

    /**
     * Tallies the sentence-level classification(s) present in the local CAS
     * (which represents a single sentence) into the record-level counters.
     *
     * @param jcas_local single-sentence CAS produced by the pipeline
     * @throws AnnotatorProcessException wrapping any lookup failure
     */
    private void performRecordResolution(JCas jcas_local)
            throws AnnotatorProcessException {
        try {
            // should be only one RecordSentence object produced by
            // the I2B2XmlReader collection reader
            Iterator<?> rsItr = jcas_local.getJFSIndexRepository()
                    .getAnnotationIndex(RecordSentence.type).iterator();
            if (rsItr.hasNext()) {
                // should be only one final NominalAttributeValue object
                // produced by the ResolutionAnnotator
                Iterator<?> navItr = jcas_local.getJFSIndexRepository()
                        .getAnnotationIndex(NominalAttributeValue.type)
                        .iterator();
                while (navItr.hasNext()) {
                    NominalAttributeValue nav = (NominalAttributeValue) navItr
                            .next();
                    storeAssignedClasses(nav.getNominalValue());
                }
            }
        } catch (Exception e) {
            throw new AnnotatorProcessException(e);
        }
    }

    /**
     * Resolves the per-sentence counters into a single document-level
     * classification, writes it into the CAS, and resets the counters.
     * For the production environment there is just one record per collection.
     *
     * @param jcas the original document CAS receiving the classification
     * @throws ResourceProcessException wrapping any failure
     * @throws IOException declared for interface compatibility
     */
    public void collectionProcessComplete(JCas jcas)
            throws ResourceProcessException, IOException {
        try {
            String finalClassification = resolveClassification();
            SmokingDocumentClassification docClass = new SmokingDocumentClassification(
                    jcas);
            docClass.addToIndexes();
            docClass.setClassification(finalClassification);
            resetCounts();
        } catch (Exception e) {
            throw new ResourceProcessException(e);
        }
    }

    /**
     * Given all the unique classifications for a given record, resolve it down
     * to a single final classification. Precedence (from the counter rules):
     * all-UNKNOWN beats everything; NON-SMOKER only when no smoker evidence;
     * CURRENT beats PAST beats generic SMOKER.
     *
     * @return the final class constant, or null when no rule matches
     */
    private String resolveClassification() {
        // If all sentences in a report are classified as UNKNOWN,
        // then mark that report as UNKNOWN.
        if (iUnknownCtr > 0 && iSmokerCtr == 0 && iPastSmokerCtr == 0
                && iCurrentCtr == 0 && iNonSmokerCtr == 0)
            return Const.CLASS_UNKNOWN;
        else if (iNonSmokerCtr >= 1 && iUnknownCtr >= 0 && iPastSmokerCtr == 0
                && iCurrentCtr == 0 && iSmokerCtr == 0)
            return Const.CLASS_NON_SMOKER;
        else if (iCurrentCtr >= 1)
            return Const.CLASS_CURR_SMOKER;
        else if (iPastSmokerCtr >= 1 && iCurrentCtr <= 0)
            return Const.CLASS_PAST_SMOKER;
        else if (iSmokerCtr >= 1 && iCurrentCtr <= 0 && iPastSmokerCtr <= 0)
            return Const.CLASS_SMOKER;
        else
            return null;
    }

    /** Increments the counter matching the given sentence classification. */
    private void storeAssignedClasses(String smokClass) {
        if (smokClass.equals(Const.CLASS_CURR_SMOKER))
            iCurrentCtr++;
        else if (smokClass.equals(Const.CLASS_NON_SMOKER))
            iNonSmokerCtr++;
        else if (smokClass.equals(Const.CLASS_PAST_SMOKER))
            iPastSmokerCtr++;
        else if (smokClass.equals(Const.CLASS_SMOKER))
            iSmokerCtr++;
        else if (smokClass.equals(Const.CLASS_UNKNOWN))
            iUnknownCtr++;
    }

    /** Resets all per-record classification counters to zero. */
    private void resetCounts() {
        iSmokerCtr = 0;
        iPastSmokerCtr = 0;
        iCurrentCtr = 0;
        iNonSmokerCtr = 0;
        iUnknownCtr = 0;
    }

    /**
     * Assembles the final list of entries that will be considered part of the
     * collection to process, filtering by the allowed classifications when a
     * truth file was configured.
     */
    private void buildProcEntryList() {
        int allowedCnt = 0;
        int disallowedCnt = 0;
        Iterator<String> recItr = iv_entryIndexMap.keySet().iterator();
        while (recItr.hasNext()) {
            String recordID = recItr.next();
            Iterator<ClassifiableEntry> entryItr = iv_entryIndexMap.get(recordID)
                    .iterator();
            while (entryItr.hasNext()) {
                ClassifiableEntry entry = entryItr.next();
                if ((iv_allowedClassifications == null)
                        || (iv_allowedClassifications
                                .contains(entry.iv_classification))) {
                    iv_procEntryList.add(entry);
                    allowedCnt++;
                } else {
                    if (iv_logger.isInfoEnabled())
                        iv_logger.info("disallowed value:"
                                + entry.iv_classification);
                    disallowedCnt++;
                }
            }
        }
        int totalCnt = allowedCnt + disallowedCnt;
        if (iv_logger.isInfoEnabled()) {
            iv_logger.info("# total sentences: " + totalCnt);
            iv_logger.info("# allowed sentences: " + allowedCnt);
            iv_logger.info("# disallowed sentences: " + disallowedCnt);
        }
    }

    // index into the annotations contained in a file; value increments as
    // entries are consumed
    private int iv_classifiableIdx;

    // list of ClassifiableEntry objects to be processed
    private List<ClassifiableEntry> iv_procEntryList;

    // list of segments collected from the current document
    private List<Segment> iv_segList;

    // Map used to index ALL ClassifiableEntry objects by their record ID
    // key = record ID, val = list of ClassifiableEntry objects
    private Map<String, List<ClassifiableEntry>> iv_entryIndexMap;

    // key = record ID (java.lang.Integer), val = TruthValue for that record
    private Map<Integer, TruthValue> iv_truthMap;

    // set of classification values of type java.lang.String
    // see org.apache.ctakes.smokingstatus.Const for proper String values
    private Set<String> iv_allowedClassifications;

    private AnalysisEngine taeStep1;
    private AnalysisEngine taeStep2;
    private ResourceSpecifier taeSpecifierStep1;
    private ResourceSpecifier taeSpecifierStep2;

    // LOG4J logger based on class name
    protected Logger iv_logger = Logger.getLogger(getClass().getName());

    // counters to track sentence classification
    private int iSmokerCtr;
    private int iPastSmokerCtr;
    private int iCurrentCtr;
    private int iNonSmokerCtr;
    private int iUnknownCtr;

    private JCas jcas_local;
    private ResolutionAnnotator ra;
    private ResourceManager ResMgr;
    private Set<String> sectionsToIgnore;
}