// sandbox-2.3.0-05/TikaAnnotator/src/main/java/org/apache/uima/tika/MarkupAnnotator.java - uima-sandbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.uima.tika;

 import java.io.IOException;
 import java.io.InputStream;
 import java.net.URL;
 import java.util.Iterator;

 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.AutoDetectParser;
 import org.apache.tika.parser.Parser;
 import org.apache.uima.UimaContext;
 import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
 import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
 import org.apache.uima.cas.CAS;
 import org.apache.uima.cas.CASException;
 import org.apache.uima.cas.Type;
 import org.apache.uima.jcas.JCas;
 import org.apache.uima.jcas.cas.FSArray;
 import org.apache.uima.resource.ResourceInitializationException;


 /**
  * Uses TIKA to convert original markup into UIMA annotations.
  *
  * <p>The annotator reads the raw document from a source view, parses it with a
  * Tika {@link AutoDetectParser}, lets a {@link MarkupHandler} populate a newly
  * created text view, and stores the metadata reported by Tika as
  * {@link FeatureValue} entries on a {@link SourceDocumentAnnotation}.
  */
 public class MarkupAnnotator extends CasAnnotator_ImplBase {

 	private final static String ORIGINAL_VIEW_PARAM_NAME = "ORIGINAL_VIEW_PARAM_NAME";
 	private final static String TEXT_VIEW_PARAM_NAME = "TEXT_VIEW_PARAM_NAME";
 	private final static String SET_TEXT_VIEW_DEFAULT_PARAM_NAME = "SET_TEXT_VIEW_DEFAULT_PARAM_NAME";

 	private final static String tika_file_param = "tikaConfigFile";

 	// fallbacks used when the corresponding configuration parameter is not set
 	private final static String DEFAULT_ORIGINAL_VIEW_NAME = "_InitialView";
 	private final static String DEFAULT_TEXT_VIEW_NAME = "textView";

 	// takes an option indicating the name of the view containing the binary document
 	private String originalViewName = DEFAULT_ORIGINAL_VIEW_NAME;

 	// takes an option indicating the name of the view containing the text version of the document
 	private String textViewName = DEFAULT_TEXT_VIEW_NAME;

 	// whether to make the text view default or not
 	// NOTE(review): this value is read in initialize() but process() never consults it
 	private Boolean makeTextDefaultView = true;

 	// configuration for TIKA - can be created by specifying a custom resource
 	private TikaConfig config = null;

 	/**
 	 * Reads the configuration parameters and prepares the Tika configuration.
 	 *
 	 * <p>Every parameter that is not set falls back to its default value. A
 	 * custom Tika configuration is loaded from the resource registered under
 	 * {@code tikaConfigFile}; when none is available or it cannot be loaded,
 	 * the default Tika configuration is used.
 	 *
 	 * @param aContext the UIMA context giving access to parameters and resources
 	 * @throws ResourceInitializationException if the default Tika configuration
 	 *         cannot be obtained
 	 */
 	public void initialize(UimaContext aContext) throws ResourceInitializationException {
 		super.initialize(aContext);

 		// Get config param setting; keep the default instead of storing null
 		originalViewName = (String) aContext.getConfigParameterValue(ORIGINAL_VIEW_PARAM_NAME);
 		if (originalViewName == null) {
 			System.err.println("Parameter ORIGINAL_VIEW_PARAM_NAME is null; setting to \""
 					+ DEFAULT_ORIGINAL_VIEW_NAME + "\"");
 			originalViewName = DEFAULT_ORIGINAL_VIEW_NAME;
 		} else {
 			System.err.println("Parameter ORIGINAL_VIEW_PARAM_NAME is " + originalViewName);
 		}

 		textViewName = (String) aContext.getConfigParameterValue(TEXT_VIEW_PARAM_NAME);
 		if (textViewName == null) {
 			System.err.println("Parameter TEXT_VIEW_PARAM_NAME is null; setting to \"textView\"");
 			textViewName = DEFAULT_TEXT_VIEW_NAME;
 		} else {
 			System.err.println("Parameter TEXT_VIEW_PARAM_NAME is " + textViewName);
 		}

 		makeTextDefaultView = (Boolean) aContext.getConfigParameterValue(SET_TEXT_VIEW_DEFAULT_PARAM_NAME);
 		if (makeTextDefaultView == null) {
 			System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is null; setting to \"true\"");
 			makeTextDefaultView = Boolean.TRUE;
 		} else {
 			System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is " + makeTextDefaultView);
 		}

 		// initialise TIKA parser
 		// try to get a custom config; a missing resource is not an error
 		config = null;
 		URL tikaConfigURL = null;
 		try {
 			tikaConfigURL = getContext().getResourceURL(tika_file_param);
 			if (tikaConfigURL != null) {
 				config = new TikaConfig(tikaConfigURL);
 			}
 		} catch (Exception e1) {
 			// to log
 			System.err.println("Failed to load TIKA config file from " + tikaConfigURL + " : " + e1);
 			config = null;
 		}

 		// if not rely on default one
 		if (config == null) {
 			try {
 				config = TikaConfig.getDefaultConfig();
 			} catch (TikaException e) {
 				throw new ResourceInitializationException(e);
 			}
 		}
 	}

 	/**
 	 * Parses the content of the original view with Tika and fills the text view.
 	 *
 	 * <p>Problems while locating or converting the document are reported on
 	 * {@code System.err} and the method returns without creating the text view.
 	 *
 	 * @param cas the CAS holding the document to convert
 	 * @throws AnalysisEngineProcessException declared by the overridden method
 	 */
 	public void process(CAS cas) throws AnalysisEngineProcessException {
 		CAS originalCas;
 		try {
 			originalCas = cas.getView(originalViewName);
 		} catch (Exception e) {
 			// can't find originalViewName
 			String viewName = cas.getViewName();
 			System.err.println("can't find view " + originalViewName + " using " + viewName + " instead");
 			originalCas = cas.getCurrentView();
 		}

 		// a view without a sofa or without data has nothing to parse
 		InputStream originalStream = null;
 		if (originalCas.getSofa() != null) {
 			originalStream = originalCas.getSofa().getSofaDataStream();
 		}
 		if (originalStream == null) {
 			System.err.println("Problem converting file : no data in view " + originalCas.getViewName());
 			return;
 		}

 		// language detection is not implemented yet, the value stays unset
 		String lang = null;

 		// parsing with TIKA

 		// TODO if content type is known then we use it
 		// otherwise we guess
 		Parser parser = new AutoDetectParser(config);
 		Metadata md = new Metadata();
 		MarkupHandler handler = new MarkupHandler();

 		try {
 			parser.parse(originalStream, handler, md);
 		} catch (Exception e) {
 			// if we have a problem just dump the message and continue
 			System.err.println("Problem converting file : " + e.getMessage());
 			// PROBLEM => trying to serialize binary content in XML crash!
 			return;
 		} finally {
 			try {
 				originalStream.close();
 			} catch (IOException ignored) {
 				// closing is best effort; the outcome of the parse is already known
 			}
 		}

 		CAS plainTextView = cas.createView(textViewName);

 		handler.populateCAS(plainTextView);
 		plainTextView.setDocumentLanguage(lang);

 		// get additional metadata about the document
 		// e.g content type etc...
 		// TODO add possibility to define type as parameter and discover
 		// feature names on the fly
 		JCas ptv;
 		try {
 			ptv = plainTextView.getJCas();
 		} catch (CASException e) {
 			e.printStackTrace();
 			return;
 		}

 		// NOTE(review): the type name uses the namespace org.apache.uima while this
 		// class lives in org.apache.uima.tika - confirm against the type system descriptor
 		Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.SourceDocumentAnnotation");
 		Iterator<?> iter = ptv.getAnnotationIndex(docAnnotationType).iterator();

 		// do we already have one? otherwise let's create a new annotation
 		boolean created = !iter.hasNext();
 		SourceDocumentAnnotation docAnnotation = created
 				? new SourceDocumentAnnotation(ptv)
 				: (SourceDocumentAnnotation) iter.next();

 		// now iterate on the metadata found by Tika and add them to the info;
 		// the array is (re)allocated whenever it cannot hold every entry
 		String[] names = md.names();
 		FSArray features = docAnnotation.getFeatures();
 		if (features == null || features.size() < names.length) {
 			docAnnotation.setFeatures(new FSArray(ptv, names.length));
 		}

 		for (int i = 0; i < names.length; i++) {
 			FeatureValue fv = new FeatureValue(ptv);
 			fv.setName(names[i]);
 			fv.setValue(md.get(names[i]));
 			docAnnotation.setFeatures(i, fv);
 		}

 		// an annotation obtained from the index is already indexed
 		if (created) {
 			docAnnotation.addToIndexes();
 		}
 	}

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.uima.tika;

	import java.io.IOException;
	import java.io.InputStream;
	import java.net.URL;
	import java.util.Iterator;

	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.parser.AutoDetectParser;
	import org.apache.tika.parser.Parser;
	import org.apache.uima.UimaContext;
	import org.apache.uima.analysis_component.CasAnnotator_ImplBase;
	import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
	import org.apache.uima.cas.CAS;
	import org.apache.uima.cas.CASException;
	import org.apache.uima.cas.Type;
	import org.apache.uima.jcas.JCas;
	import org.apache.uima.jcas.cas.FSArray;
	import org.apache.uima.resource.ResourceInitializationException;


	/ Uses TIKA to convert original markup into UIMA annotations/
	public class MarkupAnnotator extends CasAnnotator_ImplBase {


	private final static String ORIGINAL_VIEW_PARAM_NAME = "ORIGINAL_VIEW_PARAM_NAME";
	private final static String TEXT_VIEW_PARAM_NAME = "TEXT_VIEW_PARAM_NAME";
	private final static String SET_TEXT_VIEW_DEFAULT_PARAM_NAME = "SET_TEXT_VIEW_DEFAULT_PARAM_NAME";

	private final static String tika_file_param = "tikaConfigFile";

	// takes an option indicating the name of the view containing the binary document
	private String originalViewName = "_InitialView";

	// takes an option indicating the name of the view containing the text version of the document
	private String textViewName = "textView";

	// whether to make the text view default or not
	private Boolean makeTextDefaultView = true;

	// configuration for TIKA - can be created by specifying a custom resource
	private TikaConfig config = null;

	public void initialize(UimaContext aContext) throws ResourceInitializationException {
	super.initialize(aContext);
	// Get config param setting
	originalViewName = (String) aContext.getConfigParameterValue(ORIGINAL_VIEW_PARAM_NAME);

	textViewName = (String) aContext.getConfigParameterValue(TEXT_VIEW_PARAM_NAME);
	if (textViewName==null) {
	System.err.println("Parameter TEXT_VIEW_PARAM_NAME is null; setting to \"textView\"");
	textViewName = "textView";
	}
	else System.err.println("Parameter TEXT_VIEW_PARAM_NAME is "+textViewName);

	makeTextDefaultView = (Boolean) aContext.getConfigParameterValue(SET_TEXT_VIEW_DEFAULT_PARAM_NAME);
	if (makeTextDefaultView==null) {
	System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is null; setting to \"true\"");
	makeTextDefaultView = new Boolean(true);
	}
	else System.err.println("Parameter SET_TEXT_VIEW_DEFAULT_PARAM_NAME is "+makeTextDefaultView);

	// initialise TIKA parser
	// try to get a custom config
	URL tikaConfigURL = null;
	try {
	tikaConfigURL = getContext().getResourceURL(tika_file_param);
	config = new TikaConfig(tikaConfigURL);
	} catch (Exception e1) {
	// to log
	System.err.println("Failed to load TIKA config file from "+tikaConfigURL);
	config = null;
	}

	// if not rely on default one
	if (config==null){
	try {
	config = TikaConfig.getDefaultConfig();
	} catch (TikaException e) {
	throw new ResourceInitializationException(e);
	}
	}

	}

	public void process(CAS cas) throws AnalysisEngineProcessException {
	CAS originalCas = null;
	try {
	originalCas = cas.getView(originalViewName);
	}
	catch (Exception e){
	String viewName = cas.getViewName();
	// can't find originalViewName
	System.err.println("can't find view "+originalViewName+" using "+viewName+" instead");
	originalCas = cas.getCurrentView();
	}

	InputStream originalStream = originalCas.getSofa().getSofaDataStream();

	String lang = null;

	// parsing with TIKA

	// TODO if content type is known then we use it
	// otherwise we guess

	Parser parser = new AutoDetectParser(config);

	Metadata md = new Metadata();
	MarkupHandler handler = new MarkupHandler();

	try {
	parser.parse(originalStream,handler , md);
	}
	catch (Exception e){
	// if we have a problem just dump the message and continue
	System.err.println("Problem converting file : "+e.getMessage());
	// PROBLEM => trying to serialize binary content in XML crash!
	return;
	}
	finally {
	try {
	originalStream.close();
	} catch (IOException e) {
	}
	}

	CAS plainTextView = cas.createView(textViewName);


	handler.populateCAS(plainTextView);
	plainTextView.setDocumentLanguage(lang);

	// get additional metadata about the document
	// e.g content type etc...
	// TODO add possibility to define type as parameter and discover
	// feature names on the fly
	JCas ptv=null;
	try {
	ptv = plainTextView.getJCas();
	} catch (CASException e) {
	e.printStackTrace();
	return;
	}

	Type docAnnotationType = ptv.getTypeSystem().getType("org.apache.uima.SourceDocumentAnnotation");
	Iterator iter = ptv.getAnnotationIndex(docAnnotationType).iterator();
	SourceDocumentAnnotation docAnnotation = null;
	// do we already have one?
	if (iter.hasNext()) docAnnotation = (SourceDocumentAnnotation) iter.next();
	// otherwise let's create a new annotation
	else docAnnotation = new SourceDocumentAnnotation(ptv);

	// now iterate on the metadata found by Tika and add them to the info
	if (docAnnotation.getFeatures()==null)
	docAnnotation.setFeatures((FSArray) cas.createArrayFS(md.size())) ;

	for (int i=0;i<md.size();i++){
	String name = md.names()[i];
	String value = md.get(name);
	FeatureValue fv = new FeatureValue(ptv);
	fv.setName(name);
	fv.setValue(value);
	docAnnotation.setFeatures(i,fv);
	}
	docAnnotation.addToIndexes();

	}

	}