src/test/src/uima/parser_interface.hpp - uima-uimacpp - Git at Google

 /** \file parser_interface.hpp .

  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.


    \brief  Interface class for parsers.

 -------------------------------------------------------------------------- */
 #ifndef UIMA_PARSER_INTERFACE_HPP
 #define UIMA_PARSER_INTERFACE_HPP


 // ---------------------------------------------------------------------------
 //  Includes
 // ---------------------------------------------------------------------------

 #include "uima/pragmas.hpp" //must be first to surpress warnings
 #include <iostream>

 namespace uima {

   class ParserConfiguration;
   class TextAnalysisEngine;
   class CAS;

   /**
    * The class ParserInterface is used as an abstract base class for all
    * document parsers.
    * This is just a sketch of how this interface should be used.
    * Things like (the facade?) creating instances of objects implementing
    * this interface are not thought through yet.
    * <pre>
    * TAE engine = createTAE...
    * ParserFacade parserFacade(engine);
    * parserFacade.setMultiDocCallback(...); // optional for mulit-doc formats
    * // for each supported parser beyond the pre-defined ones
    * parserFacade.registerParserForType(p, t, config);
    * for each document {
    *    parserFacade.parseDocument(d,[config]);    // pre-fills the CAS
    *    engine.process(...);  // annotators fill the CAS
    *    ... read out results ...
    *    engine.reset(...)     // flush CAS
    * }
    * engine.destroy();
    * </pre>
    */
   class ParserInterface {
   public:
     /**
      * Callback interface to make it possible for applications to get
      * notified every time an embedded document is done, so  that
      * they can retrieve their results
      *
      * @see ParserInterface::setMultiDocCallback
      */
     class MultiDocCallbackInterface {
     public:
       /**
        * Called <em>after</em> a multi-doc parser detects the end
        * of document.
        *
        * An application should call Engine::processDocument() there
        * and retrieve the results of document processing after that
        * using iterators over the CAS or TCAS.
        * Finally an application should call resetDocument().
        *
        * An application should <em>not</em> call addDocPartsFinish()
        * or addDocPartsFinish() in this function since a conforming
        * parser is supposed to do any doc part processing.
        */
       virtual
       void documentBoundaryReached(UChar const * cpBuffer, size_t uiLength) = 0;
     };

     /**
      * Initialize the parser.
      * The parser is beeing passed an engine object and not a CAS because
      * for (XML) files that contain multiple documents the parser must be
      * able to call the process functions for each embedded document.
      *
      * An implementation needs to store the argument objects for later
      * use in function parse().
      *
      * Called once per session.
      *
      * @param config     The configuration for the parser
      * @param engine     The engine object into which the results go
      * @param fallback   An encoding to use in
      *                   case the parser can't determine the encoding
      *                   by other means
      *
      * @return           UIMA_ERR_NONE if OK, error code otherwise
      */
     virtual
     TyErrorId init(ParserConfiguration const & config, TextAnalysisEngine & engine, const char * = "Latin1" ) = 0;
     /**
      * @see MultiDocCallbackInterface
      */
     virtual
     void setMultiDocCallback(MultiDocCallbackInterface & callbackObject) = 0;

     /**
      * Do the parsing add the tag free text to the CAS and potentially
      * translate tag information to CAS annotations.
      *
      * Called once per document.
      *
      * @param inputFileName The input to process
      * @return              UIMA_ERR_NONE if OK, error code otherwise
      */
     virtual
     TyErrorId parseDocument(char* const inputFileName) = 0;
     virtual
     TyErrorId parseDocument(std::istream & inputFileStream) = 0;

     /**
      * Returns the number of documents parsed by the parser.
      * This will always be 1 for HTML but can be more for XML.
      *
      * Optionally called once per document.
      */
     virtual
     size_t getNumberOfDocumentsParsed() const = 0;

     /**
      * Returns the number of bytes parsed by the parser.
      * Information function to allow throughput computation by calling
      * environment.
      *
      * Optionally called once per document.
      */
     virtual
     size_t getNumberOfBytesParsed() const = 0;

     /**
      * De-initialize the parser (free ressources etc.)
      *
      * Called once per session.
      *
      * @return           UIMA_ERR_NONE if OK, error code otherwise
      */
     virtual
     TyErrorId deInit() = 0;
   }
   ; /* ParserInterface */

 } // namespace uima

 #endif //UIMA_PARSER_INTERFACE_HPP
	/** \file parser_interface.hpp .

	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.


	\brief Interface class for parsers.

	-------------------------------------------------------------------------- */
	#ifndef UIMA_PARSER_INTERFACE_HPP
	#define UIMA_PARSER_INTERFACE_HPP


	// ---------------------------------------------------------------------------
	// Includes
	// ---------------------------------------------------------------------------

	#include "uima/pragmas.hpp" //must be first to surpress warnings
	#include <iostream>

	namespace uima {

	class ParserConfiguration;
	class TextAnalysisEngine;
	class CAS;

	/**
	* The class ParserInterface is used as an abstract base class for all
	* document parsers.
	* This is just a sketch of how this interface should be used.
	* Things like (the facade?) creating instances of objects implementing
	* this interface are not thought through yet.
	* <pre>
	* TAE engine = createTAE...
	* ParserFacade parserFacade(engine);
	* parserFacade.setMultiDocCallback(...); // optional for mulit-doc formats
	* // for each supported parser beyond the pre-defined ones
	* parserFacade.registerParserForType(p, t, config);
	* for each document {
	* parserFacade.parseDocument(d,[config]); // pre-fills the CAS
	* engine.process(...); // annotators fill the CAS
	* ... read out results ...
	* engine.reset(...) // flush CAS
	* }
	* engine.destroy();
	* </pre>
	*/
	class ParserInterface {
	public:
	/**
	* Callback interface to make it possible for applications to get
	* notified every time an embedded document is done, so that
	* they can retrieve their results
	*
	* @see ParserInterface::setMultiDocCallback
	*/
	class MultiDocCallbackInterface {
	public:
	/**
	* Called <em>after</em> a multi-doc parser detects the end
	* of document.
	*
	* An application should call Engine::processDocument() there
	* and retrieve the results of document processing after that
	* using iterators over the CAS or TCAS.
	* Finally an application should call resetDocument().
	*
	* An application should <em>not</em> call addDocPartsFinish()
	* or addDocPartsFinish() in this function since a conforming
	* parser is supposed to do any doc part processing.
	*/
	virtual
	void documentBoundaryReached(UChar const * cpBuffer, size_t uiLength) = 0;
	};

	/**
	* Initialize the parser.
	* The parser is beeing passed an engine object and not a CAS because
	* for (XML) files that contain multiple documents the parser must be
	* able to call the process functions for each embedded document.
	*
	* An implementation needs to store the argument objects for later
	* use in function parse().
	*
	* Called once per session.
	*
	* @param config The configuration for the parser
	* @param engine The engine object into which the results go
	* @param fallback An encoding to use in
	* case the parser can't determine the encoding
	* by other means
	*
	* @return UIMA_ERR_NONE if OK, error code otherwise
	*/
	virtual
	TyErrorId init(ParserConfiguration const & config, TextAnalysisEngine & engine, const char * = "Latin1" ) = 0;
	/**
	* @see MultiDocCallbackInterface
	*/
	virtual
	void setMultiDocCallback(MultiDocCallbackInterface & callbackObject) = 0;

	/**
	* Do the parsing add the tag free text to the CAS and potentially
	* translate tag information to CAS annotations.
	*
	* Called once per document.
	*
	* @param inputFileName The input to process
	* @return UIMA_ERR_NONE if OK, error code otherwise
	*/
	virtual
	TyErrorId parseDocument(char* const inputFileName) = 0;
	virtual
	TyErrorId parseDocument(std::istream & inputFileStream) = 0;

	/**
	* Returns the number of documents parsed by the parser.
	* This will always be 1 for HTML but can be more for XML.
	*
	* Optionally called once per document.
	*/
	virtual
	size_t getNumberOfDocumentsParsed() const = 0;

	/**
	* Returns the number of bytes parsed by the parser.
	* Information function to allow throughput computation by calling
	* environment.
	*
	* Optionally called once per document.
	*/
	virtual
	size_t getNumberOfBytesParsed() const = 0;

	/**
	* De-initialize the parser (free ressources etc.)
	*
	* Called once per session.
	*
	* @return UIMA_ERR_NONE if OK, error code otherwise
	*/
	virtual
	TyErrorId deInit() = 0;
	}
	; /* ParserInterface */

	} // namespace uima

	#endif //UIMA_PARSER_INTERFACE_HPP