blob: ae0cf2aff0279be7c02d9b8a38e0eec7a1424ec3 [file] [log] [blame]
/** \file parser_interface.hpp .
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
\brief Interface class for parsers.
-------------------------------------------------------------------------- */
#ifndef UIMA_PARSER_INTERFACE_HPP
#define UIMA_PARSER_INTERFACE_HPP
// ---------------------------------------------------------------------------
// Includes
// ---------------------------------------------------------------------------
#include "uima/pragmas.hpp" //must be first to surpress warnings
#include <iostream>
namespace uima {
class ParserConfiguration;
class TextAnalysisEngine;
class CAS;
/**
* The class ParserInterface is used as an abstract base class for all
* document parsers.
* This is just a sketch of how this interface should be used.
* Things like (the facade?) creating instances of objects implementing
* this interface are not thought through yet.
* <pre>
* TAE engine = createTAE...
* ParserFacade parserFacade(engine);
* parserFacade.setMultiDocCallback(...); // optional for mulit-doc formats
* // for each supported parser beyond the pre-defined ones
* parserFacade.registerParserForType(p, t, config);
* for each document {
* parserFacade.parseDocument(d,[config]); // pre-fills the CAS
* engine.process(...); // annotators fill the CAS
* ... read out results ...
* engine.reset(...) // flush CAS
* }
* engine.destroy();
* </pre>
*/
class ParserInterface {
public:
/**
* Callback interface to make it possible for applications to get
* notified every time an embedded document is done, so that
* they can retrieve their results
*
* @see ParserInterface::setMultiDocCallback
*/
class MultiDocCallbackInterface {
public:
/**
* Called <em>after</em> a multi-doc parser detects the end
* of document.
*
* An application should call Engine::processDocument() there
* and retrieve the results of document processing after that
* using iterators over the CAS or TCAS.
* Finally an application should call resetDocument().
*
* An application should <em>not</em> call addDocPartsFinish()
* or addDocPartsFinish() in this function since a conforming
* parser is supposed to do any doc part processing.
*/
virtual
void documentBoundaryReached(UChar const * cpBuffer, size_t uiLength) = 0;
};
/**
* Initialize the parser.
* The parser is beeing passed an engine object and not a CAS because
* for (XML) files that contain multiple documents the parser must be
* able to call the process functions for each embedded document.
*
* An implementation needs to store the argument objects for later
* use in function parse().
*
* Called once per session.
*
* @param config The configuration for the parser
* @param engine The engine object into which the results go
* @param fallback An encoding to use in
* case the parser can't determine the encoding
* by other means
*
* @return UIMA_ERR_NONE if OK, error code otherwise
*/
virtual
TyErrorId init(ParserConfiguration const & config, TextAnalysisEngine & engine, const char * = "Latin1" ) = 0;
/**
* @see MultiDocCallbackInterface
*/
virtual
void setMultiDocCallback(MultiDocCallbackInterface & callbackObject) = 0;
/**
* Do the parsing add the tag free text to the CAS and potentially
* translate tag information to CAS annotations.
*
* Called once per document.
*
* @param inputFileName The input to process
* @return UIMA_ERR_NONE if OK, error code otherwise
*/
virtual
TyErrorId parseDocument(char* const inputFileName) = 0;
virtual
TyErrorId parseDocument(std::istream & inputFileStream) = 0;
/**
* Returns the number of documents parsed by the parser.
* This will always be 1 for HTML but can be more for XML.
*
* Optionally called once per document.
*/
virtual
size_t getNumberOfDocumentsParsed() const = 0;
/**
* Returns the number of bytes parsed by the parser.
* Information function to allow throughput computation by calling
* environment.
*
* Optionally called once per document.
*/
virtual
size_t getNumberOfBytesParsed() const = 0;
/**
* De-initialize the parser (free ressources etc.)
*
* Called once per session.
*
* @return UIMA_ERR_NONE if OK, error code otherwise
*/
virtual
TyErrorId deInit() = 0;
}
; /* ParserInterface */
} // namespace uima
#endif //UIMA_PARSER_INTERFACE_HPP