blob: 103b7b0caca28f364035fda9acae5c617a04a6e3 [file] [log] [blame]
/** \file filename annotator_dump.cpp
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
-------------------------------------------------------------------------- */
/* ----------------------------------------------------------------------- */
/* Include dependencies */
/* ----------------------------------------------------------------------- */
// this is included ONCE for the main source file of each binary
#include "uima/pragmas.hpp"
#include "uima/annotator_dump.hpp"
#include "uima/xmlwriter.hpp"
#include <iostream>
#include <algorithm>
#ifdef _MSC_VER
#include <minmax.h> // for min
#endif
using namespace std;
#include "uima/assertmsg.h"
#include "uima/macros.h"
#include "uima/trace.hpp"
#include "uima/strconvert.hpp"
//#include "uima/fixed_vector.hpp"
//config parameter names
/* ----------------------------------------------------------------------- */
/* Constants */
/* ----------------------------------------------------------------------- */
#define ANNOTATOR_DUMP_PARAM_OUTFILE _TEXT("OutputFile")
#define ANNOTATOR_DUMP_PARAM_APPEND _TEXT("AppendFile")
#define ANNOTATOR_DUMP_PARAM_DUMP_DOCBUFFER _TEXT("DumpDocBuffer")
#define ANNOTATOR_DUMP_PARAM_SAVE_DOCBUFFER _TEXT("SaveDocBuffer")
#define ANNOTATOR_DUMP_PARAM_STYLE _TEXT("OutputStyle")
#define ANNOTATOR_DUMP_OUTPUT_TYPES _TEXT("OutputTypes")
//error codes
const int ANNOTATOR_DUMP_ERROR_OFFSET = 100;
const int ANNOTATOR_DUMP_ERROR_OPEN = (0 + ANNOTATOR_DUMP_ERROR_OFFSET);
const int ANNOTATOR_DUMP_ERROR_AT = (1 + ANNOTATOR_DUMP_ERROR_OFFSET);
const int ANNOTATOR_DUMP_WARN_OFFSET = 200;
const int ANNOTATOR_DUMP_WARN_TET = (0 + ANNOTATOR_DUMP_WARN_OFFSET);
const int ANNOTATOR_DUMP_MSG_OFFSET = 300;
const int ANNOTATOR_DUMP_MSG_STYLE = (0 + ANNOTATOR_DUMP_MSG_OFFSET);
const int ANNOTATOR_DUMP_MSG_TYPES = (1 + ANNOTATOR_DUMP_MSG_OFFSET);
/* ----------------------------------------------------------------------- */
/* Implementation */
/* ----------------------------------------------------------------------- */
// Default Constructor
AnnotatorDump::AnnotatorDump(void) :
iv_clOutputStream() {}
AnnotatorDump::~AnnotatorDump(void) {
;
}
TyErrorId
AnnotatorDump::initialize(
AnnotatorContext & rclAnnotatorContext) {
TyErrorId tyErrId;
string strFileName;
int uiOutputStyle;
// Default Values
// in append mode all data in a session/collection is dumped into one file
// otherwise the same dump file is deleted and rewritten for each document
// in the session/collection
// default is false
iv_bAppendFile = false;
// we don't dump the Document Buffer
iv_bDumpDocBuffer = false;
iv_bSaveDocBuffer = false;
// the representation will be in Xml-Format
iv_enOutputStyle = Xml;
// Reading the Values from the Config-Section
//Filename for the Output-Stream
icu::UnicodeString us;
tyErrId = rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_PARAM_OUTFILE, us);
// Convert filename to default encoding
UnicodeStringRef usr(us);
usr.extract(strFileName);
if (tyErrId != UIMA_ERR_NONE) {
getAnnotatorContext().getLogger().logError(
_TEXT("Required option '" ANNOTATOR_DUMP_PARAM_OUTFILE "' not found"),
(long)ANNOTATOR_DUMP_ERROR_OPEN);
return(UIMA_ERR_USER_ANNOTATOR_CONFIG_INVALID_PARAM);
}
iv_clOutputFilename = strFileName.c_str();
(void) rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_PARAM_DUMP_DOCBUFFER, iv_bDumpDocBuffer);
(void) rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_PARAM_SAVE_DOCBUFFER, iv_bSaveDocBuffer);
(void) rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_PARAM_APPEND, iv_bAppendFile);
// in append mode all data in a session/collection is dumped into one file
// otherwise the same dump file is deleted and rewritten for each document
// in the session/collection
if (iv_bAppendFile) {
tyErrId = openOutputFile();
if (tyErrId != UIMA_ERR_NONE) {
return tyErrId;
}
}
//Output Style
uiOutputStyle = 0;
tyErrId = rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_PARAM_STYLE, uiOutputStyle);
if (tyErrId != UIMA_ERR_CONFIG_OPTION_NOT_FOUND) {
switch (uiOutputStyle) {
case 0:
iv_enOutputStyle = Xml;
break;
case 1:
iv_enOutputStyle = XCas;
break;
default:
getAnnotatorContext().getLogger().logWarning(
"Invalid Output Style. Use Default",
(long) ANNOTATOR_DUMP_MSG_STYLE);
break;
}
}
/*
int iDummy;
if ( rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_OUTPUT_TYPES, iDummy)){
getAnnotatorContext().getLogger().logWarning((long) ANNOTATOR_DUMP_MSG_TYPES,
_TEXT("Specification of output types currently not suported. All types will be dumped."));
}
*/
// Test getting a multi-valued parameter
vector<string*> vecOutputTypes;
tyErrId = rclAnnotatorContext.extractValue(ANNOTATOR_DUMP_OUTPUT_TYPES, vecOutputTypes);
if (tyErrId != UIMA_ERR_CONFIG_OPTION_NOT_FOUND) {
size_t i, len = 0;
for (i=0; i<vecOutputTypes.size(); ++i) {
string * rString = vecOutputTypes[i];
len += rString->length();
}
cout << " parameter OutputTypes has "<<i<<" values with a total of "<<len<<" characters." << endl;
// Release contents of vector allocated by extractValue in case library uses a different heap.
rclAnnotatorContext.release(vecOutputTypes);
}
// Release string buffer allocated by library in case it uses a different heap.
usr.release(strFileName);
return(TyErrorId)UIMA_ERR_NONE;
}
TyErrorId AnnotatorDump::typeSystemInit(uima::TypeSystem const & typeSystem) {
// Test that can get all types in a vector
std::vector<Type> allTypes;
typeSystem.getAllTypes(allTypes);
cout << " typeSystem has " << allTypes.size() << " types -" << endl << " from '"
<< allTypes[0].getName() << "' to '" << allTypes[allTypes.size()-1].getName() << "'" << endl;
typeSystem.release(allTypes); // Release storage allocated from library's heap
std::vector<Feature> allFeats;
typeSystem.getAllFeatures(allFeats);
cout << " typeSystem has " << allFeats.size() << " features -" << endl << " from '"
<< allFeats[0].getName() << "' to '" << allFeats[allFeats.size()-1].getName() << "'" << endl;
typeSystem.release(allFeats); // Release storage allocated from library's heap
return UIMA_ERR_NONE;
}
TyErrorId
AnnotatorDump::openOutputFile( void ) {
iv_clOutputStream.open(iv_clOutputFilename.getAsCString()); //overwrite
if (iv_clOutputStream.good()) {
return UIMA_ERR_NONE;
}
getAnnotatorContext().getLogger().logError(
string("Failed to open Output File ")+ iv_clOutputFilename.getAsCString(),
(long) ANNOTATOR_DUMP_ERROR_OPEN);
return(UIMA_ERR_USER_ANNOTATOR_IO_WRITE_PROBLEM);
}
void
AnnotatorDump::closeOutputFile( void ) {
iv_clOutputStream.close();
}
TyErrorId
AnnotatorDump::destroy(
) {
if (iv_bAppendFile) {
closeOutputFile();
}
return(TyErrorId)UIMA_ERR_NONE;
}
TyErrorId
AnnotatorDump::reconfigure(
) {
return(TyErrorId)UIMA_ERR_NONE;
}
TyErrorId
AnnotatorDump::process(CAS & tcas,
const ResultSpecification &
) {
TyErrorId tyErrId;
// in append mode all data in a session/collection is dumped into one file
// otherwise the same dump file is deleted and rewritten for each document
// in the session/collection
if (!iv_bAppendFile) {
tyErrId = openOutputFile();
if (tyErrId != UIMA_ERR_NONE) {
return tyErrId;
}
}
assert(iv_clOutputStream.good());
if (iv_bDumpDocBuffer) {
// Dumping the Document Buffer
UnicodeStringRef doc = tcas.getDocumentText();
outputDocBuffer(doc);
}
uima::CASWriterABase * writer = NULL;
switch (iv_enOutputStyle) {
case Xml:
writer = new uima::XMLDumpWriter(tcas, iv_bDumpDocBuffer);
break;
case XCas:
writer = new uima::XCASWriter(tcas, iv_bDumpDocBuffer);
break;
default:
assert(false);
}
assert( EXISTS(writer) );
auto_ptr<CASWriterABase> apWriter( writer );
apWriter->write(iv_clOutputStream);
// in append mode all data in a session/collection is dumped into one file
// otherwise the same dump file is deleted and rewritten for each document
// in the session/collection
if (!iv_bAppendFile) {
closeOutputFile();
}
return(TyErrorId)UIMA_ERR_NONE;
}
void AnnotatorDump::outputDocBuffer(UnicodeStringRef const & crclDoc) {
assert(crclDoc.length() > 0);
uima::util::Filename clFilename(iv_clOutputFilename);
ofstream clOutStream;
TyDocIndex tyIndexFirst = 0;
TyDocIndex tyIndexLast = crclDoc.length() - 1;
clFilename.setNewExtension(_TEXT(".asc"));
if (iv_bAppendFile) {
clOutStream.open(clFilename.getAsCString(), ios::out | ios::app); //append (create, resp.)
} else {
clOutStream.open(clFilename.getAsCString()); //overwrite
}
if (iv_clOutputStream.good()) {
clOutStream << "Document Buffer Status:" << endl
<< "=======================" << endl;
clOutStream << endl
<< "Document buffer length...........: " << crclDoc.length() << endl
<< "Document buffer size in memory...: " << (crclDoc.length() * sizeof(UChar)) << endl
<< "Document buffer index first......: " << tyIndexFirst << endl
<< "Document buffer index last.......: " << tyIndexLast << endl;
clOutStream << endl
<< "Document Buffer Dump:" << endl
<< "=====================" << endl;
assertWithMsg(sizeof(WORD16) == sizeof(UChar), "Port required");
DUMPHEX(clOutStream, (WORD16 const *)crclDoc.getBuffer(), crclDoc.length());
clOutStream << endl;
}
if (iv_bSaveDocBuffer) {
ofstream clOutStream;
clFilename.setNewExtension(_TEXT(".ucs"));
clOutStream.open(clFilename.getAsCString(), ios::binary); //overwrite
/** include byte-order-mark ??
clOutStream << CosClConverterABase::getUCS2HostEndianId();
**/
clOutStream.write((const char *) crclDoc.getBuffer(), (crclDoc.length() * sizeof(UChar)));
}
}
/* ----------------------------------------------------------------------- */
/* Mapping for generic C API wrapper */
/* ----------------------------------------------------------------------- */
typedef AnnotatorDump UserDefinedAnnotator;
// define for error/exception info in annotator_generic.inl
#define UIMA_ANNOTATOR_NAME "annotator_dump"
/* ----------------------------------------------------------------------- */
/* Include generic C API wrapper */
/* ----------------------------------------------------------------------- */
///#include "uima/annotator_generic.inl"
MAKE_AE(AnnotatorDump);
/* <EOF> */