/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.manifoldcf.agents.transformation.opennlp;
import java.io.*;
import java.nio.charset.StandardCharsets;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
import java.util.List;
import java.util.Locale;
import java.util.Map;
import java.util.TreeMap;
import java.util.Map.Entry;
import java.util.Set;
import java.util.HashSet;
import opennlp.tools.namefind.NameFinderME;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;
import org.apache.manifoldcf.agents.interfaces.IOutputAddActivity;
import org.apache.manifoldcf.agents.interfaces.IOutputCheckActivity;
import org.apache.manifoldcf.agents.interfaces.RepositoryDocument;
import org.apache.manifoldcf.agents.interfaces.ServiceInterruption;
import org.apache.manifoldcf.agents.system.Logging;
import org.apache.manifoldcf.agents.system.ManifoldCF;
import org.apache.manifoldcf.agents.transformation.BaseTransformationConnector;
import org.apache.manifoldcf.core.interfaces.IHTTPOutput;
import org.apache.manifoldcf.core.interfaces.IPostParameters;
import org.apache.manifoldcf.core.interfaces.ManifoldCFException;
import org.apache.manifoldcf.core.interfaces.Specification;
import org.apache.manifoldcf.core.interfaces.SpecificationNode;
import org.apache.manifoldcf.core.interfaces.VersionContext;
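/** OpenNLP named-entity extraction transformation connector.
*
* This connector expects UTF-8 text input, such as the output of the Tika
* extractor. It runs OpenNLP sentence detection, tokenization, and the
* configured name-finder models over each document, attaches the recognized
* entities as additional document fields keyed by the configured parameter
* names, and passes the document content downstream unchanged.
*/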
public class OpenNlpExtractor extends BaseTransformationConnector {
private static final String EDIT_SPECIFICATION_JS = "editSpecification.js";
private static final String EDIT_SPECIFICATION_OPENNLP_HTML = "editSpecification_OpenNLP.html";
private static final String VIEW_SPECIFICATION_HTML = "viewSpecification.html";
protected static int maximumExtractionCharacters = 524288;
// Meta-data fields added by this connector
private static final String PERSONS = "ner_people";
private static final String LOCATIONS = "ner_locations";
private static final String ORGANIZATIONS = "ner_organizations";
protected static final String ACTIVITY_EXTRACT = "extract";
protected static final String[] activitiesList = new String[] { ACTIVITY_EXTRACT };
protected final File fileDirectory = ManifoldCF.getFileProperty(ManifoldCF.fileResourcesProperty);
/** We handle up to 64K in memory; after that we go to disk. */
protected static final long inMemoryMaximumFile = 65536;
/**
* Return a list of activities that this connector generates. The connector
* does NOT need to be connected before this method is called.
*
* @return the set of activities.
*/
@Override
public String[] getActivitiesList() {
return activitiesList;
}
/**
* Get a pipeline version string, given a pipeline specification object. The
* version string is used to uniquely describe the pertinent details of the
* specification and the configuration, to allow the Connector Framework to
* determine whether a document will need to be processed again. Note that
* the contents of any document cannot be considered by this method; only
* configuration and specification information can be considered.
*
* This method presumes that the underlying connector object has been
* configured.
*
* @param os
* is the current pipeline specification object for this
* connection for the job that is doing the crawling.
* @return a string, of unlimited length, which uniquely describes
* configuration and specification in such a way that if two such
* strings are equal, nothing that affects how or whether the
* document is indexed will be different.
*/
@Override
public VersionContext getPipelineDescription(Specification os) throws ManifoldCFException, ServiceInterruption {
SpecPacker sp = new SpecPacker(os);
return new VersionContext(sp.toPackedString(), params, os);
}
/**
* Add (or replace) a document in the output data store using the connector.
* This method presumes that the connector object has been configured, and
* it is thus able to communicate with the output data store should that be
* necessary. The OutputSpecification is *not* provided to this method,
* because the goal is consistency, and if output is done it must be
* consistent with the output description, since that was what was partly
* used to determine if output should be taking place. So it may be
* necessary for this method to decode an output description string in order
* to determine what should be done.
*
* @param documentURI
* is the URI of the document. The URI is presumed to be the
* unique identifier which the output data store will use to
* process and serve the document. This URI is constructed by the
* repository connector which fetches the document, and is thus
* universal across all output connectors.
* @param pipelineDescription
* is the version context that was constructed for this
* document by the getPipelineDescription() method.
* @param document
* is the document data to be processed (handed to the output
* data store).
* @param authorityNameString
* is the name of the authority responsible for authorizing any
* access tokens passed in with the repository document. May be
* null.
* @param activities
* is the handle to an object that the implementer of a pipeline
* connector may use to perform operations, such as logging
* processing activity, or sending a modified document to the
* next stage in the pipeline.
* @return the document status (accepted or permanently rejected).
* @throws IOException
* only if there's a stream error reading the document data.
*/
@Override
public int addOrReplaceDocumentWithException(String documentURI, VersionContext pipelineDescription,
RepositoryDocument document, String authorityNameString, IOutputAddActivity activities)
throws ManifoldCFException, ServiceInterruption, IOException {
// assumes use of Tika extractor before using this connector
Logging.agents.debug("Starting OpenNlp extraction");
SpecPacker sp = new SpecPacker(pipelineDescription.getSpecification());
// In order to be able to replay the input stream both for extraction and for downstream use,
// we need to page through it, some number of characters at a time, and write those into a local buffer.
// We can do this at the same time we're extracting, if we're clever.
// Set up to spool back the original content, using either memory or disk, whichever makes sense.
DestinationStorage ds;
if (document.getBinaryLength() <= inMemoryMaximumFile) {
ds = new MemoryDestinationStorage((int)document.getBinaryLength());
} else {
ds = new FileDestinationStorage();
}
try {
// For logging, we'll need all of this
long startTime = System.currentTimeMillis();
String resultCode = "OK";
String description = null;
Long length = null;
final MetadataAccumulator ma = new MetadataAccumulator(sp, document.getBinaryLength());
try {
// Page through document content, saving it aside into destination storage, while also extracting the content
final OutputStream os = ds.getOutputStream();
try {
// We presume that the content is utf-8!! Thus it has to have been run through the TikaExtractor, or equivalent.
//
// We're going to be paging through the input stream by chunks of characters. Each chunk will then be passed to the
// output stream (os) via a writer, as well as to the actual code that invokes the nlp sentence extraction.
// We need an output writer that converts the input into characters.
//
Writer w = new OutputStreamWriter(os, StandardCharsets.UTF_8);
try {
Reader r = new InputStreamReader(document.getBinaryStream(), StandardCharsets.UTF_8);
try {
// Now, page through!
// It's too bad we have to convert FROM utf-8 and then back TO utf-8, but that can't be helped.
char[] characterBuffer = new char[65536];
while (true) {
int amt = r.read(characterBuffer);
if (amt == -1) {
break;
}
// Write into the copy buffer
w.write(characterBuffer,0,amt);
// Also do the processing
ma.acceptCharacters(characterBuffer,amt);
}
// Do not close the reader; the underlying stream will be closed by our caller when the RepositoryDocument is done with it
} catch (IOException e) {
// These are errors from reading the RepositoryDocument input stream; we handle them accordingly.
resultCode = e.getClass().getSimpleName().toUpperCase(Locale.ROOT);
description = e.getMessage();
throw e;
}
} finally {
w.flush();
}
}
finally
{
os.close();
length = Long.valueOf(ds.getBinaryLength());
}
}
finally
{
// Log the extraction processing
activities.recordActivity(Long.valueOf(startTime), ACTIVITY_EXTRACT, length, documentURI,
resultCode, description);
}
ma.done();
// Parsing complete!
// Create a copy of Repository Document
RepositoryDocument docCopy = document.duplicate();
// Get new stream length
long newBinaryLength = ds.getBinaryLength();
// Open new input stream
InputStream is = ds.getInputStream();
try
{
docCopy.setBinary(is,newBinaryLength);
// add named entity meta-data
Map<String,Set<String>> nerMap = ma.getMetadata();
if (!nerMap.isEmpty()) {
for (Entry<String, Set<String>> entry : nerMap.entrySet()) {
Set<String> neList = entry.getValue();
String[] neArray = neList.toArray(new String[0]);
docCopy.addField(entry.getKey(), neArray);
}
}
// Send new document downstream
return activities.sendDocument(documentURI,docCopy);
} finally {
is.close();
}
} finally {
ds.close();
}
}
private static final Set<String> acceptableMimeTypes = new HashSet<String>();
static
{
acceptableMimeTypes.add("text/plain;charset=utf-8");
acceptableMimeTypes.add("text/plain;charset=ascii");
acceptableMimeTypes.add("text/plain;charset=us-ascii");
acceptableMimeTypes.add("text/plain");
}
/** Detect if a mime type is acceptable or not. This method is used to determine whether it makes sense to fetch a document
* in the first place.
*@param pipelineDescription is the document's pipeline version string, for this connection.
*@param mimeType is the mime type of the document.
*@param checkActivity is an object including the activities that can be performed by this method.
*@return true if the mime type can be accepted by this connector.
*/
@Override
public boolean checkMimeTypeIndexable(VersionContext pipelineDescription, String mimeType, IOutputCheckActivity checkActivity)
throws ManifoldCFException, ServiceInterruption
{
if (mimeType == null || !acceptableMimeTypes.contains(mimeType.toLowerCase(Locale.ROOT))) {
return false;
}
// Do a downstream check too
return super.checkMimeTypeIndexable(pipelineDescription, mimeType, checkActivity);
}
// ////////////////////////
// UI Methods
// ////////////////////////
/**
* Obtain the name of the form check javascript method to call.
*
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @return the name of the form check javascript method.
*/
@Override
public String getFormCheckJavascriptMethodName(int connectionSequenceNumber) {
return "s" + connectionSequenceNumber + "_checkSpecification";
}
/**
* Obtain the name of the form presave check javascript method to call.
*
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @return the name of the form presave check javascript method.
*/
@Override
public String getFormPresaveCheckJavascriptMethodName(int connectionSequenceNumber) {
return "s" + connectionSequenceNumber + "_checkSpecificationForSave";
}
/**
* Output the specification header section. This method is called in the
* head section of a job page which has selected an output connection of the
* current type. Its purpose is to add the required tabs to the list, and to
* output any javascript methods that might be needed by the job editing
* HTML.
*
* @param out
* is the output to which any HTML should be sent.
* @param locale
* is the preferred locale of the output.
* @param os
* is the current output specification for this job.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param tabsArray
* is an array of tab names. Add to this array any tab names that
* are specific to the connector.
*/
@Override
public void outputSpecificationHeader(IHTTPOutput out, Locale locale, Specification os,
int connectionSequenceNumber, List<String> tabsArray) throws ManifoldCFException, IOException {
Map<String, Object> paramMap = new HashMap<String, Object>();
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
tabsArray.add(Messages.getString(locale, "OpenNlpExtractor.OpenNLPTabName"));
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_JS, paramMap);
}
/**
* Output the specification body section. This method is called in the body
* section of a job page which has selected an output connection of the
* current type. Its purpose is to present the required form elements for
* editing. The coder can presume that the HTML that is output from this
* configuration will be within appropriate &lt;html&gt;, &lt;body&gt;, and &lt;form&gt; tags.
* The name of the form is "editjob".
*
* @param out
* is the output to which any HTML should be sent.
* @param locale
* is the preferred locale of the output.
* @param os
* is the current output specification for this job.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param actualSequenceNumber
* is the connection within the job that has currently been
* selected.
* @param tabName
* is the current tab name.
*/
@Override
public void outputSpecificationBody(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber,
int actualSequenceNumber, String tabName) throws ManifoldCFException, IOException {
Map<String, Object> paramMap = new HashMap<String, Object>();
paramMap.put("TABNAME", tabName);
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
paramMap.put("SELECTEDNUM", Integer.toString(actualSequenceNumber));
fillInOpenNLPSpecificationMap(paramMap, os);
setUpOpenNLPSpecificationMap(paramMap);
Messages.outputResourceWithVelocity(out, locale, EDIT_SPECIFICATION_OPENNLP_HTML, paramMap);
}
/**
* Process a specification post. This method is called at the start of a job's
* edit or view page, whenever there is a possibility that form data for a
* connection has been posted. Its purpose is to gather form information and
* modify the output specification accordingly. The name of the posted form
* is "editjob".
*
* @param variableContext
* contains the post data, including binary file-upload
* information.
* @param locale
* is the preferred locale of the output.
* @param os
* is the current output specification for this job.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @return null if all is well, or a string error message if there is an
* error that should prevent saving of the job (and cause a
* redirection to an error page).
*/
@Override
public String processSpecificationPost(IPostParameters variableContext, Locale locale, Specification os,
int connectionSequenceNumber) throws ManifoldCFException {
String seqPrefix = "s" + connectionSequenceNumber + "_";
SpecificationNode node = new SpecificationNode(OpenNlpExtractorConfig.NODE_SMODEL_PATH);
String smodelPath = variableContext.getParameter(seqPrefix + "smodelpath");
if (smodelPath != null) {
node.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_VALUE, smodelPath);
} else {
node.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_VALUE, "");
}
os.addChild(os.getChildCount(), node);
node = new SpecificationNode(OpenNlpExtractorConfig.NODE_TMODEL_PATH);
String tmodelPath = variableContext.getParameter(seqPrefix + "tmodelpath");
if (tmodelPath != null) {
node.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_VALUE, tmodelPath);
} else {
node.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_VALUE, "");
}
os.addChild(os.getChildCount(), node);
String modelCount = variableContext.getParameter(seqPrefix+"model_count");
if (modelCount != null)
{
int count = Integer.parseInt(modelCount);
// Delete old finder-model nodes, so the posted form data completely replaces them
int i = 0;
while (i < os.getChildCount())
{
SpecificationNode cn = os.getChild(i);
if (cn.getType().equals(OpenNlpExtractorConfig.NODE_FINDERMODEL))
os.removeChild(i);
else
i++;
}
// Now, go through form data
for (int j = 0; j < count; j++)
{
String op = variableContext.getParameter(seqPrefix+"model_"+j+"_op");
if (op != null && op.equals("Delete"))
continue;
String paramName = variableContext.getParameter(seqPrefix+"model_"+j+"_parametername");
String modelFile = variableContext.getParameter(seqPrefix+"model_"+j+"_modelfile");
SpecificationNode sn = new SpecificationNode(OpenNlpExtractorConfig.NODE_FINDERMODEL);
sn.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_PARAMETERNAME,paramName);
sn.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_MODELFILE,modelFile);
os.addChild(os.getChildCount(),sn);
}
// Look for add operation
String addOp = variableContext.getParameter(seqPrefix+"model_op");
if (addOp != null && addOp.equals("Add"))
{
String paramName = variableContext.getParameter(seqPrefix+"model_parametername");
String modelFile = variableContext.getParameter(seqPrefix+"model_modelfile");
SpecificationNode sn = new SpecificationNode(OpenNlpExtractorConfig.NODE_FINDERMODEL);
sn.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_PARAMETERNAME,paramName);
sn.setAttribute(OpenNlpExtractorConfig.ATTRIBUTE_MODELFILE,modelFile);
os.addChild(os.getChildCount(),sn);
}
}
return null;
}
/**
* View specification. This method is called in the body section of a job's
* view page. Its purpose is to present the output specification information
* to the user. The coder can presume that the HTML that is output from this
* configuration will be within appropriate &lt;html&gt; and &lt;body&gt; tags.
*
* @param out
* is the output to which any HTML should be sent.
* @param locale
* is the preferred locale of the output.
* @param connectionSequenceNumber
* is the unique number of this connection within the job.
* @param os
* is the current output specification for this job.
*/
@Override
public void viewSpecification(IHTTPOutput out, Locale locale, Specification os, int connectionSequenceNumber)
throws ManifoldCFException, IOException {
Map<String, Object> paramMap = new HashMap<String, Object>();
paramMap.put("SEQNUM", Integer.toString(connectionSequenceNumber));
fillInOpenNLPSpecificationMap(paramMap, os);
Messages.outputResourceWithVelocity(out, locale, VIEW_SPECIFICATION_HTML, paramMap);
}
protected void setUpOpenNLPSpecificationMap(Map<String, Object> paramMap)
throws ManifoldCFException {
final String[] fileNames = getModelList();
paramMap.put("FILENAMES", fileNames);
}
protected static void fillInOpenNLPSpecificationMap(Map<String, Object> paramMap, Specification os) {
String sModelPath = "";
String tModelPath = "";
final List<Map<String,String>> finderModels = new ArrayList<>();
for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
if (sn.getType().equals(OpenNlpExtractorConfig.NODE_SMODEL_PATH)) {
sModelPath = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_VALUE);
if (sModelPath == null) {
sModelPath = "";
}
} else if (sn.getType().equals(OpenNlpExtractorConfig.NODE_TMODEL_PATH)) {
tModelPath = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_VALUE);
if (tModelPath == null) {
tModelPath = "";
}
} else if (sn.getType().equals(OpenNlpExtractorConfig.NODE_FINDERMODEL)) {
final String parameterName = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_PARAMETERNAME);
final String modelFile = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_MODELFILE);
final Map<String,String> modelRecord = new HashMap<>();
modelRecord.put("parametername", parameterName);
modelRecord.put("modelfile", modelFile);
finderModels.add(modelRecord);
}
}
paramMap.put("SMODELPATH", sModelPath);
paramMap.put("TMODELPATH", tModelPath);
paramMap.put("MODELS", finderModels);
}
protected static int handleIOException(IOException e)
throws ManifoldCFException
{
// IOException reading from our local storage...
if (e instanceof InterruptedIOException)
throw new ManifoldCFException(e.getMessage(),e,ManifoldCFException.INTERRUPTED);
throw new ManifoldCFException(e.getMessage(),e);
}
protected String[] getModelList() throws ManifoldCFException {
if (fileDirectory == null) {
return new String[0];
}
final String[] files = fileDirectory.list(new FileFilter());
if (files == null) {
// The resource directory is missing or unreadable
return new String[0];
}
// Sort the model file names for a stable presentation order
Arrays.sort(files);
return files;
}
protected static class FileFilter implements FilenameFilter {
@Override
public boolean accept(final File dir, final String name) {
return new File(dir, name).isFile();
}
}
/** An instance of this class receives document characters in chunks of up to 64K,
* and accumulates the named-entity metadata that this transformer passes downstream.
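*
* A sketch of the intended call sequence, as driven by
* addOrReplaceDocumentWithException (reader and chunk size are illustrative):
* <pre>{@code
* MetadataAccumulator ma = new MetadataAccumulator(sp, document.getBinaryLength());
* char[] chunk = new char[65536];
* int amt;
* while ((amt = reader.read(chunk)) != -1) {
*   ma.acceptCharacters(chunk, amt);
* }
* ma.done();
* Map<String, Set<String>> entities = ma.getMetadata();
* }</pre>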
*/
protected class MetadataAccumulator {
char[] characterBuffer = null;
int bufferPointer = 0;
final int bufferSize;
final SentenceDetector sentenceDetector;
final Tokenizer tokenizer;
final Map<String,NameFinderME> finders = new HashMap<>();
final Map<String,Set<String>> tokenLists = new HashMap<>();
public MetadataAccumulator(final SpecPacker sp,
final long bytesize)
throws ManifoldCFException {
try {
sentenceDetector = OpenNlpExtractorConfig.sentenceDetector(new File(fileDirectory,sp.getSModelPath()));
tokenizer = OpenNlpExtractorConfig.tokenizer(new File(fileDirectory,sp.getTModelPath()));
final Map<String,String> finderFiles = sp.getFinderModels();
for (String paramName : finderFiles.keySet()) {
finders.put(paramName, OpenNlpExtractorConfig.finder(new File(fileDirectory,finderFiles.get(paramName))));
}
} catch (IOException e) {
throw new ManifoldCFException(e.getMessage(), e);
}
if (bytesize > maximumExtractionCharacters) {
bufferSize = maximumExtractionCharacters;
} else {
bufferSize = (int)bytesize;
}
}
/** Accept a chunk of characters for analysis. Only the first bufferSize
* characters of the document are retained; anything beyond that is ignored.
*/
public void acceptCharacters(final char[] buffer, int amt) {
if (characterBuffer == null) {
characterBuffer = new char[bufferSize];
}
int copyAmt;
if (amt > bufferSize - bufferPointer) {
copyAmt = bufferSize - bufferPointer;
} else {
copyAmt = amt;
}
System.arraycopy(buffer, 0, characterBuffer, bufferPointer, copyAmt);
bufferPointer += copyAmt;
}
public void done() {
if (bufferPointer == 0 || characterBuffer == null) {
return;
}
// Make a string from the character array
final String textContent = new String(characterBuffer, 0, bufferPointer);
// Break into sentences, tokens, and then people, locations, and organizations
String[] sentences = sentenceDetector.sentDetect(textContent);
for (String sentence : sentences) {
String[] tokens = tokenizer.tokenize(sentence);
for (String parameterName : finders.keySet()) {
Set<String> stringSet = tokenLists.get(parameterName);
if (stringSet == null) {
stringSet = new HashSet<String>();
tokenLists.put(parameterName, stringSet);
}
Span[] spans = finders.get(parameterName).find(tokens);
stringSet.addAll(Arrays.asList(Span.spansToStrings(spans, tokens)));
}
}
}
public Map<String,Set<String>> getMetadata() {
return tokenLists;
}
}
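/** Spool-back storage for document content, allowing the stream to be
* replayed downstream after extraction. A sketch of the life cycle, as used
* by addOrReplaceDocumentWithException (stream handling elided):
* <pre>{@code
* DestinationStorage ds = (document.getBinaryLength() <= inMemoryMaximumFile)
*     ? new MemoryDestinationStorage((int)document.getBinaryLength())
*     : new FileDestinationStorage();
* try {
*   OutputStream os = ds.getOutputStream();
*   // ... write the document content, then close os ...
*   InputStream is = ds.getInputStream();
*   // ... send ds.getBinaryLength() bytes downstream, then close is ...
* } finally {
*   ds.close();
* }
* }</pre>
*/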
protected interface DestinationStorage {
/** Get the output stream to write to. Caller should explicitly close this stream when done writing.
*/
public OutputStream getOutputStream()
throws ManifoldCFException;
/** Get new binary length.
*/
public long getBinaryLength()
throws ManifoldCFException;
/** Get the input stream to read from. Caller should explicitly close this stream when done reading.
*/
public InputStream getInputStream()
throws ManifoldCFException;
/** Close the object and clean up everything.
* This should be called when the data is no longer needed.
*/
public void close()
throws ManifoldCFException;
}
protected static class FileDestinationStorage implements DestinationStorage {
protected final File outputFile;
protected final OutputStream outputStream;
public FileDestinationStorage()
throws ManifoldCFException
{
File outputFile;
OutputStream outputStream;
try
{
outputFile = File.createTempFile("mcftika","tmp");
outputStream = new FileOutputStream(outputFile);
}
catch (IOException e)
{
handleIOException(e);
outputFile = null;
outputStream = null;
}
this.outputFile = outputFile;
this.outputStream = outputStream;
}
@Override
public OutputStream getOutputStream()
throws ManifoldCFException
{
return outputStream;
}
/** Get new binary length.
*/
@Override
public long getBinaryLength()
throws ManifoldCFException
{
return outputFile.length();
}
/** Get the input stream to read from. Caller should explicitly close this stream when done reading.
*/
@Override
public InputStream getInputStream()
throws ManifoldCFException
{
try
{
return new FileInputStream(outputFile);
}
catch (IOException e)
{
handleIOException(e);
return null;
}
}
/** Close the object and clean up everything.
* This should be called when the data is no longer needed.
*/
@Override
public void close()
throws ManifoldCFException
{
outputFile.delete();
}
}
protected static class MemoryDestinationStorage implements DestinationStorage {
protected final ByteArrayOutputStream outputStream;
public MemoryDestinationStorage(int sizeHint)
{
outputStream = new ByteArrayOutputStream(sizeHint);
}
@Override
public OutputStream getOutputStream()
throws ManifoldCFException
{
return outputStream;
}
/** Get new binary length.
*/
@Override
public long getBinaryLength()
throws ManifoldCFException
{
return outputStream.size();
}
/** Get the input stream to read from. Caller should explicitly close this stream when done reading.
*/
@Override
public InputStream getInputStream()
throws ManifoldCFException
{
return new ByteArrayInputStream(outputStream.toByteArray());
}
/** Close the object and clean up everything.
* This should be called when the data is no longer needed.
*/
@Override
public void close()
throws ManifoldCFException
{
}
}
protected static class SpecPacker {
private final String sModelPath;
private final String tModelPath;
private final Map<String, String> models = new TreeMap<>();
public SpecPacker(Specification os) {
String sModelPath = null;
String tModelPath = null;
for (int i = 0; i < os.getChildCount(); i++) {
SpecificationNode sn = os.getChild(i);
if (sn.getType().equals(OpenNlpExtractorConfig.NODE_SMODEL_PATH)) {
sModelPath = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_VALUE);
}
if (sn.getType().equals(OpenNlpExtractorConfig.NODE_TMODEL_PATH)) {
tModelPath = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_VALUE);
}
if (sn.getType().equals(OpenNlpExtractorConfig.NODE_FINDERMODEL)) {
final String parameterName = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_PARAMETERNAME);
final String modelFile = sn.getAttributeValue(OpenNlpExtractorConfig.ATTRIBUTE_MODELFILE);
models.put(parameterName, modelFile);
}
}
this.sModelPath = sModelPath;
this.tModelPath = tModelPath;
}
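/** Pack the specification into a canonical version string: the sentence-model
* path, the token-model path, then the finder models. For hypothetical model
* files, the result looks like
* {@code en-sent.bin,en-token.bin[ner_people=en-ner-person.bin,]}.
*/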
public String toPackedString() {
StringBuilder sb = new StringBuilder();
// Pack the sentence- and token-model paths, then the finder models
if (sModelPath != null)
sb.append(sModelPath);
sb.append(",");
if (tModelPath != null)
sb.append(tModelPath);
sb.append("[");
for (String parameterName : models.keySet()) {
sb.append(parameterName).append("=").append(models.get(parameterName)).append(",");
}
sb.append("]");
return sb.toString();
}
public String getSModelPath() {
return sModelPath;
}
public String getTModelPath() {
return tModelPath;
}
public Map<String, String> getFinderModels() {
return models;
}
}
}