/* | |
* Licensed to the Apache Software Foundation (ASF) under one | |
* or more contributor license agreements. See the NOTICE file | |
* distributed with this work for additional information | |
* regarding copyright ownership. The ASF licenses this file | |
* to you under the Apache License, Version 2.0 (the | |
* "License"); you may not use this file except in compliance | |
* with the License. You may obtain a copy of the License at | |
* | |
* http://www.apache.org/licenses/LICENSE-2.0 | |
* | |
* Unless required by applicable law or agreed to in writing, | |
* software distributed under the License is distributed on an | |
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY | |
* KIND, either express or implied. See the License for the | |
* specific language governing permissions and limitations | |
* under the License. | |
*/ | |
package org.apache.uima.examples.casMultiplier; | |
import java.util.HashSet; | |
import java.util.Iterator; | |
import java.util.Set; | |
import org.apache.uima.UimaContext; | |
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase; | |
import org.apache.uima.analysis_engine.AnalysisEngineProcessException; | |
import org.apache.uima.cas.AbstractCas; | |
import org.apache.uima.cas.FSIndex; | |
import org.apache.uima.cas.FSIterator; | |
import org.apache.uima.cas.FeatureStructure; | |
import org.apache.uima.cas.Type; | |
import org.apache.uima.examples.SourceDocumentInformation; | |
import org.apache.uima.jcas.JCas; | |
import org.apache.uima.jcas.tcas.Annotation; | |
import org.apache.uima.resource.ResourceInitializationException; | |
import org.apache.uima.util.CasCopier; | |
/**
 * An example CasMultiplier, which merges text documents into larger ones. It attempts to merge all
 * of the segments that came from one original artifact. This is done by checking the "lastSegment"
 * feature of the SourceDocumentInformation FeatureStructure, which is expected to be populated by
 * the CollectionReader or CasMultiplier that produced the input CASes.
 * <p>
 * Limitations: if the lastSegment feature is never set to true by the component producing the input
 * CASes, the merger will never produce any output. Also, this implementation relies on the CASes
 * arriving in order, which could be a problem in a multithreaded framework implementation. The
 * ordering requirement could be relaxed by recording a segment number in the
 * SourceDocumentInformation, but that would also make this example more complicated.
 */
public class SimpleTextMerger extends JCasMultiplier_ImplBase { | |
public static final String MESSAGE_DIGEST = "org.apache.uima.examples.casMultiplier.ExampleCasMultiplierMessages"; | |
public static final String MISSING_SOURCE_DOCUMENT_INFO = "missing_source_document_info"; | |
public static final String NO_NEXT_CAS = "no_next_cas"; | |
private StringBuffer mDocBuf = new StringBuffer(); | |
private JCas mMergedCas; | |
private boolean mReadyToOutput = false; | |
private String[] mAnnotationTypesToCopy; | |
/* | |
* (non-Javadoc) | |
* | |
* @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext) | |
*/ | |
public void initialize(UimaContext aContext) throws ResourceInitializationException { | |
super.initialize(aContext); | |
mAnnotationTypesToCopy = (String[]) aContext.getConfigParameterValue("AnnotationTypesToCopy"); | |
} | |
/* | |
* (non-Javadoc) | |
* | |
* @see JCasMultiplier_ImplBase#process(JCas) | |
*/ | |
public void process(JCas aJCas) throws AnalysisEngineProcessException { | |
// procure a new CAS if we don't have one already | |
if (mMergedCas == null) { | |
mMergedCas = getEmptyJCas(); | |
} | |
// append document text | |
String docText = aJCas.getDocumentText(); | |
int prevDocLen = mDocBuf.length(); | |
mDocBuf.append(docText); | |
// copy specified annotation types | |
CasCopier copier = new CasCopier(aJCas.getCas(), mMergedCas.getCas()); | |
Set copiedIndexedFs = new HashSet(); // needed in case one annotation is in two indexes (could | |
// happen if specified annotation types overlap) | |
for (int i = 0; i < mAnnotationTypesToCopy.length; i++) { | |
Type type = mMergedCas.getTypeSystem().getType(mAnnotationTypesToCopy[i]); | |
FSIndex index = aJCas.getCas().getAnnotationIndex(type); | |
Iterator iter = index.iterator(); | |
while (iter.hasNext()) { | |
FeatureStructure fs = (FeatureStructure) iter.next(); | |
if (!copiedIndexedFs.contains(fs)) { | |
Annotation copyOfFs = (Annotation) copier.copyFs(fs); | |
// update begin and end | |
copyOfFs.setBegin(copyOfFs.getBegin() + prevDocLen); | |
copyOfFs.setEnd(copyOfFs.getEnd() + prevDocLen); | |
mMergedCas.addFsToIndexes(copyOfFs); | |
copiedIndexedFs.add(fs); | |
} | |
} | |
} | |
// get the SourceDocumentInformation FS, which indicates the sourceURI of the document | |
// and whether the incoming CAS is the last segment | |
FSIterator it = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator(); | |
if (!it.hasNext()) { | |
throw new AnalysisEngineProcessException(MESSAGE_DIGEST, MISSING_SOURCE_DOCUMENT_INFO, | |
new Object[0]); | |
} | |
SourceDocumentInformation sourceDocInfo = (SourceDocumentInformation) it.next(); | |
if (sourceDocInfo.getLastSegment()) { | |
// time to produce an output CAS | |
// set the document text | |
mMergedCas.setDocumentText(mDocBuf.toString()); | |
// add source document info to destination CAS | |
SourceDocumentInformation destSDI = new SourceDocumentInformation(mMergedCas); | |
destSDI.setUri(sourceDocInfo.getUri()); | |
destSDI.setOffsetInSource(0); | |
destSDI.setLastSegment(true); | |
destSDI.addToIndexes(); | |
mDocBuf = new StringBuffer(); | |
mReadyToOutput = true; | |
} | |
} | |
/* | |
* (non-Javadoc) | |
* | |
* @see org.apache.uima.analysis_component.AnalysisComponent#hasNext() | |
*/ | |
public boolean hasNext() throws AnalysisEngineProcessException { | |
return mReadyToOutput; | |
} | |
/* | |
* (non-Javadoc) | |
* | |
* @see org.apache.uima.analysis_component.AnalysisComponent#next() | |
*/ | |
public AbstractCas next() throws AnalysisEngineProcessException { | |
if (!mReadyToOutput) { | |
throw new AnalysisEngineProcessException(MESSAGE_DIGEST, NO_NEXT_CAS, new Object[0]); | |
} | |
JCas casToReturn = mMergedCas; | |
mMergedCas = null; | |
mReadyToOutput = false; | |
return casToReturn; | |
} | |
} |