blob: 7f3d4b5f89f34f88ba0131d4eecf9576fd13cce1 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.examples.casMultiplier;
import java.util.HashSet;
import java.util.Iterator;
import java.util.Set;
import org.apache.uima.UimaContext;
import org.apache.uima.analysis_component.JCasMultiplier_ImplBase;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.AbstractCas;
import org.apache.uima.cas.FSIndex;
import org.apache.uima.cas.FSIterator;
import org.apache.uima.cas.FeatureStructure;
import org.apache.uima.cas.Type;
import org.apache.uima.examples.SourceDocumentInformation;
import org.apache.uima.jcas.JCas;
import org.apache.uima.jcas.tcas.Annotation;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.util.CasCopier;
/**
* An example CasMultiplier, which merges text documents into larger ones. It attempts to merge all
* of the segments that came from one original artifact. This is done by checking the "lastSegment"
* feature of the SourceDocumentInformation FeatureStructure, which is expected to be populated by
* the CollectionReader or CasMultiplier that produced the input CASes.
* <p>
* Limitations: if the lastSegment feature is never set to true by the component producing the input
* CASes, the merger will never produce any output. Also, this implementation relies on the CASes
* arriving in order, which could be a problem in a multithreaded framework implementation. The order
* requirement could be relieved by recording a segment number in the SourceDocumentInformation, but
* that would also make this example more complicated.
*/
public class SimpleTextMerger extends JCasMultiplier_ImplBase {
  /** Resource bundle name passed to every AnalysisEngineProcessException thrown by this class. */
  public static final String MESSAGE_DIGEST = "org.apache.uima.examples.casMultiplier.ExampleCasMultiplierMessages";

  /** Message key used when an input CAS carries no SourceDocumentInformation annotation. */
  public static final String MISSING_SOURCE_DOCUMENT_INFO = "missing_source_document_info";

  /** Message key used when next() is called while no merged CAS is ready. */
  public static final String NO_NEXT_CAS = "no_next_cas";

  /** Text of all segments received since the last merged CAS was completed. */
  private StringBuffer mDocBuf = new StringBuffer();

  /** CAS being filled; null until a segment arrives and again after next() has handed it out. */
  private JCas mMergedCas;

  /** True once a last segment has been processed and the merged CAS has not yet been returned. */
  private boolean mReadyToOutput = false;

  /** Names of the annotation types copied from each segment; never null. */
  private String[] mAnnotationTypesToCopy = new String[0];

  /**
   * Reads the "AnnotationTypesToCopy" configuration parameter. A missing (null) value is treated
   * as "copy no annotations" so that process() does not fail on an unconfigured component.
   *
   * @param aContext
   *          the context supplying the configuration parameter
   * @throws ResourceInitializationException
   *           if the superclass initialization fails
   * @see org.apache.uima.analysis_component.AnalysisComponent_ImplBase#initialize(org.apache.uima.UimaContext)
   */
  public void initialize(UimaContext aContext) throws ResourceInitializationException {
    super.initialize(aContext);
    String[] configured = (String[]) aContext.getConfigParameterValue("AnnotationTypesToCopy");
    mAnnotationTypesToCopy = (configured != null) ? configured : new String[0];
  }

  /**
   * Adds one input segment to the document being merged: appends its text, copies the configured
   * annotation types with their offsets shifted by the length of the text merged so far, and, if
   * the segment is flagged as the last one, completes the merged CAS so that hasNext() returns
   * true.
   *
   * @param aJCas
   *          the input segment
   * @throws AnalysisEngineProcessException
   *           if the segment has no SourceDocumentInformation annotation; in that case nothing
   *           from the segment is merged
   * @see JCasMultiplier_ImplBase#process(JCas)
   */
  public void process(JCas aJCas) throws AnalysisEngineProcessException {
    // Look up the SourceDocumentInformation FS first (it carries the source URI and the
    // last-segment flag) so that a CAS without it is rejected before any merge state changes.
    FSIterator sdiIter = aJCas.getAnnotationIndex(SourceDocumentInformation.type).iterator();
    if (!sdiIter.hasNext()) {
      throw new AnalysisEngineProcessException(MESSAGE_DIGEST, MISSING_SOURCE_DOCUMENT_INFO,
              new Object[0]);
    }
    SourceDocumentInformation sourceDocInfo = (SourceDocumentInformation) sdiIter.next();

    // procure a new CAS if we don't have one already
    if (mMergedCas == null) {
      mMergedCas = getEmptyJCas();
    }

    // Append document text. A null text must be skipped: StringBuffer.append((String) null)
    // would append the four characters "null" and corrupt both the text and later offsets.
    int prevDocLen = mDocBuf.length();
    String docText = aJCas.getDocumentText();
    if (docText != null) {
      mDocBuf.append(docText);
    }

    copyAnnotations(aJCas, prevDocLen);

    if (sourceDocInfo.getLastSegment()) {
      completeMergedCas(sourceDocInfo);
    }
  }

  /**
   * Copies every annotation of the configured types from the segment into the merged CAS, adding
   * aOffset to the begin and end of each copy.
   */
  private void copyAnnotations(JCas aJCas, int aOffset) {
    CasCopier copier = new CasCopier(aJCas.getCas(), mMergedCas.getCas());
    // Tracks source annotations already handled: one annotation can show up in the index of
    // more than one configured type (e.g. overlapping types) and must be copied/shifted once.
    Set copiedIndexedFs = new HashSet();
    for (int i = 0; i < mAnnotationTypesToCopy.length; i++) {
      Type type = mMergedCas.getTypeSystem().getType(mAnnotationTypesToCopy[i]);
      FSIndex index = aJCas.getCas().getAnnotationIndex(type);
      Iterator iter = index.iterator();
      while (iter.hasNext()) {
        FeatureStructure fs = (FeatureStructure) iter.next();
        // Set.add returns false when the element was already present
        if (copiedIndexedFs.add(fs)) {
          Annotation copyOfFs = (Annotation) copier.copyFs(fs);
          // update begin and end
          copyOfFs.setBegin(copyOfFs.getBegin() + aOffset);
          copyOfFs.setEnd(copyOfFs.getEnd() + aOffset);
          mMergedCas.addFsToIndexes(copyOfFs);
        }
      }
    }
  }

  /**
   * Finishes the merged CAS: sets its document text, indexes a SourceDocumentInformation that
   * carries the URI of the given last segment, resets the text buffer and marks the CAS as ready
   * for output.
   */
  private void completeMergedCas(SourceDocumentInformation aLastSegmentInfo) {
    // set the document text
    mMergedCas.setDocumentText(mDocBuf.toString());
    // add source document info to destination CAS
    SourceDocumentInformation destSDI = new SourceDocumentInformation(mMergedCas);
    destSDI.setUri(aLastSegmentInfo.getUri());
    destSDI.setOffsetInSource(0);
    destSDI.setLastSegment(true);
    destSDI.addToIndexes();
    // start a fresh buffer for the next document
    mDocBuf = new StringBuffer();
    mReadyToOutput = true;
  }

  /**
   * Reports whether a merged CAS is ready to be returned by next().
   *
   * @return true after a last segment has been processed and until next() is called
   * @see org.apache.uima.analysis_component.AnalysisComponent#hasNext()
   */
  public boolean hasNext() throws AnalysisEngineProcessException {
    return mReadyToOutput;
  }

  /**
   * Returns the completed merged CAS and resets this component so that the next call to
   * process() starts a new merged document.
   *
   * @return the merged CAS
   * @throws AnalysisEngineProcessException
   *           if no merged CAS is ready
   * @see org.apache.uima.analysis_component.AnalysisComponent#next()
   */
  public AbstractCas next() throws AnalysisEngineProcessException {
    if (!mReadyToOutput) {
      throw new AnalysisEngineProcessException(MESSAGE_DIGEST, NO_NEXT_CAS, new Object[0]);
    }
    JCas casToReturn = mMergedCas;
    mMergedCas = null;
    mReadyToOutput = false;
    return casToReturn;
  }
}