blob: 79fb4346efafb9e914efc770533d019bf0b578af [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.uima.fit.pipeline;
import static org.apache.uima.fit.factory.AnalysisEngineFactory.createEngine;

import java.io.IOException;
import java.util.ArrayList;
import java.util.Collection;
import java.util.Iterator;
import java.util.NoSuchElementException;

import org.apache.uima.analysis_engine.AnalysisEngine;
import org.apache.uima.analysis_engine.AnalysisEngineProcessException;
import org.apache.uima.cas.CASException;
import org.apache.uima.collection.CollectionException;
import org.apache.uima.collection.CollectionReader;
import org.apache.uima.fit.component.NoOpAnnotator;
import org.apache.uima.fit.internal.ResourceManagerFactory;
import org.apache.uima.fit.util.LifeCycleUtil;
import org.apache.uima.jcas.JCas;
import org.apache.uima.resource.Resource;
import org.apache.uima.resource.ResourceInitializationException;
import org.apache.uima.resource.ResourceManager;
import org.apache.uima.resource.metadata.MetaDataObject;
import org.apache.uima.resource.metadata.TypeSystemDescription;
import org.apache.uima.util.CasCreationUtils;
/**
 * An iterator over the documents of a collection. Each element returned by {@link #next()} is a
 * JCas containing a single document that has been loaded by the CollectionReader and processed by
 * the AnalysisEngines (if any).
 * <p>
 * The same JCas instance is reset and reused for every document, so a JCas obtained from
 * {@link #next()} is only valid until the following call to {@link #next()}.
 */
public class JCasIterator implements Iterator<JCas> {
  private final CollectionReader collectionReader;

  private final AnalysisEngine[] analysisEngines;

  /** The single JCas that is reset and refilled for every document. */
  private final JCas jCas;

  /** Resource manager used to create the JCas. */
  private final ResourceManager resMgr;

  /** Whether {@link #collectionProcessComplete()} is called automatically after the last CAS. */
  private boolean selfComplete = false;

  /** Whether {@link #destroy()} is called automatically after the last CAS or on error. */
  private boolean selfDestroy = false;

  /** Set once {@link #destroy()} has run; makes destruction idempotent. */
  private boolean destroyed = false;

  /** True if this iterator created {@link #resMgr} itself and therefore has to destroy it. */
  private boolean resourceManagerCreatedInternally = false;

  /**
   * Iterate over the documents loaded by the CollectionReader, running the AnalysisEngines on each
   * one before yielding them. By default, components get no lifecycle events, such as
   * collectionProcessComplete or destroy when this constructor is used. A new ResourceManager is
   * created for this iterator and is destroyed together with the components in {@link #destroy()}.
   *
   * @param aReader
   *          The CollectionReader for loading documents.
   * @param aEngines
   *          The AnalysisEngines for processing documents.
   * @throws ResourceInitializationException
   *           if a failure occurs during initialization of the components
   * @throws CASException
   *           if the JCas could not be initialized
   */
  public JCasIterator(final CollectionReader aReader, final AnalysisEngine... aEngines)
          throws CASException, ResourceInitializationException {
    this(ResourceManagerFactory.newResourceManager(), aReader, aEngines);
    // The flag can only be set after the delegated constructor has completed.
    resourceManagerCreatedInternally = true;
  }

  /**
   * Iterate over the documents loaded by the CollectionReader, running the AnalysisEngines on each
   * one before yielding them. By default, components get no lifecycle events, such as
   * collectionProcessComplete or destroy when this constructor is used. The given ResourceManager
   * is not destroyed by {@link #destroy()}.
   *
   * @param aResMgr
   *          The ResourceManager. Should be the one also used by the CollectionReader and
   *          AnalysisEngines.
   * @param aReader
   *          The CollectionReader for loading documents.
   * @param aEngines
   *          The AnalysisEngines for processing documents.
   * @throws ResourceInitializationException
   *           if a failure occurs during initialization of the components
   * @throws CASException
   *           if the JCas could not be initialized
   */
  public JCasIterator(final ResourceManager aResMgr, final CollectionReader aReader,
          final AnalysisEngine... aEngines) throws CASException, ResourceInitializationException {
    resMgr = aResMgr;
    collectionReader = aReader;
    analysisEngines = aEngines;

    // The CAS is created from the combined metadata of the reader and all engines.
    Collection<MetaDataObject> metaData = new ArrayList<MetaDataObject>();
    metaData.add(aReader.getProcessingResourceMetaData());
    for (AnalysisEngine ae : aEngines) {
      metaData.add(ae.getProcessingResourceMetaData());
    }

    jCas = CasCreationUtils.createCas(metaData, null, aResMgr).getJCas();
    collectionReader.typeSystemInit(jCas.getTypeSystem());
  }

  /**
   * Iterate over the documents loaded by the CollectionReader. (Uses a {@link NoOpAnnotator}
   * configured with the given type system description as the only analysis engine.) By default,
   * components get no lifecycle events, such as collectionProcessComplete or destroy when this
   * constructor is used.
   *
   * @param aReader
   *          The CollectionReader for loading documents.
   * @param aTypeSystemDescription
   *          a type system description
   * @throws ResourceInitializationException
   *           if a failure occurs during initialization of the components
   * @throws CASException
   *           if the JCas could not be initialized
   */
  public JCasIterator(final CollectionReader aReader,
          final TypeSystemDescription aTypeSystemDescription) throws CASException,
          ResourceInitializationException {
    this(aReader, createEngine(NoOpAnnotator.class, aTypeSystemDescription));
  }

  /**
   * Check whether the reader can produce another document. Always returns {@code false} once the
   * iterator has been destroyed. If the check fails and self-destroy is enabled, the components
   * are destroyed before the exception propagates.
   *
   * @return whether another document is available
   * @throws IllegalStateException
   *           if the reader fails with a {@link CollectionException} or {@link IOException}
   */
  @Override
  public boolean hasNext() {
    if (destroyed) {
      return false;
    }

    // Stays true unless the reader answered normally, so that the finally block also covers
    // unchecked exceptions.
    boolean error = true;
    try {
      boolean hasOneMore = collectionReader.hasNext();
      error = false;
      return hasOneMore;
    } catch (CollectionException e) {
      throw new IllegalStateException(e);
    } catch (IOException e) {
      throw new IllegalStateException(e);
    } finally {
      if (error && selfDestroy) {
        destroy();
      }
    }
  }

  /**
   * Load the next document into the shared JCas and run all analysis engines on it. If
   * self-complete or self-destroy is enabled and the reader has no further documents, the
   * corresponding lifecycle calls are made before the JCas is returned. If an error occurs and
   * self-destroy is enabled, the components are destroyed before the exception propagates.
   *
   * @return the shared JCas holding the processed document
   * @throws NoSuchElementException
   *           if the iterator has already been destroyed
   * @throws IllegalStateException
   *           if reading or processing fails with a {@link CollectionException},
   *           {@link IOException} or {@link AnalysisEngineProcessException}
   */
  @Override
  public JCas next() {
    // A destroyed iterator reports hasNext() == false, so next() must not touch the closed
    // components any more.
    if (destroyed) {
      throw new NoSuchElementException("The iterator has already been destroyed.");
    }

    jCas.reset();
    boolean error = true;
    try {
      collectionReader.getNext(jCas.getCas());
      for (AnalysisEngine engine : analysisEngines) {
        engine.process(jCas);
      }

      // Only call hasNext() if auto complete or destroy is enabled.
      if ((selfComplete || selfDestroy) && !hasNext()) {
        if (selfComplete) {
          collectionProcessComplete();
        }
        if (selfDestroy) {
          destroy();
        }
      }
      error = false;
    } catch (CollectionException e) {
      throw new IllegalStateException(e);
    } catch (IOException e) {
      throw new IllegalStateException(e);
    } catch (AnalysisEngineProcessException e) {
      throw new IllegalStateException(e);
    } finally {
      // destroy() is a no-op if the components have already been destroyed.
      if (error && selfDestroy) {
        destroy();
      }
    }
    return jCas;
  }

  /**
   * Not supported.
   *
   * @throws UnsupportedOperationException
   *           always
   */
  @Override
  public void remove() {
    throw new UnsupportedOperationException();
  }

  /**
   * Notify analysis engines that the collection process is complete.
   *
   * @throws AnalysisEngineProcessException
   *           if there was a problem completing the process
   */
  public void collectionProcessComplete() throws AnalysisEngineProcessException {
    LifeCycleUtil.collectionProcessComplete(analysisEngines);
  }

  /**
   * Close and destroy all components. The resource manager is destroyed as well if it was created
   * by this iterator. Calling this method more than once has no further effect.
   */
  public void destroy() {
    if (destroyed) {
      return;
    }

    LifeCycleUtil.close(collectionReader);
    LifeCycleUtil.destroy(collectionReader);
    LifeCycleUtil.destroy(analysisEngines);
    if (resourceManagerCreatedInternally) {
      LifeCycleUtil.destroy(resMgr);
    }
    destroyed = true;
  }

  /**
   * Get whether {@link #collectionProcessComplete()} is automatically called.
   *
   * @return whether {@link #collectionProcessComplete()} is automatically called.
   */
  public boolean isSelfComplete() {
    return selfComplete;
  }

  /**
   * Send a {@link #collectionProcessComplete()} call to analysis engines when the reader has no
   * further CASes to produce.
   *
   * @param aSelfComplete
   *          whether to enable the automatic call to {@link #collectionProcessComplete()}
   */
  public void setSelfComplete(boolean aSelfComplete) {
    selfComplete = aSelfComplete;
  }

  /**
   * Get whether {@link #destroy()} is automatically called.
   *
   * @return whether {@link #destroy()} is automatically called.
   */
  public boolean isSelfDestroy() {
    return selfDestroy;
  }

  /**
   * Call {@link #destroy()} automatically when the reader has no further CASes to produce or if
   * an error occurs. This closes and destroys the reader and the analysis engines.
   *
   * @param aSelfDestroy
   *          whether to enable the automatic call to {@link Resource#destroy()}
   */
  public void setSelfDestroy(boolean aSelfDestroy) {
    selfDestroy = aSelfDestroy;
  }
}