Merge pull request #22 from mac-op/enhancement/capability-tests Test cases for Aggregate CAS Multiplier, by @mac-op

commit: a710eb90b7eed3d17673bddb464cd29e6d654315 [log] [tgz]
author: Pablo Duboue <pablo.duboue@gmail.com> Wed Jul 03 23:42:27 2024 -0700
committer: GitHub <noreply@github.com> Wed Jul 03 23:42:27 2024 -0700
tree: 71fb33eea6efe8b2cbbfa43f8f3a3b5697efba94
parent: 1349ce51d0635c8341aa19a651fd25741450a3ee [diff]
parent: a1b69d4164e93ac12c409d173829a74108fbeb57 [diff]
diff --git a/src/test/Makefile.am b/src/test/Makefile.am
index 7a45dfa..c7bd673 100644
--- a/src/test/Makefile.am
+++ b/src/test/Makefile.am

@@ -26,7 +26,7 @@
 
 
 
-TST_LIB=libtoknz.la libdump.la libSofaStreamHandlerFile.la libSimpleTextSegmenter.la libTextSegmentConsumer.la libMeetingAnnotator.la
+TST_LIB=libtoknz.la libdump.la libSofaStreamHandlerFile.la libSimpleTextSegmenter.la libTextSegmentConsumer.la libMeetingAnnotator.la libDaveDetector.la libSimpleTextMerger.la
 
 TST_BIN=test_cas test_engine test_typepriority test_language  \
  test_iterators test_casserializer test_sofa test_primitivetypes test_xcasdeserialization test_xmideserialization
@@ -34,8 +34,10 @@
 test_tgt: $(TST_BIN) $(TST_LIB)
 
 clean:
-	rm -f $(TST_BIN) $(test_engine_OBJECTS) $(test_typepriority_OBJECTS) $(test_sofa_OBJECTS) $(test_language_OBJECTS) $(test_cas_OBJECTS)  $(test_iterators_OBJECTS) $(test_casserializer_OBJECTS) $(test_primitivetypes_OBJECTS) $(test_xcasdeserialization_OBJECTS) $(test_xmideserialization_OBJECTS)
-	rm -f $(TST_LIB) $(libtoknz_la_OBJECTS) $(libdump_la_OBJECTS) $(libSofaStreamHandlerFile_la_OBJECTS) $(libSimpleTextSegmenter_la_OBJECTS) $(libTextSegmentConsumer_la_OBJECTS) $(libMeetingAnnotator_la_OBJECTS)
+	rm -f $(TST_BIN) $(test_engine_OBJECTS) $(test_typepriority_OBJECTS) $(test_sofa_OBJECTS) $(test_language_OBJECTS) \
+		$(test_cas_OBJECTS)  $(test_iterators_OBJECTS) $(test_casserializer_OBJECTS) $(test_primitivetypes_OBJECTS) $(test_xcasdeserialization_OBJECTS) $(test_xmideserialization_OBJECTS)
+	rm -f $(TST_LIB) $(libtoknz_la_OBJECTS) $(libdump_la_OBJECTS) $(libSofaStreamHandlerFile_la_OBJECTS) $(libSimpleTextSegmenter_la_OBJECTS) \
+		$(libTextSegmentConsumer_la_OBJECTS) $(libMeetingAnnotator_la_OBJECTS) $(libDaveDetector_la_OBJECTS) $(libSimpleTextMerger_la_OBJECTS)
 
 AM_CPPFLAGS  = -Isrc
 AM_CPPFLAGS += -I../cas
@@ -122,3 +124,11 @@
 libMeetingAnnotator_la_SOURCES = src/MeetingAnnotator.cpp
 libMeetingAnnotator_la_LIBADD = $(LIBADD)
 libMeetingAnnotator_la_LDFLAGS = $(LT_FLAGS)
+
+libDaveDetector_la_SOURCES = src/DaveDetector.cpp
+libDaveDetector_la_LIBADD = $(LIBADD)
+libDaveDetector_la_LDFLAGS = $(LT_FLAGS)
+
+libSimpleTextMerger_la_SOURCES = src/SimpleTextMerger.cpp
+libSimpleTextMerger_la_LIBADD = $(LIBADD)
+libSimpleTextMerger_la_LDFLAGS = $(LT_FLAGS)
\ No newline at end of file

diff --git a/src/test/data/descriptors/AggregateCASMultiplier.xml b/src/test/data/descriptors/AggregateCASMultiplier.xml
new file mode 100644
index 0000000..14c190f
--- /dev/null
+++ b/src/test/data/descriptors/AggregateCASMultiplier.xml

@@ -0,0 +1,58 @@
+<?xml version="1.0" encoding="UTF-8"?>
+<!--
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ * 
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ -->
+
+<taeDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.cpp</frameworkImplementation>
+  <primitive>false</primitive>
+  <delegateAnalysisEngineSpecifiers>
+    <delegateAnalysisEngine key="Segmenter">
+      <import location="SimpleTextSegmenter.xml"/>
+    </delegateAnalysisEngine>
+    <delegateAnalysisEngine key="Dave">
+      <import location="DaveDetector.xml"/>
+    </delegateAnalysisEngine>
+  </delegateAnalysisEngineSpecifiers>
+	
+  <analysisEngineMetaData>
+    <name>Example Aggregate TAE</name>
+    <description>
+      For testing C++ aggregate engine's ability to handle CAS Multipliers.
+    </description>
+    <configurationParameters/>
+    <configurationParameterSettings/>
+    <flowConstraints>
+      <fixedFlow>
+        <node>Segmenter</node>
+        <node>Dave</node>
+      </fixedFlow>
+    </flowConstraints>
+    <capabilities>
+      <capability>
+        <inputs/>
+        <outputs/>
+      </capability>
+    </capabilities>
+	<operationalProperties>
+		<modifiesCas>false</modifiesCas>
+		<multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+		<outputsNewCASes>true</outputsNewCASes>
+	</operationalProperties>
+  </analysisEngineMetaData>
+</taeDescription>

diff --git a/src/test/data/descriptors/DaveDetector.xml b/src/test/data/descriptors/DaveDetector.xml
new file mode 100644
index 0000000..79a765f
--- /dev/null
+++ b/src/test/data/descriptors/DaveDetector.xml

@@ -0,0 +1,97 @@
+<?xml version="1.0" encoding="UTF-8" ?> 
+
+	<!--
+	 ***************************************************************
+	 * Licensed to the Apache Software Foundation (ASF) under one
+	 * or more contributor license agreements.  See the NOTICE file
+	 * distributed with this work for additional information
+	 * regarding copyright ownership.  The ASF licenses this file
+	 * to you under the Apache License, Version 2.0 (the
+	 * "License"); you may not use this file except in compliance
+	 * with the License.  You may obtain a copy of the License at
+         *
+	 *   http://www.apache.org/licenses/LICENSE-2.0
+	 * 
+	 * Unless required by applicable law or agreed to in writing,
+	 * software distributed under the License is distributed on an
+	 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+	 * KIND, either express or implied.  See the License for the
+	 * specific language governing permissions and limitations
+	 * under the License.
+	 ***************************************************************
+   -->
+
+<taeDescription xmlns="http://uima.apache.org/resourceSpecifier">
+  <frameworkImplementation>org.apache.uima.cpp</frameworkImplementation>
+  <primitive>true</primitive>
+<annotatorImplementationName>DaveDetector</annotatorImplementationName>
+
+
+<analysisEngineMetaData>
+  <name>Dave Detector</name>
+  <description>Detects Daves in text, and annotates them.</description>
+  <version>1.0</version>
+  <vendor>IBM</vendor>
+
+
+<!--
+  Configuration Parameter Definitions
+-->
+        <configurationParameters>
+            <configurationParameter>
+                <name>DaveString</name>
+                <description>simple string for finding a Dave</description>
+                <type>String</type>
+                <multiValued>false</multiValued>
+                <mandatory>true</mandatory>
+            </configurationParameter>
+        </configurationParameters>
+
+<!--
+        Values for the configuration parameters
+-->
+        <configurationParameterSettings>
+          <nameValuePair>
+            <name>DaveString</name>
+            <value>
+              <string>Dave</string>
+            </value>
+          </nameValuePair>
+        </configurationParameterSettings>
+ 
+
+<!--
+        TypeSystem Definition
+-->
+
+<typeSystemDescription>
+  <types>
+    <typeDescription>
+      <name>org.apache.uima.examples.David</name>
+      <description></description>
+      <supertypeName>uima.tcas.Annotation</supertypeName>
+      <features>
+      </features>
+    </typeDescription>
+  </types>
+</typeSystemDescription>
+
+
+<!--
+Capabilities: Inputs, Outputs, and Preconditions
+-->
+<capabilities>
+  <capability>
+    <inputs/>
+    <outputs>
+      <type allAnnotatorFeatures="true">org.apache.uima.examples.David</type>
+    </outputs> 
+    <languagesSupported>
+      <language>x-unspecified</language>
+    </languagesSupported>
+  </capability>
+</capabilities>
+
+</analysisEngineMetaData>
+</taeDescription>
+

diff --git a/src/test/data/descriptors/SegmentAnnotateMerge.xml b/src/test/data/descriptors/SegmentAnnotateMerge.xml
new file mode 100644
index 0000000..98ff94a
--- /dev/null
+++ b/src/test/data/descriptors/SegmentAnnotateMerge.xml

@@ -0,0 +1,114 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+
+<!--
+ ***************************************************************
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+     *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ ***************************************************************
+-->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+    <frameworkImplementation>org.apache.uima.cpp</frameworkImplementation>
+    <primitive>false</primitive>
+
+    <delegateAnalysisEngineSpecifiers>
+        <delegateAnalysisEngine key="Segmenter">
+            <import location="SimpleTextSegmenter.xml" />
+        </delegateAnalysisEngine>
+
+        <delegateAnalysisEngine key="Tokenizer">
+            <import location="tok.xml" />
+        </delegateAnalysisEngine>
+
+        <delegateAnalysisEngine key="Merger">
+            <import location="SimpleTextMerger.xml"/>
+        </delegateAnalysisEngine>
+    </delegateAnalysisEngineSpecifiers>
+
+    <analysisEngineMetaData>
+        <name>Aggregate CAS Multiplier example that segments, annotates and merges CASes</name>
+        <description>
+            Splits a document into pieces (by default sentences, see SimpleTextSegmenter.xml), annotates each
+            piece independently, and then merges them into one document while keeping only specific annotation types.
+        </description>
+
+        <configurationParameters>
+            <configurationParameter>
+                <name>AnnotationTypesToPreserve</name>
+                <description>Names of annotation types to keep in the merged CAS</description>
+                <type>String</type>
+                <multiValued>true</multiValued>
+                <mandatory>false</mandatory>
+                <overrides>
+                    <parameter>Merger/AnnotationTypesToCopy</parameter>
+                </overrides>
+            </configurationParameter>
+            <configurationParameter>
+                <name>CASOutputFreq</name>
+                <type>Integer</type>
+                <multiValued>false</multiValued>
+                <mandatory>false</mandatory>
+                <overrides>
+                    <parameter>Merger/OutputFrequency</parameter>
+                </overrides>
+            </configurationParameter>
+        </configurationParameters>
+
+        <configurationParameterSettings>
+            <nameValuePair>
+                <name>AnnotationTypesToPreserve</name>
+                <value>
+                    <array>
+                        <string>uima.tt.TokenAnnotation</string>
+                    </array>
+                </value>
+            </nameValuePair>
+            <nameValuePair>
+                <name>CASOutputFreq</name>
+                <value>
+                    <integer>2</integer>
+                </value>
+            </nameValuePair>
+        </configurationParameterSettings>
+
+        <flowConstraints>
+            <fixedFlow>
+                <node>Segmenter</node>
+                <node>Tokenizer</node>
+                <node>Merger</node>
+            </fixedFlow>
+        </flowConstraints>
+
+        <capabilities>
+            <capability>
+                <inputs />
+                <outputs>
+                    <type>uima.tt.TokenAnnotation</type>
+                </outputs>
+                <languagesSupported>
+                    <language>en</language>
+                </languagesSupported>
+            </capability>
+        </capabilities>
+
+        <operationalProperties>
+            <modifiesCas>true</modifiesCas>
+            <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+            <outputsNewCASes>true</outputsNewCASes>
+        </operationalProperties>
+    </analysisEngineMetaData>
+</analysisEngineDescription>

diff --git a/src/test/data/descriptors/SimpleTextMerger.xml b/src/test/data/descriptors/SimpleTextMerger.xml
new file mode 100644
index 0000000..a378a0d
--- /dev/null
+++ b/src/test/data/descriptors/SimpleTextMerger.xml

@@ -0,0 +1,101 @@
+<?xml version="1.0" encoding="UTF-8" ?>
+<!--
+***************************************************************
+* Licensed to the Apache Software Foundation (ASF) under one
+* or more contributor license agreements.  See the NOTICE file
+* distributed with this work for additional information
+* regarding copyright ownership.  The ASF licenses this file
+* to you under the Apache License, Version 2.0 (the
+* "License"); you may not use this file except in compliance
+* with the License.  You may obtain a copy of the License at
+*
+*   http://www.apache.org/licenses/LICENSE-2.0
+*
+* Unless required by applicable law or agreed to in writing,
+* software distributed under the License is distributed on an
+* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+* KIND, either express or implied.  See the License for the
+* specific language governing permissions and limitations
+* under the License.
+***************************************************************
+-->
+
+<analysisEngineDescription xmlns="http://uima.apache.org/resourceSpecifier">
+    <frameworkImplementation>org.apache.uima.cpp</frameworkImplementation>
+    <primitive>true</primitive>
+    <annotatorImplementationName>libSimpleTextMerger</annotatorImplementationName>
+
+    <analysisEngineMetaData>
+        <name>Simple Text Merger</name>
+        <description>Merges text documents into larger ones, producing a merged CAS.  Also copies
+            selected annotation types to the merged CAS.  The input CAS requires an instance of
+            the type uima.tt.SourceDocumentInformation (which is produced by SimpleTextSegmenter).
+            Input CASes will be merged until a CAS is encountered whose SourceDocumentInformation FS has its
+            lastSegment feature set to true.  At that point a merged CAS will be output which
+            includes the content up to and including that CAS.  If additional CASes are then
+            received, a second merged CAS will be built, including all content from that point until
+            the next CAS with lastSegment = true, and so on.</description>
+        <version>1.0</version>
+        <vendor>The Apache Software Foundation</vendor>
+
+        <configurationParameters>
+            <configurationParameter>
+                <name>AnnotationTypesToCopy</name>
+                <description>
+                    Names of annotation types to be copied from source CASes into the merged CAS.
+                    Type uima.tt.SourceDocumentInformation will specifically be ignored.
+                </description>
+                <type>String</type>
+                <multiValued>true</multiValued>
+                <mandatory>true</mandatory>
+            </configurationParameter>
+
+
+            <configurationParameter>
+                <name>OutputFrequency</name>
+                <description>
+                    How frequently this merger will output a new CAS. If no value or 0 is specified, it
+                    will only output a CAS at the end.
+                    For example: if there are a total of 5 inputs and OutputFrequency is set to 2, this merger will
+                    produce 3 CASes: (1 + 2), (3 + 4), (5)
+                </description>
+                <type>Integer</type>
+                <multiValued>false</multiValued>
+                <mandatory>false</mandatory>
+            </configurationParameter>
+        </configurationParameters>
+        <configurationParameterSettings>
+            <nameValuePair>
+                <name>AnnotationTypesToCopy</name>
+                <value>
+                    <array>
+                        <string>uima.tt.TokenAnnotation</string>
+                    </array>
+                </value>
+            </nameValuePair>
+        </configurationParameterSettings>
+
+        <typeSystemDescription>
+            <imports>
+                <import location="tt_typesystem.xml"/>
+            </imports>
+        </typeSystemDescription>
+
+        <capabilities>
+            <capability>
+                <inputs>
+                    <type>uima.tt.SourceDocumentInformation</type>
+                    <feature>uima.tt.SourceDocumentInformation:lastSegment</feature>
+                </inputs>
+                <outputs/>
+            </capability>
+        </capabilities>
+
+        <operationalProperties>
+            <modifiesCas>false</modifiesCas>
+            <multipleDeploymentAllowed>true</multipleDeploymentAllowed>
+            <outputsNewCASes>true</outputsNewCASes>
+        </operationalProperties>
+
+    </analysisEngineMetaData>
+</analysisEngineDescription>
\ No newline at end of file

diff --git a/src/test/data/descriptors/SimpleTextSegmenter.xml b/src/test/data/descriptors/SimpleTextSegmenter.xml
index 9499ef8..db5d7fe 100644
--- a/src/test/data/descriptors/SimpleTextSegmenter.xml
+++ b/src/test/data/descriptors/SimpleTextSegmenter.xml

@@ -1,4 +1,4 @@
-<?xml version="1.0" encoding="UTF-8" ?> 
+<?xml version="1.0" encoding="UTF-8" ?> 
 
    <!--
     ***************************************************************
@@ -30,7 +30,11 @@
  
 <analysisEngineMetaData>
 	<name>Simple Text Segmenter</name>
-	<description>Splits a text document into pieces. The point at which the text is split is determined by SegmentDelimiter configuration parameter which defaults to new line ('\n')</description>
+	<description>
+		Splits a text document into pieces. The point at which the text is split is determined by
+		SegmentDelimiter configuration parameter which defaults to new line ('\n').
+		The last segment in the document will have lastSegment set to true.
+	</description>
 	<version>1.0</version>
 	<vendor>IBM</vendor>
 	
@@ -43,6 +47,7 @@
 			<mandatory>false</mandatory>
 		</configurationParameter>
 	</configurationParameters>
+
 	<configurationParameterSettings>
 		<nameValuePair>
 			<name>SegmentDelimiter</name>
@@ -52,7 +57,11 @@
 		</nameValuePair>
 	</configurationParameterSettings>
 
-	<typeSystemDescription/>
+	<typeSystemDescription>
+		<imports>
+			<import location="tt_typesystem.xml"/>
+		</imports>
+	</typeSystemDescription>
 
 	<fsIndexCollection>
 	 <fsIndexes>
@@ -67,7 +76,10 @@
 		
 	<capabilities>
 		<capability>
-			<outputs/>
+			<outputs>
+				<type>uima.tt.SourceDocumentInformation</type>
+				<feature>uima.tt.SourceDocumentInformation:lastSegment</feature>
+			</outputs>
 		</capability>
 	</capabilities>
 	

diff --git a/src/test/data/descriptors/tt_typesystem.xml b/src/test/data/descriptors/tt_typesystem.xml
index 17bc1ae..8ced9ee 100644
--- a/src/test/data/descriptors/tt_typesystem.xml
+++ b/src/test/data/descriptors/tt_typesystem.xml

@@ -591,6 +591,33 @@
             <description></description>
             <supertypeName>uima.tt.VariantKind</supertypeName>
         </typeDescription>
+        <typeDescription>
+            <name>uima.tt.SourceDocumentInformation</name>
+            <description>Stores detailed information about the original source document from which the current CAS was initialized. All information (like size) refers to the source document and not to the document in the CAS which may be converted and filtered by a CAS Initializer. For example this information will be written to the Semantic Search index so that the original document contents can be retrieved by queries.</description>
+            <supertypeName>uima.tcas.Annotation</supertypeName>
+            <features>
+                <featureDescription>
+                    <name>uri</name>
+                    <description>URI of document. (For example, file:///MyDirectory/myFile.txt for a simple file or http://uima.apache.org for content from a web source.)</description>
+                    <rangeTypeName>uima.cas.String</rangeTypeName>
+                </featureDescription>
+                <featureDescription>
+                    <name>offsetInSource</name>
+                    <description>Byte offset of the start of document content within original source file or other input source. Only used if the CAS document was retrieved from a source where one physical source file contained several conceptual documents. Zero otherwise.</description>
+                    <rangeTypeName>uima.cas.Integer</rangeTypeName>
+                </featureDescription>
+                <featureDescription>
+                    <name>documentSize</name>
+                    <description>Size of original document in bytes before processing by CAS Initializer. Either absolute file size or size within file or other source.</description>
+                    <rangeTypeName>uima.cas.Integer</rangeTypeName>
+                </featureDescription>
+                <featureDescription>
+                    <name>lastSegment</name>
+                    <description>For a CAS that represents a segment of a larger source document, this flag indicates whether this CAS is the final segment of the source document.  This is useful for downstream components that want to take some action after having seen all of the segments of a particular source document. </description>
+                    <rangeTypeName>uima.cas.Boolean</rangeTypeName>
+                </featureDescription>
+            </features>
+        </typeDescription>
     </types>
 </typeSystemDescription>
 

diff --git a/src/test/src/DaveDetector.cpp b/src/test/src/DaveDetector.cpp
new file mode 100644
index 0000000..13a3f84
--- /dev/null
+++ b/src/test/src/DaveDetector.cpp

@@ -0,0 +1,124 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "uima/api.hpp"
+using namespace std;
+using namespace uima;
+
+/**
+ * Test annotator that creates an org.apache.uima.examples.David annotation
+ * for every occurrence of a configured search string in the document text.
+ * The search string is read from the "DaveString" configuration parameter.
+ */
+class DaveDetector : public Annotator {
+private:
+  /* Annotation type created for each match; resolved in typeSystemInit(). */
+  Type david;
+  /* String to search for; filled from the "DaveString" parameter in initialize(). */
+  icu::UnicodeString us_DaveString;
+
+public:
+
+  DaveDetector(void) {
+    cout << "DaveDetector: Constructor" << endl;
+  }
+
+  ~DaveDetector(void) {
+    cout << "DaveDetector: Destructor" << endl;
+  }
+
+  /**
+   * Reads the "DaveString" configuration parameter.
+   * @return UIMA_ERR_NONE on success, UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT
+   *         when the parameter is not defined or cannot be extracted.
+   */
+  TyErrorId initialize(AnnotatorContext & rclAnnotatorContext) {
+    cout << "DaveDetector: initialize()" << endl;
+
+
+    if (!rclAnnotatorContext.isParameterDefined("DaveString") ||
+        rclAnnotatorContext.extractValue("DaveString", us_DaveString) != UIMA_ERR_NONE) {
+      /* log the error condition */
+      rclAnnotatorContext.getLogger().logError("Required configuration parameter \"DaveString\" not found in component descriptor");
+      cout << "DaveDetector::initialize() - Error. See logfile." << endl;
+      return UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT;
+    }
+
+    /* log the configuration parameter setting */
+    rclAnnotatorContext.getLogger().logMessage("DaveString = '" + us_DaveString + "'");
+
+    cout << "DaveDetector::initialize() .. us_DaveString.getBuffer: "
+    << us_DaveString << endl;
+
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+
+  /**
+   * Looks up the org.apache.uima.examples.David type in the type system.
+   * @return UIMA_ERR_NONE on success, UIMA_ERR_RESMGR_INVALID_RESOURCE when
+   *         the type is not valid.
+   */
+  TyErrorId typeSystemInit(TypeSystem const & crTypeSystem) {
+    cout << "DaveDetector:: typeSystemInit()" << endl;
+    david  = crTypeSystem.getType("org.apache.uima.examples.David");
+    if (!david.isValid()) {
+      getAnnotatorContext().getLogger().logError("Error getting Type object for org.apache.uima.examples.David");
+      cout << "DaveDetector::typeSystemInit - Error. See logfile" << endl;
+      return (TyErrorId)UIMA_ERR_RESMGR_INVALID_RESOURCE;
+    }
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+
+  /** Nothing to release; only traces the call. */
+  TyErrorId destroy() {
+    cout << "DaveDetector: destroy()" << endl;
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+
+  /**
+   * Scans the document text of the given CAS and adds one David annotation
+   * to the index repository for every occurrence of the configured string.
+   * @param tcas                   CAS whose document text is scanned
+   * @param crResultSpecification  not used by this annotator
+   * @return always UIMA_ERR_NONE
+   */
+  TyErrorId process(CAS & tcas, ResultSpecification const & crResultSpecification) {
+    cout << "DaveDetector::process() begins" << endl;
+    FSIndexRepository & indexRep = tcas.getIndexRepository();
+
+    /* This is a shallow pointer object containing a reference to document text*/
+    UnicodeStringRef ulstrDoc = tcas.getDocumentText();
+    /* Conventional pointer to mark beginning of the buffer*/
+    const UChar * cpszDocTextBegin = ulstrDoc.getBuffer();
+    /* Pointer to the document text remaining to be scanned */
+    const UChar * remainingTextP = cpszDocTextBegin;
+    /* Pointer to the match string */
+    const UChar * DaveStringP = us_DaveString.getBuffer();
+    /* Get number of Unicode chars (UTF-16 code units) of a couple strings */
+    size_t uiDocLen = ulstrDoc.length();
+    size_t uiMatchLen = us_DaveString.length();
+    size_t remainingLen = uiDocLen;
+
+    cout << "DaveDetector::process() .. uiDocLen: " << uiDocLen << endl;
+    getAnnotatorContext().getLogger().logMessage("process called");
+
+    /* An empty search string would make u_strFindFirst() return the start of
+       the remaining text on every call, so the scan would never advance. */
+    if (uiMatchLen == 0) {
+      cout << "DaveDetector::process() ends" << endl;
+      return (TyErrorId)UIMA_ERR_NONE;
+    }
+
+    UChar * gotDaveP;
+    while ( nullptr !=
+            (gotDaveP = u_strFindFirst(remainingTextP, remainingLen, DaveStringP, uiMatchLen)) ) {
+      size_t uiExprBeginPos = gotDaveP - cpszDocTextBegin;
+      size_t uiExprEndPos = uiExprBeginPos + uiMatchLen;
+      /* The next search starts right after this match, so the remaining
+         length must be measured from the END of the match; measuring from
+         its beginning would let the search run past the end of the buffer. */
+      remainingLen = uiDocLen - uiExprEndPos;
+      remainingTextP = gotDaveP + uiMatchLen;
+
+      cout << "DaveDetector::process() .. Gotta Dave begin: " << uiExprBeginPos << "  end: "
+      << uiExprEndPos << "  remaining: " << remainingLen << endl;
+
+      AnnotationFS fsNewExp =
+        tcas.createAnnotation(david, uiExprBeginPos, uiExprEndPos);
+      indexRep.addFS(fsNewExp);
+    }
+
+    cout << "DaveDetector::process() ends" << endl;
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+};
+
+// This macro exports an entry point that is used to create the annotator.
+
+MAKE_AE(DaveDetector);

diff --git a/src/test/src/SimpleTextMerger.cpp b/src/test/src/SimpleTextMerger.cpp
new file mode 100644
index 0000000..caaa956
--- /dev/null
+++ b/src/test/src/SimpleTextMerger.cpp

@@ -0,0 +1,164 @@
+/** @name SimpleTextMerger.cpp
+
+* Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+-------------------------------------------------------------------------- */
+
+
+#include "uima/api.hpp"
+#include <iostream>
+
+using namespace uima;
+using namespace std;
+
+/**
+ * Test CAS multiplier that concatenates the document text of incoming CASes
+ * into one merged CAS and copies selected annotation types (as bare
+ * annotations, offsets shifted) into it.
+ *
+ * A merged CAS becomes available through hasNext()/next() when an input CAS
+ * carries a SourceDocumentInformation annotation with lastSegment == true,
+ * or, if the "OutputFrequency" parameter is non-zero, after every
+ * OutputFrequency-th input segment.
+ */
+class SimpleTextMerger : public Annotator {
+  /* Merged CAS currently being built; nullptr until the first segment arrives
+     and again after next() has handed it out. */
+  CAS *pCas{nullptr};
+  /* Annotator context captured in initialize(); used to request empty CASes. */
+  AnnotatorContext* pAnc{nullptr};
+
+  /* Text accumulated for the merged CAS under construction. */
+  icu::UnicodeString usMergedDoc{};
+  /* True once the merged CAS is complete and waiting to be fetched. */
+  bool readyToOutput{false};
+
+  /* Names of the annotation types to copy ("AnnotationTypesToCopy"). */
+  vector<icu::UnicodeString> usAnnotationT;
+  Type srcDocInfo;
+  Feature lastSegment;
+
+  /* "OutputFrequency" parameter; 0 means output only on the last segment. */
+  int outputFreq{0};
+  /* Number of segments merged into the CAS currently under construction. */
+  int numSegments{0};
+
+  /* Placeholder; currently performs no work. */
+  TyErrorId getConfigValues() {
+    return UIMA_ERR_NONE;
+  }
+
+public:
+
+  SimpleTextMerger() { cout << "SimpleTextMerger: constructor" << endl; }
+  ~SimpleTextMerger() { cout << "SimpleTextMerger: destructor" << endl; }
+
+  /** This merger builds one output CAS at a time. */
+  int getCasInstancesRequired() override {
+    return 1;
+  }
+
+  /**
+   * Reads "AnnotationTypesToCopy" (required) and "OutputFrequency" (optional).
+   * @throws Exception (UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT) when
+   *         AnnotationTypesToCopy is missing or cannot be extracted.
+   */
+  TyErrorId initialize(AnnotatorContext &rclAnnotatorContext) override {
+    cout << "SimpleTextMerger: initialize" << endl;
+
+    pAnc = &rclAnnotatorContext;
+    if (rclAnnotatorContext.isParameterDefined("AnnotationTypesToCopy") &&
+      rclAnnotatorContext.extractValue("AnnotationTypesToCopy", usAnnotationT) == UIMA_ERR_NONE) {
+
+      cout << "Copying types: " << endl;
+      for (auto const & type : usAnnotationT) {
+        cout << '\t' << type << endl;
+      }
+    } else {
+      pAnc->getLogger().logError("initialize: Cannot get AnnotationTypesToCopy");
+      UIMA_EXC_THROW_NEW(Exception,
+                   UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT,
+                   UIMA_MSG_ID_EXCON_CONFIG_VALUE_EXTRACT,
+                   ErrorMessage(UIMA_MSG_ID_LITERAL_STRING, "Invalid value for AnnotationTypesToCopy"),
+                   ErrorInfo::unrecoverable);
+    }
+    if (rclAnnotatorContext.isParameterDefined("OutputFrequency")) {
+      rclAnnotatorContext.extractValue("OutputFrequency", outputFreq);
+    }
+
+    return UIMA_ERR_NONE;
+  }
+
+  /**
+   * Resolves the SourceDocumentInformation type and its lastSegment feature.
+   * @return UIMA_ERR_NONE on success, UIMA_ERR_RESMGR_INVALID_RESOURCE when
+   *         either of them is missing from the type system.
+   */
+  TyErrorId typeSystemInit(TypeSystem const & crTypeSystem) override {
+    srcDocInfo = crTypeSystem.getType("uima.tt.SourceDocumentInformation");
+    if (!srcDocInfo.isValid()) {
+      getAnnotatorContext().getLogger().logError("Error getting Type object for uima.tt.SourceDocumentInformation");
+      return (TyErrorId)UIMA_ERR_RESMGR_INVALID_RESOURCE;
+    }
+    lastSegment = srcDocInfo.getFeatureByBaseName("lastSegment");
+    if (!lastSegment.isValid()) {
+      getAnnotatorContext().getLogger().logError("Error getting Feature object for uima.tt.SourceDocumentInformation:lastSegment");
+      return (TyErrorId)UIMA_ERR_RESMGR_INVALID_RESOURCE;
+    }
+    return UIMA_ERR_NONE;
+  }
+
+  /**
+   * Appends the text of the incoming CAS to the merged document and copies
+   * the configured annotation types into the merged CAS. Marks the merged CAS
+   * ready for output when this is the last segment or when the configured
+   * output frequency has been reached.
+   * @throws Exception (UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS) when the
+   *         incoming CAS has no SourceDocumentInformation annotation.
+   */
+  TyErrorId process(CAS &cas, ResultSpecification const &crResultSpecification) override {
+    if (!pCas)
+      pCas = &pAnc->getEmptyCAS();
+
+    /* Offset by which annotations of this segment are shifted in the merged text. */
+    int initialLen = usMergedDoc.length();
+    UnicodeStringRef docText = cas.getDocumentText();
+    usMergedDoc.append(docText.getBuffer(), docText.length());
+
+    FSIndexRepository& indexRep = pCas->getIndexRepository();
+
+    for (const auto & copiedType : usAnnotationT) {
+      Type type = pCas->getTypeSystem().getType(copiedType);
+      /* SourceDocumentInformation is never copied; a fresh one is created below. */
+      if (type == srcDocInfo)
+        continue;
+
+      ANIterator iter = cas.getAnnotationIndex(type).iterator();
+
+      while (iter.isValid()) {
+        AnnotationFS anFS = iter.get();
+        // !! This does not copy all Features of the AnnotationFS, unlike Java's CasCopier
+        // C++ does not have an equivalent AFAIK
+        // So we just make a bare Annotation
+        AnnotationFS copy = pCas->createAnnotation(type,
+          anFS.getBeginPosition() + initialLen, anFS.getEndPosition() + initialLen);
+        indexRep.addFS(copy);
+        iter.moveToNext();
+      }
+    }
+
+    ANIterator it = cas.getAnnotationIndex(srcDocInfo).iterator();
+    if (!it.isValid()) {
+      UIMA_EXC_THROW_NEW(Exception,
+                   UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS,
+                   UIMA_MSG_ID_EXC_INVALID_ITERATOR,
+                   ErrorMessage(UIMA_MSG_ID_LITERAL_STRING, "Invalid Iterator: No SourceDocumentInformation found"),
+                   ErrorInfo::unrecoverable);
+    }
+
+    AnnotationFS info = it.get();
+    /* Count every segment, including the last one, so that the frequency
+       check always refers to the merged CAS currently under construction. */
+    ++numSegments;
+    const bool isLastSegment = info.getBooleanValue(lastSegment);
+    const bool frequencyReached = (outputFreq != 0) && (numSegments % outputFreq == 0);
+    if (isLastSegment || frequencyReached)
+    {
+      pCas->setDocumentText(usMergedDoc);
+      AnnotationFS sdi = pCas->createAnnotation(srcDocInfo, 0, usMergedDoc.length());
+      sdi.setBooleanValue(lastSegment, true);
+      indexRep.addFS(sdi);
+
+      /* Start the next merged CAS from a clean state; without resetting the
+         counter the frequency would leak across documents. */
+      usMergedDoc = icu::UnicodeString();
+      numSegments = 0;
+      readyToOutput = true;
+    }
+    return UIMA_ERR_NONE;
+  }
+
+  /** @return true when a completed merged CAS is waiting to be fetched. */
+  bool hasNext() override {
+    return readyToOutput;
+  }
+
+  /**
+   * Hands out the completed merged CAS and clears the pending state.
+   * @throws Exception (UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS) when no
+   *         merged CAS is ready.
+   */
+  CAS &next() override {
+    if (!readyToOutput) {
+      UIMA_EXC_THROW_NEW(Exception,
+                         UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS,
+                         UIMA_MSG_ID_EXCON_PROCESSING_CAS,
+                         ErrorMessage(UIMA_MSG_ID_LITERAL_STRING, "There is not next() available."),
+                         ErrorInfo::unrecoverable);
+    }
+
+    CAS &ret = *pCas;
+    pCas = nullptr;
+    readyToOutput = false;
+    return ret;
+  }
+};
+
+// This macro exports an entry point that is used to create the annotator.
+
+MAKE_AE(SimpleTextMerger);

diff --git a/src/test/src/SimpleTextSegmenter.cpp b/src/test/src/SimpleTextSegmenter.cpp
deleted file mode 120000
index b4fd677..0000000
--- a/src/test/src/SimpleTextSegmenter.cpp
+++ /dev/null

@@ -1 +0,0 @@
-../../../examples/src/SimpleTextSegmenter.cpp
\ No newline at end of file

diff --git a/src/test/src/SimpleTextSegmenter.cpp b/src/test/src/SimpleTextSegmenter.cpp
new file mode 100644
index 0000000..7985004
--- /dev/null
+++ b/src/test/src/SimpleTextSegmenter.cpp

@@ -0,0 +1,208 @@
+/** SimpleTextSegmenter.cpp
+
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+
+
+Example CAS Multiplier
+*/
+#include <stdio.h>
+#include "uima/api.hpp"
+using namespace uima;
+using namespace std;
+
+/**
+ * Example CAS Multiplier: splits the document text of the input CAS into
+ * segments that end at a configurable delimiter and emits one new CAS per
+ * segment. Every output CAS gets a SourceDocumentInformation annotation whose
+ * begin/end are the segment's offsets in the original document; the
+ * lastSegment feature is set on the final segment.
+ */
+class  SimpleTextSegmenter : public Annotator {
+private:
+
+  /* View of the input CAS's document text; assigned in process(). */
+  UnicodeStringRef docTextUS;
+
+  size_t threshhold;     /* not referenced anywhere in this class */
+  size_t docLen;         /* length of the input document (UTF-16 code units) */
+  size_t start;          /* reset to 0 in process(); not read otherwise */
+  size_t delimLen;       /* length of the delimiter (UTF-16 code units) */
+  size_t remainingLen;   /* code units of the document not yet emitted */
+
+  icu::UnicodeString delimUS;     /* segment delimiter, "\n" unless configured */
+  const UChar * delimP;           /* buffer of delimUS */
+  const UChar * docTextBeginP;    /* first code unit of the input document */
+  const UChar * remainingTextP;   /* first code unit not yet emitted, or NULL */
+  bool hasMore;                   /* result of the latest availability check */
+
+  AnnotatorContext * pAnc;
+
+  Type srcDocInfo;
+  Feature lastSegment;
+
+  /* We have a separate function getConfigValues()
+     for initialize() and reconfigure()
+  */
+  TyErrorId getConfigValues() {
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+
+  /* True while part of the current document has not been emitted yet.
+     Shared by hasNext() and next() so that next() never relies on a stale
+     hasMore flag. */
+  bool segmentAvailable() const {
+    return remainingLen >= 1 && remainingTextP != NULL;
+  }
+
+
+public:
+
+  /* Start from a well-defined "nothing to segment" state so that hasNext()
+     and next() behave predictably even before the first process() call. */
+  SimpleTextSegmenter(void)
+    : threshhold(0), docLen(0), start(0), delimLen(0), remainingLen(0),
+      delimP(NULL), docTextBeginP(NULL), remainingTextP(NULL),
+      hasMore(false), pAnc(NULL) {
+    //   cout << "SimpleTextSegmenter: Constructor" << endl;
+  }
+
+  ~SimpleTextSegmenter(void) {
+    //   cout << "SimpleTextSegmenter: Destructor" << endl;
+  }
+
+  /** Remember the annotator context and read the optional
+   *  "SegmentDelimiter" parameter (default "\n").
+   *  Throws an unrecoverable Exception if the delimiter is empty. */
+  TyErrorId initialize(AnnotatorContext & rclAnnotatorContext) {
+    cout << "SimpleTextSegmenter: initialize()" << endl;
+
+    pAnc = &rclAnnotatorContext;
+
+    /* default delimiter */
+    delimUS = "\n";
+
+    /* read in configuration parameter setting */
+    icu::UnicodeString param("SegmentDelimiter");
+    if (rclAnnotatorContext.isParameterDefined(param) ) {
+      rclAnnotatorContext.extractValue(param, delimUS);
+    }
+
+    delimP = delimUS.getBuffer();
+    delimLen = delimUS.length();
+    if (delimLen < 1 ) {
+      pAnc->getLogger().logError("initialize() Invalid delimiter specified. Must be at least one character in length");
+      UIMA_EXC_THROW_NEW(Exception,
+                         UIMA_ERR_USER_ANNOTATOR_COULD_NOT_INIT,
+                         UIMA_MSG_ID_EXCON_CONFIG_VALUE_EXTRACT,
+                         ErrorMessage(UIMA_MSG_ID_LITERAL_STRING, "Invalid value for SegmentDelimiter"),
+                         ErrorInfo::unrecoverable);
+    }
+
+    cout << "initialize() using segment delimiter " << delimUS << " with length " << delimLen << endl;
+
+    pAnc->getLogger().logMessage("initialize() Using Segment Delimiter '" + delimUS + "'");
+
+    return UIMA_ERR_NONE;
+  }
+
+  /** Start segmenting a new input CAS: looks up the custom "TestIndex",
+   *  then resets the cursor to the beginning of the document text. */
+  TyErrorId process(CAS & cas, ResultSpecification const & crResultSpecification) {
+    cout  << "SimpleTextSegmenter: process()" << endl;
+
+    cout << endl << "Test custom index..............." << endl;
+    FSIndex featureIndex = cas.getIndexRepository().getIndex("TestIndex");
+    cout << "featureIndex.isValid() = " << featureIndex.isValid() << endl;
+    cout << "Test custom index ok..............." << endl << endl;
+
+    // Get the text document
+    docTextUS = cas.getDocumentText();
+    /* Get number of Unicode chars (UTF-16 code units)  */
+    docLen = docTextUS.length();
+    start = 0;
+    remainingTextP = docTextUS.getBuffer();
+    remainingLen=docLen;
+    docTextBeginP = docTextUS.getBuffer();
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+
+
+  /** Returns true while there is still text left to emit as a segment. */
+  bool  hasNext() {
+    cout << "SimpleTextSegmenter:hasNext() " << remainingLen << endl;
+
+    hasMore = segmentAvailable();
+    return hasMore;
+  }
+
+  /** Emit the next segment in a fresh CAS obtained from the context.
+   *  Throws an unrecoverable Exception when no text is left. */
+  CAS &  next() {
+
+    cout << "SimpleTextSegmenter: next()" << endl;
+
+    // Re-evaluate the real state instead of trusting the flag cached by the
+    // last hasNext() call: a repeated next() must not hand out an empty CAS.
+    hasMore = segmentAvailable();
+    if (!hasMore) {
+      UIMA_EXC_THROW_NEW(Exception,
+                         UIMA_ERR_USER_ANNOTATOR_COULD_NOT_PROCESS,
+                         UIMA_MSG_ID_EXCON_PROCESSING_CAS,
+                         ErrorMessage(UIMA_MSG_ID_LITERAL_STRING, "There is not next() available."),
+                         ErrorInfo::unrecoverable);
+    }
+
+    const UChar * segEndP =  u_strFindFirst(remainingTextP, remainingLen, delimP, delimLen);
+
+    //get a CAS from pool.
+    CAS & cas = pAnc->getEmptyCAS();
+
+    if (segEndP != NULL) {
+      // Delimiter found: the segment runs up to and including the delimiter.
+      size_t segLen = segEndP - remainingTextP;
+
+      // Offsets are relative to the original (unsegmented) document.
+      AnnotationFS srcFS = cas.createAnnotation(srcDocInfo,
+        remainingTextP - docTextBeginP, segEndP + delimLen - docTextBeginP);
+
+      UnicodeStringRef segStr(remainingTextP, segLen+delimLen);
+      remainingTextP = segEndP + delimLen;
+      remainingLen = remainingLen-(segLen+delimLen);
+      cas.setDocumentText(segStr.getBuffer(), segStr.length());
+
+      if (remainingLen == 0){
+        srcFS.setBooleanValue(lastSegment, true);
+      }
+      cas.getIndexRepository().addFS(srcFS);
+
+    } else {
+      // No further delimiter: everything that is left forms the last segment.
+      // (The guard above guarantees remainingLen > 0 here.)
+      AnnotationFS srcFS = cas.createAnnotation(srcDocInfo, remainingTextP - docTextBeginP, docLen);
+      srcFS.setBooleanValue(lastSegment, true);
+      cas.getIndexRepository().addFS(srcFS);
+
+      UnicodeStringRef segStr(remainingTextP, remainingLen);
+      cas.setDocumentText(segStr.getBuffer(), segStr.length());
+
+      remainingLen=0;
+      remainingTextP = NULL;
+    }
+
+    return cas;
+  }
+
+  /** Number of output CASes this annotator holds at the same time. */
+  int  getCasInstancesRequired() {
+    return 1;
+  }
+
+  /** Look up the SourceDocumentInformation type and its lastSegment feature. */
+  TyErrorId typeSystemInit(TypeSystem const & crTypeSystem) {
+    cout << " typeSystemInit()" << endl;
+    srcDocInfo = crTypeSystem.getType("uima.tt.SourceDocumentInformation");
+    lastSegment = srcDocInfo.getFeatureByBaseName("lastSegment");
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+
+  /** Nothing to release; only logs the call. */
+  TyErrorId destroy() {
+    cout << "SimpleTextSegmenter: destroy()" << endl;
+    return (TyErrorId)UIMA_ERR_NONE;
+  }
+};
+
+// This macro exports an entry point that is used to create the annotator.
+
+MAKE_AE(SimpleTextSegmenter);

diff --git a/src/test/src/test_engine.cpp b/src/test/src/test_engine.cpp
index 49f3831..e7f5b1c 100644
--- a/src/test/src/test_engine.cpp
+++ b/src/test/src/test_engine.cpp

@@ -519,7 +519,107 @@
 }
 
 
+/* For now, aggregate engines do not handle CAS Multipliers correctly.
+   This test will fail if run.
+   TODO: Implement CAS Multiplier for Aggregate
+ */
+void testAggregateCASMultiplier(const util::ConsoleUI &rclConsole)
+{
+  rclConsole.info("Test Aggregate CAS Multiplier start.");
+
+  // Resolve the aggregate descriptor and build the engine from it.
+  const icu::UnicodeString descriptorName("AggregateCASMultiplier.xml");
+  const icu::UnicodeString resolvedPath = ResourceManager::resolveFilename(descriptorName, descriptorName);
+  ErrorInfo err;
+  auto pEngine = TextAnalysisEngine::createTextAnalysisEngine(UnicodeStringRef(resolvedPath).asUTF8().c_str(), err);
 
+  failIfNotTrue(err.getErrorId() == UIMA_ERR_NONE);
+  failIfNotTrue(pEngine != nullptr);
+
+  //test operational properties settings
+  failIfNotTrue(pEngine->getAnalysisEngineMetaData().getOperationalProperties()->getOutputsNewCASes() == true);
+  failIfNotTrue(pEngine->getAnalysisEngineMetaData().getOperationalProperties()->getModifiesCas() == false);
+  failIfNotTrue(pEngine->getAnalysisEngineMetaData().getOperationalProperties()->isMultipleDeploymentAllowed() == true);
+
+  // Input document: three sentences, each mentioning "Dave" once.
+  auto *const pInputCas = pEngine->newCAS();
+  pInputCas->setDocumentText(icu::UnicodeString(
+    "This is a sentence with Dave. This is the second sentence with Dave. This is the third Dave sentence."));
+  Type daveType = pInputCas->getTypeSystem().getType("org.apache.uima.examples.David");
+
+  CASIterator iter = pEngine->processAndOutputNewCASes(*pInputCas);
+  failIfNotTrue(iter.hasNext());
+
+  // Walk over every output CAS, counting them as we go.
+  int numSegments = 0;
+  for (; iter.hasNext(); ++numSegments) {
+    CAS &segmentCas = iter.next();
+    ANIndex anIndex = segmentCas.getAnnotationIndex(daveType);
+
+    // There should be one Dave in each segment
+    failIfNotTrue(anIndex.getSize() == 1);
+    pEngine->getAnnotatorContext().releaseCAS(segmentCas);
+  }
+
+  failIfNotTrue(numSegments == 3);
+  delete pInputCas;
+  delete pEngine;
+
+  rclConsole.info("Test Aggregate CAS Multiplier end.");
+}
+
+
+/*
+ * This test will also fail until aggregate engines handle CAS Multipliers
+ * correctly, so it is not run either.
+ */
+void testAggregateCASCombiner(const util::ConsoleUI &rclConsole)
+{
+  rclConsole.info("Test Aggregate CAS Combiner start.");
+  const icu::UnicodeString descriptor("SegmentAnnotateMerge.xml");
+  const icu::UnicodeString fileName = ResourceManager::resolveFilename(descriptor, descriptor);
+  ErrorInfo err;
+  auto pEngine = TextAnalysisEngine::createTextAnalysisEngine(UnicodeStringRef(fileName).asUTF8().c_str(), err);
+
+  failIfNotTrue(err.getErrorId() == UIMA_ERR_NONE);
+  failIfNotTrue(pEngine != nullptr);
+
+  //test operational properties settings
+  failIfNotTrue(pEngine->getAnalysisEngineMetaData().getOperationalProperties()->getOutputsNewCASes() == true);
+  failIfNotTrue(pEngine->getAnalysisEngineMetaData().getOperationalProperties()->getModifiesCas() == true);
+  failIfNotTrue(pEngine->getAnalysisEngineMetaData().getOperationalProperties()->isMultipleDeploymentAllowed() == true);
+
+  // Four segments go in; the test expects them back as two merged CASes.
+  auto const cas = pEngine->newCAS();
+  cas->setDocumentText(icu::UnicodeString(
+    "First segment. Second segment. Third segment. Fourth segment."));
+  Type token = cas->getTypeSystem().getType("uima.tt.TokenAnnotation");
+  Type srcDocInfo = cas->getTypeSystem().getType("uima.tt.SourceDocumentInformation");
+  Feature lastSegment = srcDocInfo.getFeatureByBaseName("lastSegment");
+
+  CASIterator iter = pEngine->processAndOutputNewCASes(*cas);
+  failIfNotTrue(iter.hasNext());
+  int numOutputs = 0;
+
+  while (iter.hasNext()) {
+    ++numOutputs;
+    CAS &rcas = iter.next();
+    ANIndex tokenIdx = rcas.getAnnotationIndex(token);
+    // Each segment has three tokens, including the delimiter (.), and every
+    // merged output combines two segments, hence six tokens per output CAS.
+    failIfNotTrue(tokenIdx.getSize() == 6);
+
+    // CAS should have a single SourceDocumentInformation whose lastSegment is true
+    ANIterator srcDocIt = rcas.getAnnotationIndex(srcDocInfo).iterator();
+    failIfNotTrue(srcDocIt.isValid());
+    AnnotationFS info = srcDocIt.get();
+    failIfNotTrue(info.getBooleanValue(lastSegment));
+    // "Single" means the iterator must be exhausted after one step.
+    srcDocIt.moveToNext();
+    failIfNotTrue(!srcDocIt.isValid());
+
+    pEngine->getAnnotatorContext().releaseCAS(rcas);
+  }
+
+  failIfNotTrue(numOutputs == 2);
+  delete cas;
+  delete pEngine;
+
+  rclConsole.info("Test Aggregate CAS Combiner end.");
+}
 
 
 void mainTest(uima::util::ConsoleUI & rclConsole,
@@ -536,6 +636,10 @@
     testCallingSequence3(rclConsole, cpszConfigFilename);
   }
   testCasMultiplier(rclConsole);
+#if 0
+  testAggregateCASMultiplier(rclConsole);
+  testAggregateCASCombiner(rclConsole);
+#endif
 }
 
 int main(int argc, char * argv[]) /*
commit	a710eb90b7eed3d17673bddb464cd29e6d654315	[log] [tgz]
author	Pablo Duboue <pablo.duboue@gmail.com>	Wed Jul 03 23:42:27 2024 -0700
committer	GitHub <noreply@github.com>	Wed Jul 03 23:42:27 2024 -0700
tree	71fb33eea6efe8b2cbbfa43f8f3a3b5697efba94
parent	1349ce51d0635c8341aa19a651fd25741450a3ee [diff]
parent	a1b69d4164e93ac12c409d173829a74108fbeb57 [diff]