/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.standard;
import static org.apache.nifi.processors.standard.ExtractText.ENABLE_NAMED_GROUPS;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Test;
public class TestExtractTextNamedGroups {
final String SAMPLE_STRING = "foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n";
@Test
public void testProcessor() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty("regex.result1", "(?s)(?<ALL>.*)");
testRunner.setProperty("regex.result2", "(?s).*(?<BAR1>bar1).*");
testRunner.setProperty("regex.result3", "(?s).*?(?<BAR1>bar\\d).*");
testRunner.setProperty("regex.result4", "(?s).*?(?:bar\\d).*?(?<BAR2>bar\\d).*?(?<BAR3>bar3).*");
testRunner.setProperty("regex.result5", "(?s).*(?<BAR3>bar\\d).*");
testRunner.setProperty("regex.result6", "(?s)^(?<ALL>.*)$");
testRunner.setProperty("regex.result7", "(?s)(?<MISS>XXX)");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
java.util.Map<String,String> attributes = out.getAttributes();
out.assertAttributeEquals("regex.result1.ALL", SAMPLE_STRING);
out.assertAttributeEquals("regex.result2.BAR1", "bar1");
out.assertAttributeEquals("regex.result3.BAR1", "bar1");
out.assertAttributeEquals("regex.result4.BAR2", "bar2");
out.assertAttributeEquals("regex.result4.BAR3", "bar3");
out.assertAttributeEquals("regex.result5.BAR3", "bar3");
out.assertAttributeEquals("regex.result6.ALL", SAMPLE_STRING);
out.assertAttributeEquals("regex.result7.MISS", null);
}
@Test
public void testWithUnmatchedOptionalCapturingGroup() {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty("regex", "abc(?<DEF>def)?(?<G>g)");
testRunner.enqueue("abcg");
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
out.assertAttributeNotExists("regex.DEF");
out.assertAttributeEquals("regex.G", "g");
testRunner.clearTransferState();
testRunner.enqueue("abcdefg");
testRunner.run();
final MockFlowFile out2 = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
out2.assertAttributeEquals("regex.DEF", "def");
out2.assertAttributeEquals("regex.G", "g");
}
@Test
public void testProcessorWithDotall() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.DOTALL, "true");
testRunner.setProperty("regex.result1", "(?<TOUT>.*)");
testRunner.setProperty("regex.result2", ".*(?<BAR1>bar1).*");
testRunner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*"); // reluctant gets first
testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<BAR2>bar\\d).*"); // reluctant w/ repeated pattern gets second
testRunner.setProperty("regex.result5", ".*(?<BAR3>bar\\d).*"); // greedy gets last
testRunner.setProperty("regex.result6", "^(?<TOUT>.*)$");
testRunner.setProperty("regex.result7", "^(?<NO>XXX)$");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
out.assertAttributeEquals("regex.result1.TOUT", SAMPLE_STRING);
out.assertAttributeEquals("regex.result2.BAR1", "bar1");
out.assertAttributeEquals("regex.result3.BAR1", "bar1");
out.assertAttributeEquals("regex.result4.BAR2", "bar2");
out.assertAttributeEquals("regex.result5.BAR3", "bar3");
out.assertAttributeEquals("regex.result6.TOUT", SAMPLE_STRING);
out.assertAttributeEquals("regex.result7.NO", null);
}
@Test
public void testProcessorWithMultiline() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.MULTILINE, "true");
testRunner.setProperty("regex.result1", "(?<ALL>.*)");
testRunner.setProperty("regex.result2", "(?<BAR1>bar1)");
testRunner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*");
testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<NULL>bar\\d).*");
testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<BAR2>bar\\d)");
testRunner.setProperty("regex.result5", ".*(?<BAR2>bar\\d).*");
testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<BAR3>bar\\d)");
testRunner.setProperty("regex.result6", "^(?<ALL>.*)$");
testRunner.setProperty("regex.result7", "^(?<NO>XXX)$");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
out.assertAttributeEquals("regex.result1.ALL", "foo"); // matches everything on the first line
out.assertAttributeEquals("regex.result2.BAR1", "bar1");
out.assertAttributeEquals("regex.result3.BAR1", "bar1");
out.assertAttributeEquals("regex.result4.NULL", null); // null because no line has two bar's
out.assertAttributeEquals("regex.result4b.BAR2", "bar2"); // included newlines in regex
out.assertAttributeEquals("regex.result5.BAR2", "bar1"); //still gets first because no lines with multiple bar's
out.assertAttributeEquals("regex.result5b.BAR3", "bar3"); // included newlines in regex
out.assertAttributeEquals("regex.result6.ALL", "foo"); // matches all of first line
out.assertAttributeEquals("regex.result7.NO", null); // no match
}
@Test
public void testProcessorWithMultilineAndDotall() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.MULTILINE, "true");
testRunner.setProperty(ExtractText.DOTALL, "true");
testRunner.setProperty("regex.result1", "(?<ALL>.*)");
testRunner.setProperty("regex.result2", "(?<BAR1>bar1)");
testRunner.setProperty("regex.result3", ".*?(?<BAR1>bar\\d).*");
testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<BAR2>bar\\d).*");
testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<BAR2>bar\\d)");
testRunner.setProperty("regex.result5", ".*(?<BAR3>bar\\d).*");
testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<BAR3>bar\\d)");
testRunner.setProperty("regex.result6", "^(?<ALL>.*)$");
testRunner.setProperty("regex.result7", "^(?<MISS>XXX)$");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
out.assertAttributeEquals("regex.result1.ALL", SAMPLE_STRING);
out.assertAttributeEquals("regex.result2.BAR1", "bar1");
out.assertAttributeEquals("regex.result3.BAR1", "bar1");
out.assertAttributeEquals("regex.result4.BAR2", "bar2");
out.assertAttributeEquals("regex.result4b.BAR2", "bar2");
out.assertAttributeEquals("regex.result5.BAR3", "bar3");
out.assertAttributeEquals("regex.result5b.BAR3", "bar3");
out.assertAttributeEquals("regex.result6.ALL", SAMPLE_STRING);
out.assertAttributeEquals("regex.result7.MISS", null);
}
@Test
public void testProcessorWithNoMatches() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.MULTILINE, "true");
testRunner.setProperty(ExtractText.DOTALL, "true");
testRunner.setProperty("regex.result2", "(?<NONE>bar1)");
testRunner.setProperty("regex.result3", ".*?(?<NONE>bar\\d).*");
testRunner.setProperty("regex.result4", ".*?(?:bar\\d).*?(?<NONE>bar\\d).*");
testRunner.setProperty("regex.result4b", "bar\\d\\r\\n(?<NONE>bar\\d)");
testRunner.setProperty("regex.result5", ".*(?<NONE>bar\\d).*");
testRunner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(?<NONE>bar\\d)");
testRunner.setProperty("regex.result7", "^(?<NONE>XXX)$");
testRunner.enqueue("YYY".getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_NO_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_NO_MATCH).get(0);
out.assertAttributeEquals("regex.result1.NONE", null);
out.assertAttributeEquals("regex.result2.NONE", null);
out.assertAttributeEquals("regex.result3.NONE", null);
out.assertAttributeEquals("regex.result4.NONE", null);
out.assertAttributeEquals("regex.result4b.NONE", null);
out.assertAttributeEquals("regex.result5.NONE", null);
out.assertAttributeEquals("regex.result5b.NONE", null);
out.assertAttributeEquals("regex.result6.NONE", null);
out.assertAttributeEquals("regex.result7.NONE", null);
}
@Test
public void testNoFlowFile() throws UnsupportedEncodingException {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 0);
}
@Test
public void testMatchOutsideBuffer() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.MAX_BUFFER_SIZE, "3 B");//only read the first 3 chars ("foo")
testRunner.setProperty("regex.result1", "(?<FOO>foo)");
testRunner.setProperty("regex.result2", "(?<WORLD>world)");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
out.assertAttributeEquals("regex.result1.FOO", "foo");
out.assertAttributeEquals("regex.result2.WORLD", null); // null because outsk
}
@Test
public void testIncludeZeroCaptureGroupProperty() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s)(?<ALL>.*)");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
// Ensure the zero capture group is in the resultant attributes
out.assertAttributeExists(attributeKey);
out.assertAttributeExists(attributeKey + ".ALL");
out.assertAttributeEquals(attributeKey, SAMPLE_STRING);
out.assertAttributeEquals(attributeKey + ".ALL", SAMPLE_STRING);
}
@Test
public void testFindAll() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s)(?<W>\\w+)");
testRunner.enqueue("This is my text".getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
// Ensure the zero capture group is in the resultant attributes
out.assertAttributeExists(attributeKey);
out.assertAttributeExists(attributeKey + ".W");
out.assertAttributeExists(attributeKey + ".W.1");
out.assertAttributeExists(attributeKey + ".W.2");
out.assertAttributeExists(attributeKey + ".W.3");
out.assertAttributeEquals(attributeKey, "This");
out.assertAttributeEquals(attributeKey + ".W", "This");
out.assertAttributeEquals(attributeKey + ".W.1", "is");
out.assertAttributeEquals(attributeKey + ".W.2", "my");
out.assertAttributeEquals(attributeKey + ".W.3", "text");
}
@Test
public void testFindAllPair() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?<LEFT>\\w+)=(?<RIGHT>\\d+)");
testRunner.enqueue("a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
// Ensure the zero capture group is in the resultant attributes
out.assertAttributeExists(attributeKey);
out.assertAttributeExists(attributeKey + ".LEFT");
out.assertAttributeExists(attributeKey + ".RIGHT");
out.assertAttributeExists(attributeKey + ".LEFT.1");
out.assertAttributeExists(attributeKey + ".RIGHT.1");
out.assertAttributeExists(attributeKey + ".LEFT.2");
out.assertAttributeExists(attributeKey + ".RIGHT.2");
out.assertAttributeNotExists(attributeKey + ".LEFT.3"); // Ensure there's no more attributes
out.assertAttributeNotExists(attributeKey + ".RIGHT.3"); // Ensure there's no more attributes
out.assertAttributeEquals(attributeKey , "a=1");
out.assertAttributeEquals(attributeKey + ".LEFT", "a");
out.assertAttributeEquals(attributeKey + ".RIGHT", "1");
out.assertAttributeEquals(attributeKey + ".LEFT.1", "b");
out.assertAttributeEquals(attributeKey + ".RIGHT.1", "10");
out.assertAttributeEquals(attributeKey + ".LEFT.2", "c");
out.assertAttributeEquals(attributeKey + ".RIGHT.2", "100");
}
@Test
public void testIgnoreZeroCaptureGroupProperty() throws Exception {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s)(?<ALL>.*)");
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
// Ensure the zero capture group is not in the resultant attributes
out.assertAttributeNotExists(attributeKey);
out.assertAttributeEquals(attributeKey + ".ALL", SAMPLE_STRING);
}
@Test
public void testShouldAllowNoCaptureGroups() throws Exception {
// Arrange
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s).*");
// Act
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
testRunner.run();
// Assert
testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
// There is no global capture group, but no named capture group either
// so attributeKey has the match
out.assertAttributeEquals(attributeKey , SAMPLE_STRING);
}
@Test(expected = AssertionError.class)
public void testShouldNotAllowNoCaptureGroupsIfZeroDisabled() throws Exception {
// Arrange
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
final String attributeKey = "regex.result";
testRunner.setProperty(attributeKey, "(?s).*");
// Act
testRunner.enqueue(SAMPLE_STRING.getBytes(StandardCharsets.UTF_8));
// Validation should fail because nothing will match
testRunner.run();
}
@Test(expected = AssertionError.class)
public void testInvalidIfGroupCountsDoNotMatch() {
final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
testRunner.setProperty(ENABLE_NAMED_GROUPS, "true");
testRunner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
final String attributeKey = "notValidOne";
testRunner.setProperty(attributeKey,"^(beginning)\\s(middle)\\s(?<END>end)$");
// Act
testRunner.enqueue("beginning middle end".getBytes(StandardCharsets.UTF_8));
// Validation should fail because number of groups does not match number of named groups
testRunner.run();
}
}