blob: 7a1696bd9e733b0ce1e2cc464a6e82e2f0b644e4 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.nifi.processors.standard;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertTrue;
import java.io.UnsupportedEncodingException;
import java.nio.charset.StandardCharsets;
import java.util.Set;
import java.util.regex.Pattern;
import org.apache.nifi.processor.Relationship;
import org.apache.nifi.util.MockFlowFile;
import org.apache.nifi.util.TestRunner;
import org.apache.nifi.util.TestRunners;
import org.junit.Test;
public class TestExtractText {
/**
 * CRLF-delimited sample content ("foo", "bar1", "bar2", "bar3", "hello", "world") that the tests in this
 * class enqueue as flow file content. Declared static because it is a constant shared by every test
 * instance rather than per-instance state.
 */
static final String SAMPLE_STRING = "foo\r\nbar1\r\nbar2\r\nbar3\r\nhello\r\nworld\r\n";
/**
 * Configures several dynamic regex properties that each carry an inline {@code (?s)} flag, runs the
 * processor over the multi-line sample, and checks the attribute captured for each property.
 */
@Test
public void testProcessor() throws Exception {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());

    // Whole-content and single-token expressions.
    runner.setProperty("regex.result1", "(?s)(.*)");
    runner.setProperty("regex.result2", "(?s).*(bar1).*");
    // A reluctant prefix is expected to stop at the first "barN".
    runner.setProperty("regex.result3", "(?s).*?(bar\\d).*");
    // Reluctant prefix plus a skipped non-capturing "barN": expected to capture the second one.
    runner.setProperty("regex.result4", "(?s).*?(?:bar\\d).*?(bar\\d).*?(bar3).*");
    // A greedy prefix is expected to leave only the last "barN" for the group.
    runner.setProperty("regex.result5", "(?s).*(bar\\d).*");
    runner.setProperty("regex.result6", "(?s)^(.*)$");
    // Expression that cannot match the sample at all.
    runner.setProperty("regex.result7", "(?s)(XXX)");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    matched.assertAttributeEquals("regex.result1", SAMPLE_STRING);
    matched.assertAttributeEquals("regex.result2", "bar1");
    matched.assertAttributeEquals("regex.result3", "bar1");

    // result4 has two capturing groups: the base attribute plus indexed .0, .1 and .2, but no .3.
    matched.assertAttributeEquals("regex.result4", "bar2");
    matched.assertAttributeEquals("regex.result4.0", SAMPLE_STRING);
    matched.assertAttributeEquals("regex.result4.1", "bar2");
    matched.assertAttributeEquals("regex.result4.2", "bar3");
    matched.assertAttributeNotExists("regex.result4.3");

    matched.assertAttributeEquals("regex.result5", "bar3");
    matched.assertAttributeEquals("regex.result6", SAMPLE_STRING);
    // The non-matching expression must not produce an attribute.
    matched.assertAttributeEquals("regex.result7", null);
}
/**
 * Verifies how an optional capturing group is reported: when the group does not participate in the
 * match no indexed attribute is written for it, and when it does participate its text is captured.
 */
@Test
public void testWithUnmatchedOptionalCapturingGroup() {
    final TestRunner testRunner = TestRunners.newTestRunner(new ExtractText());
    testRunner.setProperty("regex", "abc(def)?(g)");

    // First pass: group 1 "(def)?" is skipped, group 2 matches "g".
    testRunner.enqueue("abcg");
    testRunner.run();
    testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile out = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
    out.assertAttributeNotExists("regex.1");
    out.assertAttributeEquals("regex.2", "g");

    // Reset the recorded transfers so the second pass is evaluated in isolation.
    testRunner.clearTransferState();

    // Second pass: both groups participate in the match.
    testRunner.enqueue("abcdefg");
    testRunner.run();
    // Assert routing explicitly (as done for the first pass) rather than relying on get(0) to fail.
    testRunner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile out2 = testRunner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);
    out2.assertAttributeEquals("regex.1", "def");
    out2.assertAttributeEquals("regex.2", "g");
}
/**
 * Same scenario as the inline-flag test, but DOTALL is switched on through the processor property
 * instead of an embedded {@code (?s)} in each expression.
 */
@Test
public void testProcessorWithDotall() throws Exception {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.DOTALL, "true");

    runner.setProperty("regex.result1", "(.*)");
    runner.setProperty("regex.result2", ".*(bar1).*");
    // Reluctant prefix: expected to capture the first "barN".
    runner.setProperty("regex.result3", ".*?(bar\\d).*");
    // Reluctant prefix with one skipped "barN": expected to capture the second.
    runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(bar\\d).*");
    // Greedy prefix: expected to capture the last "barN".
    runner.setProperty("regex.result5", ".*(bar\\d).*");
    runner.setProperty("regex.result6", "^(.*)$");
    runner.setProperty("regex.result7", "^(XXX)$");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    matched.assertAttributeEquals("regex.result1", SAMPLE_STRING);
    matched.assertAttributeEquals("regex.result2", "bar1");
    matched.assertAttributeEquals("regex.result3", "bar1");
    matched.assertAttributeEquals("regex.result4", "bar2");
    matched.assertAttributeEquals("regex.result5", "bar3");
    matched.assertAttributeEquals("regex.result6", SAMPLE_STRING);
    // "XXX" never occurs in the sample, so no attribute is expected.
    matched.assertAttributeEquals("regex.result7", null);
}
/**
 * Runs the processor with only MULTILINE enabled, so {@code .} does not cross line terminators and
 * {@code ^}/{@code $} anchor at line boundaries; checks the per-line capture results.
 */
@Test
public void testProcessorWithMultiline() throws Exception {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.MULTILINE, "true");

    runner.setProperty("regex.result1", "(.*)");
    runner.setProperty("regex.result2", "(bar1)");
    runner.setProperty("regex.result3", ".*?(bar\\d).*");
    runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(bar\\d).*");
    // "b" variants spell out the line terminators so a match may span several lines.
    runner.setProperty("regex.result4b", "bar\\d\\r\\n(bar\\d)");
    runner.setProperty("regex.result5", ".*(bar\\d).*");
    runner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(bar\\d)");
    runner.setProperty("regex.result6", "^(.*)$");
    runner.setProperty("regex.result7", "^(XXX)$");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Only the first line is consumed by ".*".
    matched.assertAttributeEquals("regex.result1", "foo");
    matched.assertAttributeEquals("regex.result2", "bar1");
    matched.assertAttributeEquals("regex.result3", "bar1");
    // No single line contains two "barN" tokens, so nothing is captured.
    matched.assertAttributeEquals("regex.result4", null);
    // Explicit line terminators in the expression let it reach the next line.
    matched.assertAttributeEquals("regex.result4b", "bar2");
    // Still the first token: no line holds more than one "barN".
    matched.assertAttributeEquals("regex.result5", "bar1");
    // Explicit line terminators in the expression let it reach the last token.
    matched.assertAttributeEquals("regex.result5b", "bar3");
    // Anchored expression covers the whole first line.
    matched.assertAttributeEquals("regex.result6", "foo");
    // No line equals "XXX".
    matched.assertAttributeEquals("regex.result7", null);
}
/**
 * Runs the processor with both MULTILINE and DOTALL enabled and checks that the expressions capture
 * across line terminators again (whole sample, second and last "barN").
 */
@Test
public void testProcessorWithMultilineAndDotall() throws Exception {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.MULTILINE, "true");
    runner.setProperty(ExtractText.DOTALL, "true");

    runner.setProperty("regex.result1", "(.*)");
    runner.setProperty("regex.result2", "(bar1)");
    runner.setProperty("regex.result3", ".*?(bar\\d).*");
    runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(bar\\d).*");
    runner.setProperty("regex.result4b", "bar\\d\\r\\n(bar\\d)");
    runner.setProperty("regex.result5", ".*(bar\\d).*");
    runner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(bar\\d)");
    runner.setProperty("regex.result6", "^(.*)$");
    runner.setProperty("regex.result7", "^(XXX)$");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // With DOTALL active the unanchored and anchored ".*" both span the full sample.
    matched.assertAttributeEquals("regex.result1", SAMPLE_STRING);
    matched.assertAttributeEquals("regex.result2", "bar1");
    matched.assertAttributeEquals("regex.result3", "bar1");
    matched.assertAttributeEquals("regex.result4", "bar2");
    matched.assertAttributeEquals("regex.result4b", "bar2");
    matched.assertAttributeEquals("regex.result5", "bar3");
    matched.assertAttributeEquals("regex.result5b", "bar3");
    matched.assertAttributeEquals("regex.result6", SAMPLE_STRING);
    // Expression with no match leaves no attribute behind.
    matched.assertAttributeEquals("regex.result7", null);
}
/**
 * Feeds content ("YYY") that none of the configured expressions can match and checks that the flow
 * file is routed to the no-match relationship without any of the regex attributes.
 */
@Test
public void testProcessorWithNoMatches() throws Exception {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.MULTILINE, "true");
    runner.setProperty(ExtractText.DOTALL, "true");

    // Note: result1 and result6 are intentionally not configured in this test.
    runner.setProperty("regex.result2", "(bar1)");
    runner.setProperty("regex.result3", ".*?(bar\\d).*");
    runner.setProperty("regex.result4", ".*?(?:bar\\d).*?(bar\\d).*");
    runner.setProperty("regex.result4b", "bar\\d\\r\\n(bar\\d)");
    runner.setProperty("regex.result5", ".*(bar\\d).*");
    runner.setProperty("regex.result5b", "(?:bar\\d\\r?\\n)*(bar\\d)");
    runner.setProperty("regex.result7", "^(XXX)$");

    final byte[] content = "YYY".getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_NO_MATCH, 1);
    final MockFlowFile unmatched = runner.getFlowFilesForRelationship(ExtractText.REL_NO_MATCH).get(0);

    // None of the attributes, configured or not, may be present on the output.
    unmatched.assertAttributeEquals("regex.result1", null);
    unmatched.assertAttributeEquals("regex.result2", null);
    unmatched.assertAttributeEquals("regex.result3", null);
    unmatched.assertAttributeEquals("regex.result4", null);
    unmatched.assertAttributeEquals("regex.result4b", null);
    unmatched.assertAttributeEquals("regex.result5", null);
    unmatched.assertAttributeEquals("regex.result5b", null);
    unmatched.assertAttributeEquals("regex.result6", null);
    unmatched.assertAttributeEquals("regex.result7", null);
}
/**
 * Triggers the processor with an empty input queue and checks that nothing is transferred to the
 * match relationship.
 */
@Test
public void testNoFlowFile() throws UnsupportedEncodingException {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());

    // Nothing is enqueued before the run.
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 0);
}
/**
 * Limits the maximum buffer size to 3 bytes and checks that an expression matching inside that prefix
 * yields an attribute while one whose target lies further into the content does not.
 */
@Test
public void testMatchOutsideBuffer() throws Exception {
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());

    // Only the first 3 bytes ("foo") of the content are read for matching.
    runner.setProperty(ExtractText.MAX_BUFFER_SIZE, "3 B");
    runner.setProperty("regex.result1", "(foo)");
    runner.setProperty("regex.result2", "(world)");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    matched.assertAttributeEquals("regex.result1", "foo");
    // null because "world" lies outside the 3-byte buffer
    matched.assertAttributeEquals("regex.result2", null);
}
/**
 * Checks that {@code getCompileFlags} returns 0 when no flag property is set, the matching
 * {@link Pattern} constant when a single flag property is enabled, and the bitwise OR when two are
 * enabled together. A fresh runner is created around the same processor instance for every case.
 */
@Test
public void testGetCompileFlags() {
    final ExtractText processor = new ExtractText();

    // No flag property set
    TestRunner runner = TestRunners.newTestRunner(processor);
    assertEquals(0, processor.getCompileFlags(runner.getProcessContext()));

    // UNIX_LINES
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.UNIX_LINES, "true");
    assertEquals(Pattern.UNIX_LINES, processor.getCompileFlags(runner.getProcessContext()));

    // CASE_INSENSITIVE
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.CASE_INSENSITIVE, "true");
    assertEquals(Pattern.CASE_INSENSITIVE, processor.getCompileFlags(runner.getProcessContext()));

    // COMMENTS
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.COMMENTS, "true");
    assertEquals(Pattern.COMMENTS, processor.getCompileFlags(runner.getProcessContext()));

    // MULTILINE
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.MULTILINE, "true");
    assertEquals(Pattern.MULTILINE, processor.getCompileFlags(runner.getProcessContext()));

    // LITERAL
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.LITERAL, "true");
    assertEquals(Pattern.LITERAL, processor.getCompileFlags(runner.getProcessContext()));

    // DOTALL
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.DOTALL, "true");
    assertEquals(Pattern.DOTALL, processor.getCompileFlags(runner.getProcessContext()));

    // UNICODE_CASE
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.UNICODE_CASE, "true");
    assertEquals(Pattern.UNICODE_CASE, processor.getCompileFlags(runner.getProcessContext()));

    // CANON_EQ
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.CANON_EQ, "true");
    assertEquals(Pattern.CANON_EQ, processor.getCompileFlags(runner.getProcessContext()));

    // UNICODE_CHARACTER_CLASS
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.UNICODE_CHARACTER_CLASS, "true");
    assertEquals(Pattern.UNICODE_CHARACTER_CLASS, processor.getCompileFlags(runner.getProcessContext()));

    // DOTALL combined with MULTILINE
    runner = TestRunners.newTestRunner(processor);
    runner.setProperty(ExtractText.DOTALL, "true");
    runner.setProperty(ExtractText.MULTILINE, "true");
    assertEquals(Pattern.DOTALL | Pattern.MULTILINE, processor.getCompileFlags(runner.getProcessContext()));
}
/**
 * Runs the processor once and then checks that it exposes exactly two relationships: match and
 * no-match.
 */
@Test
public void testGetRelationShips() throws Exception {
    final ExtractText processor = new ExtractText();
    final TestRunner runner = TestRunners.newTestRunner(processor);

    final byte[] content = "foo".getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    final Set<Relationship> declared = processor.getRelationships();
    assertTrue(declared.contains(ExtractText.REL_MATCH));
    assertTrue(declared.contains(ExtractText.REL_NO_MATCH));
    // Nothing beyond the two relationships checked above.
    assertEquals(2, declared.size());
}
/**
 * With the capture-group-zero property left untouched, checks that the ".0" attribute is written
 * alongside the base attribute.
 */
@Test
public void testIncludeZeroCaptureGroupProperty() throws Exception {
    final String attributeKey = "regex.result";
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(attributeKey, "(?s)(.*)");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Group zero must be present in the resulting attributes.
    matched.assertAttributeExists(attributeKey + ".0");
    matched.assertAttributeEquals(attributeKey, SAMPLE_STRING);
}
/**
 * Enables the repeating-capture-group option and checks that every word of "This is my text" ends up
 * in its own indexed attribute, with index 0 and the base attribute both holding the first word.
 */
@Test
public void testFindAll() throws Exception {
    final String attributeKey = "regex.result";
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
    runner.setProperty(attributeKey, "(?s)(\\w+)");

    final byte[] content = "This is my text".getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Group zero and the four word captures must all be present.
    for (int index = 0; index <= 4; index++) {
        matched.assertAttributeExists(attributeKey + "." + index);
    }

    matched.assertAttributeEquals(attributeKey, "This");
    matched.assertAttributeEquals(attributeKey + ".0", "This");
    matched.assertAttributeEquals(attributeKey + ".1", "This");
    matched.assertAttributeEquals(attributeKey + ".2", "is");
    matched.assertAttributeEquals(attributeKey + ".3", "my");
    matched.assertAttributeEquals(attributeKey + ".4", "text");
}
/**
 * Enables the repeating-capture-group option with a two-group expression and checks that each
 * key/value pair of "a=1,b=10,c=100" is flattened into consecutive indexed attributes.
 */
@Test
public void testFindAllPair() throws Exception {
    final String attributeKey = "regex.result";
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.ENABLE_REPEATING_CAPTURE_GROUP, "true");
    runner.setProperty(attributeKey, "(\\w+)=(\\d+)");

    final byte[] content = "a=1,b=10,c=100".getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Group zero plus two captures for each of the three pairs: indexes 0 through 6.
    for (int index = 0; index <= 6; index++) {
        matched.assertAttributeExists(attributeKey + "." + index);
    }
    // No attribute may exist past the last capture.
    matched.assertAttributeNotExists(attributeKey + ".7");

    matched.assertAttributeEquals(attributeKey, "a");
    matched.assertAttributeEquals(attributeKey + ".0", "a=1");
    matched.assertAttributeEquals(attributeKey + ".1", "a");
    matched.assertAttributeEquals(attributeKey + ".2", "1");
    matched.assertAttributeEquals(attributeKey + ".3", "b");
    matched.assertAttributeEquals(attributeKey + ".4", "10");
    matched.assertAttributeEquals(attributeKey + ".5", "c");
    matched.assertAttributeEquals(attributeKey + ".6", "100");
}
/**
 * Disables the capture-group-zero property and checks that the ".0" attribute is omitted while the
 * base attribute is still written.
 */
@Test
public void testIgnoreZeroCaptureGroupProperty() throws Exception {
    final String attributeKey = "regex.result";
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
    runner.setProperty(attributeKey, "(?s)(.*)");

    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Group zero must be absent from the resulting attributes.
    matched.assertAttributeNotExists(attributeKey + ".0");
    matched.assertAttributeEquals(attributeKey, SAMPLE_STRING);
}
/**
 * Uses an expression without any explicit capturing group and checks that only the ".0" attribute is
 * produced, holding the full sample, while the base attribute is not written.
 */
@Test
public void testShouldAllowNoCaptureGroups() throws Exception {
    // Arrange
    final String attributeKey = "regex.result";
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(attributeKey, "(?s).*");

    // Act
    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);
    runner.run();

    // Assert
    runner.assertAllFlowFilesTransferred(ExtractText.REL_MATCH, 1);
    final MockFlowFile matched = runner.getFlowFilesForRelationship(ExtractText.REL_MATCH).get(0);

    // Without an explicit group only "<key>.0" is expected.
    matched.assertAttributeNotExists(attributeKey);
    matched.assertAttributeEquals(attributeKey + ".0", SAMPLE_STRING);
}
/**
 * Combines a group-less expression with capture group zero disabled and expects the run to end in an
 * {@link AssertionError}, since that configuration leaves nothing to capture.
 */
@Test(expected = AssertionError.class)
public void testShouldNotAllowNoCaptureGroupsIfZeroDisabled() throws Exception {
    // Arrange
    final String attributeKey = "regex.result";
    final TestRunner runner = TestRunners.newTestRunner(new ExtractText());
    runner.setProperty(ExtractText.INCLUDE_CAPTURE_GROUP_ZERO, "false");
    runner.setProperty(attributeKey, "(?s).*");

    // Act
    final byte[] content = SAMPLE_STRING.getBytes(StandardCharsets.UTF_8);
    runner.enqueue(content);

    // Validation should fail because nothing will match
    runner.run();
}
}