lucene/core/src/test/org/apache/lucene/util/automaton/TestRegExp.java - lucene-solr - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.lucene.util.automaton;


 import org.apache.lucene.util.BytesRef;
 import org.apache.lucene.util.LuceneTestCase;

 import java.util.regex.Matcher;
 import java.util.regex.Pattern;

 public class TestRegExp extends LuceneTestCase {

   /**
    * Simple smoke test for regular expression.
    */
   public void testSmoke() {
     RegExp r = new RegExp("a(b+|c+)d");
     Automaton a = r.toAutomaton();
     assertTrue(a.isDeterministic());
     CharacterRunAutomaton run = new CharacterRunAutomaton(a);
     assertTrue(run.run("abbbbbd"));
     assertTrue(run.run("acd"));
     assertFalse(run.run("ad"));
   }

   /**
    * Compiles a regular expression that is prohibitively expensive to
    * determinize and expexts to catch an exception for it.
    */
   public void testDeterminizeTooManyStates() {
     // LUCENE-6046
     String source = "[ac]*a[ac]{50,200}";
     TooComplexToDeterminizeException expected = expectThrows(TooComplexToDeterminizeException.class, () -> {
       new RegExp(source).toAutomaton();
     });
     assertTrue(expected.getMessage().contains(source));
   }

   public void testSerializeTooManyStatesToRepeat() throws Exception {
     String source = "a{50001}";
     TooComplexToDeterminizeException expected = expectThrows(TooComplexToDeterminizeException.class, () -> {
       new RegExp(source).toAutomaton(50000);
     });
     assertTrue(expected.getMessage().contains(source));
   }

   // LUCENE-6713
   public void testSerializeTooManyStatesToDeterminizeExc() throws Exception {
     // LUCENE-6046
     String source = "[ac]*a[ac]{50,200}";
     TooComplexToDeterminizeException expected = expectThrows(TooComplexToDeterminizeException.class, () -> {
       new RegExp(source).toAutomaton();
     });
     assertTrue(expected.getMessage().contains(source));
   }

   // LUCENE-6046
   public void testRepeatWithEmptyString() throws Exception {
     Automaton a = new RegExp("[^y]*{1,2}").toAutomaton(1000);
     // paranoia:
     assertTrue(a.toString().length() > 0);
   }

   public void testRepeatWithEmptyLanguage() throws Exception {
     Automaton a = new RegExp("#*").toAutomaton(1000);
     // paranoia:
     assertTrue(a.toString().length() > 0);
     a = new RegExp("#+").toAutomaton(1000);
     assertTrue(a.toString().length() > 0);
     a = new RegExp("#{2,10}").toAutomaton(1000);
     assertTrue(a.toString().length() > 0);
     a = new RegExp("#?").toAutomaton(1000);
     assertTrue(a.toString().length() > 0);
   }

   boolean caseSensitiveQuery = true;

   public void testCoreJavaParity() {
     // Generate random doc values and random regular expressions
     // and check for same matching behaviour as Java's Pattern class.
     for (int i = 0; i < 1000; i++) {
       caseSensitiveQuery = true;
       checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
     }
   }

   static String randomDocValue(int minLength) {
     String charPalette = "AAAaaaBbbCccc123456 \t";
     StringBuilder sb = new StringBuilder();
     for (int i = 0; i < minLength; i++) {
       sb.append(charPalette.charAt(randomInt(charPalette.length() - 1)));
     }
     return sb.toString();
   }

   private static int randomInt(int bound) {
     return bound == 0 ? 0 : random().nextInt(bound);
   }

   protected String checkRandomExpression(String docValue) {
     // Generate and test a random regular expression which should match the given docValue
     StringBuilder result = new StringBuilder();
     // Pick a part of the string to change
     int substitutionPoint = randomInt(docValue.length() - 1);
     int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));

     // Add any head to the result, unchanged
     if (substitutionPoint > 0) {
       result.append(docValue.substring(0, substitutionPoint));
     }

     // Modify the middle...
     String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
     int mutation = random().nextInt(12);
     switch (mutation) {
       case 0:
         // OR with random alpha of same length
         result.append("(" + replacementPart + "|d" + randomDocValue(replacementPart.length()) + ")");
         break;
       case 1:
         // OR with non-existant value
         result.append("(" + replacementPart + "|doesnotexist)");
         break;
       case 2:
         // OR with another randomised regex (used to create nested levels of expression).
         result.append("(" + checkRandomExpression(replacementPart) + "|doesnotexist)");
         break;
       case 3:
         // Star-replace all ab sequences.
         result.append(replacementPart.replaceAll("ab", ".*"));
         break;
       case 4:
         // .-replace all b chars
         result.append(replacementPart.replaceAll("b", "."));
         break;
       case 5:
         // length-limited stars {1,2}
         result.append(".{1," + replacementPart.length() + "}");
         break;
       case 6:
         // replace all chars with .
         result.append(replacementPart.replaceAll(".", "."));
         break;
       case 7:
         // OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
         char[] chars = replacementPart.toCharArray();
         for (char c : chars) {
           result.append("[" + c + Character.toUpperCase(c) + "]");
         }
         break;
       case 8:
         // NOT a character - replace all b's with "not a"
         result.append(replacementPart.replaceAll("b", "[^a]"));
         break;
       case 9:
         // Make whole part repeatable 1 or more times
         result.append("(" + replacementPart + ")+");
         break;
       case 10:
         // Make whole part repeatable 0 or more times
         result.append("(" + replacementPart + ")?");
         break;
       case 11:
         // Switch case of characters
         StringBuilder switchedCase = new StringBuilder();
         replacementPart.codePoints().forEach(
             p -> {
               int switchedP = p;
               if (Character.isLowerCase(p)) {
                 switchedP = Character.toUpperCase(p);
               } else {
                 switchedP = Character.toLowerCase(p);
               }
               switchedCase.appendCodePoint(switchedP);
               if (p != switchedP) {
                 caseSensitiveQuery = false;
               }
             }
         );
         result.append(switchedCase.toString());
         break;
       default:
         break;
     }
     // add any remaining tail, unchanged
     if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
       result.append(docValue.substring(substitutionPoint + substitutionLength));
     }

     String regexPattern = result.toString();
     // Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
     Pattern pattern = caseSensitiveQuery ? Pattern.compile(regexPattern):
                                            Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE);
                                              ;
     Matcher matcher = pattern.matcher(docValue);
     assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());

     int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
     RegExp regex =  new RegExp(regexPattern, RegExp.ALL, matchFlags);
     Automaton automaton = regex.toAutomaton();
     ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
     BytesRef br = new BytesRef(docValue);
     assertTrue(
         "[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
             + docValue.length(),
         bytesMatcher.run(br.bytes, br.offset, br.length)
     );
     if (caseSensitiveQuery == false) {
       RegExp caseSensitiveRegex = new RegExp(regexPattern);
       Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
       ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
       assertFalse(
           "[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]",
           csBytesMatcher.run(br.bytes, br.offset, br.length)
       );

     }
     return regexPattern;
   }

 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.lucene.util.automaton;


	import org.apache.lucene.util.BytesRef;
	import org.apache.lucene.util.LuceneTestCase;

	import java.util.regex.Matcher;
	import java.util.regex.Pattern;

	public class TestRegExp extends LuceneTestCase {

	/**
	* Simple smoke test for regular expression.
	*/
	public void testSmoke() {
	RegExp r = new RegExp("a(b+\|c+)d");
	Automaton a = r.toAutomaton();
	assertTrue(a.isDeterministic());
	CharacterRunAutomaton run = new CharacterRunAutomaton(a);
	assertTrue(run.run("abbbbbd"));
	assertTrue(run.run("acd"));
	assertFalse(run.run("ad"));
	}

	/**
	* Compiles a regular expression that is prohibitively expensive to
	* determinize and expexts to catch an exception for it.
	*/
	public void testDeterminizeTooManyStates() {
	// LUCENE-6046
	String source = "[ac]*a[ac]{50,200}";
	TooComplexToDeterminizeException expected = expectThrows(TooComplexToDeterminizeException.class, () -> {
	new RegExp(source).toAutomaton();
	});
	assertTrue(expected.getMessage().contains(source));
	}

	public void testSerializeTooManyStatesToRepeat() throws Exception {
	String source = "a{50001}";
	TooComplexToDeterminizeException expected = expectThrows(TooComplexToDeterminizeException.class, () -> {
	new RegExp(source).toAutomaton(50000);
	});
	assertTrue(expected.getMessage().contains(source));
	}

	// LUCENE-6713
	public void testSerializeTooManyStatesToDeterminizeExc() throws Exception {
	// LUCENE-6046
	String source = "[ac]*a[ac]{50,200}";
	TooComplexToDeterminizeException expected = expectThrows(TooComplexToDeterminizeException.class, () -> {
	new RegExp(source).toAutomaton();
	});
	assertTrue(expected.getMessage().contains(source));
	}

	// LUCENE-6046
	public void testRepeatWithEmptyString() throws Exception {
	Automaton a = new RegExp("[^y]*{1,2}").toAutomaton(1000);
	// paranoia:
	assertTrue(a.toString().length() > 0);
	}

	public void testRepeatWithEmptyLanguage() throws Exception {
	Automaton a = new RegExp("#*").toAutomaton(1000);
	// paranoia:
	assertTrue(a.toString().length() > 0);
	a = new RegExp("#+").toAutomaton(1000);
	assertTrue(a.toString().length() > 0);
	a = new RegExp("#{2,10}").toAutomaton(1000);
	assertTrue(a.toString().length() > 0);
	a = new RegExp("#?").toAutomaton(1000);
	assertTrue(a.toString().length() > 0);
	}

	boolean caseSensitiveQuery = true;

	public void testCoreJavaParity() {
	// Generate random doc values and random regular expressions
	// and check for same matching behaviour as Java's Pattern class.
	for (int i = 0; i < 1000; i++) {
	caseSensitiveQuery = true;
	checkRandomExpression(randomDocValue(1 + random().nextInt(30)));
	}
	}

	static String randomDocValue(int minLength) {
	String charPalette = "AAAaaaBbbCccc123456 \t";
	StringBuilder sb = new StringBuilder();
	for (int i = 0; i < minLength; i++) {
	sb.append(charPalette.charAt(randomInt(charPalette.length() - 1)));
	}
	return sb.toString();
	}

	private static int randomInt(int bound) {
	return bound == 0 ? 0 : random().nextInt(bound);
	}

	protected String checkRandomExpression(String docValue) {
	// Generate and test a random regular expression which should match the given docValue
	StringBuilder result = new StringBuilder();
	// Pick a part of the string to change
	int substitutionPoint = randomInt(docValue.length() - 1);
	int substitutionLength = 1 + randomInt(Math.min(10, docValue.length() - substitutionPoint));

	// Add any head to the result, unchanged
	if (substitutionPoint > 0) {
	result.append(docValue.substring(0, substitutionPoint));
	}

	// Modify the middle...
	String replacementPart = docValue.substring(substitutionPoint, substitutionPoint + substitutionLength);
	int mutation = random().nextInt(12);
	switch (mutation) {
	case 0:
	// OR with random alpha of same length
	result.append("(" + replacementPart + "\|d" + randomDocValue(replacementPart.length()) + ")");
	break;
	case 1:
	// OR with non-existant value
	result.append("(" + replacementPart + "\|doesnotexist)");
	break;
	case 2:
	// OR with another randomised regex (used to create nested levels of expression).
	result.append("(" + checkRandomExpression(replacementPart) + "\|doesnotexist)");
	break;
	case 3:
	// Star-replace all ab sequences.
	result.append(replacementPart.replaceAll("ab", ".*"));
	break;
	case 4:
	// .-replace all b chars
	result.append(replacementPart.replaceAll("b", "."));
	break;
	case 5:
	// length-limited stars {1,2}
	result.append(".{1," + replacementPart.length() + "}");
	break;
	case 6:
	// replace all chars with .
	result.append(replacementPart.replaceAll(".", "."));
	break;
	case 7:
	// OR with uppercase chars eg [aA] (many of these sorts of expression in the wild..
	char[] chars = replacementPart.toCharArray();
	for (char c : chars) {
	result.append("[" + c + Character.toUpperCase(c) + "]");
	}
	break;
	case 8:
	// NOT a character - replace all b's with "not a"
	result.append(replacementPart.replaceAll("b", "[^a]"));
	break;
	case 9:
	// Make whole part repeatable 1 or more times
	result.append("(" + replacementPart + ")+");
	break;
	case 10:
	// Make whole part repeatable 0 or more times
	result.append("(" + replacementPart + ")?");
	break;
	case 11:
	// Switch case of characters
	StringBuilder switchedCase = new StringBuilder();
	replacementPart.codePoints().forEach(
	p -> {
	int switchedP = p;
	if (Character.isLowerCase(p)) {
	switchedP = Character.toUpperCase(p);
	} else {
	switchedP = Character.toLowerCase(p);
	}
	switchedCase.appendCodePoint(switchedP);
	if (p != switchedP) {
	caseSensitiveQuery = false;
	}
	}
	);
	result.append(switchedCase.toString());
	break;
	default:
	break;
	}
	// add any remaining tail, unchanged
	if (substitutionPoint + substitutionLength <= docValue.length() - 1) {
	result.append(docValue.substring(substitutionPoint + substitutionLength));
	}

	String regexPattern = result.toString();
	// Assert our randomly generated regex actually matches the provided raw input using java's expression matcher
	Pattern pattern = caseSensitiveQuery ? Pattern.compile(regexPattern):
	Pattern.compile(regexPattern, Pattern.CASE_INSENSITIVE);
	;
	Matcher matcher = pattern.matcher(docValue);
	assertTrue("Java regex " + regexPattern + " did not match doc value " + docValue, matcher.matches());

	int matchFlags = caseSensitiveQuery ? 0 : RegExp.ASCII_CASE_INSENSITIVE;
	RegExp regex = new RegExp(regexPattern, RegExp.ALL, matchFlags);
	Automaton automaton = regex.toAutomaton();
	ByteRunAutomaton bytesMatcher = new ByteRunAutomaton(automaton);
	BytesRef br = new BytesRef(docValue);
	assertTrue(
	"[" + regexPattern + "]should match [" + docValue + "]" + substitutionPoint + "-" + substitutionLength + "/"
	+ docValue.length(),
	bytesMatcher.run(br.bytes, br.offset, br.length)
	);
	if (caseSensitiveQuery == false) {
	RegExp caseSensitiveRegex = new RegExp(regexPattern);
	Automaton csAutomaton = caseSensitiveRegex.toAutomaton();
	ByteRunAutomaton csBytesMatcher = new ByteRunAutomaton(csAutomaton);
	assertFalse(
	"[" + regexPattern + "] with case sensitive setting should not match [" + docValue + "]",
	csBytesMatcher.run(br.bytes, br.offset, br.length)
	);

	}
	return regexPattern;
	}

	}