blob: 239b9ea7e11b607eb7915759f2541d5f1bd54743 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.util;
import com.carrotsearch.randomizedtesting.generators.RandomNumbers;
import com.carrotsearch.randomizedtesting.generators.RandomPicks;
import java.io.BufferedInputStream;
import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.io.PrintStream;
import java.math.BigDecimal;
import java.math.BigInteger;
import java.nio.CharBuffer;
import java.nio.file.FileSystem;
import java.nio.file.Files;
import java.nio.file.Path;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.Iterator;
import java.util.List;
import java.util.Map;
import java.util.NoSuchElementException;
import java.util.Random;
import java.util.concurrent.ExecutorService;
import java.util.concurrent.TimeUnit;
import java.util.regex.Pattern;
import java.util.regex.PatternSyntaxException;
import java.util.zip.ZipEntry;
import java.util.zip.ZipInputStream;
import org.apache.lucene.codecs.Codec;
import org.apache.lucene.codecs.DocValuesFormat;
import org.apache.lucene.codecs.PostingsFormat;
import org.apache.lucene.codecs.VectorFormat;
import org.apache.lucene.codecs.asserting.AssertingCodec;
import org.apache.lucene.codecs.blockterms.LuceneFixedGap;
import org.apache.lucene.codecs.blocktreeords.BlockTreeOrdsPostingsFormat;
import org.apache.lucene.codecs.lucene90.Lucene90Codec;
import org.apache.lucene.codecs.lucene90.Lucene90DocValuesFormat;
import org.apache.lucene.codecs.lucene90.Lucene90HnswVectorFormat;
import org.apache.lucene.codecs.lucene90.Lucene90PostingsFormat;
import org.apache.lucene.codecs.perfield.PerFieldDocValuesFormat;
import org.apache.lucene.codecs.perfield.PerFieldPostingsFormat;
import org.apache.lucene.document.BinaryDocValuesField;
import org.apache.lucene.document.BinaryPoint;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.NumericDocValuesField;
import org.apache.lucene.document.SortedDocValuesField;
import org.apache.lucene.index.CheckIndex;
import org.apache.lucene.index.CodecReader;
import org.apache.lucene.index.ConcurrentMergeScheduler;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.DocValuesType;
import org.apache.lucene.index.FieldInfo;
import org.apache.lucene.index.IndexFileNames;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.IndexWriter;
import org.apache.lucene.index.IndexableField;
import org.apache.lucene.index.LeafReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.LogMergePolicy;
import org.apache.lucene.index.MergePolicy;
import org.apache.lucene.index.MergeScheduler;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.SlowCodecReaderWrapper;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.index.TieredMergePolicy;
import org.apache.lucene.mockfile.FilterFileSystem;
import org.apache.lucene.mockfile.VirusCheckingFS;
import org.apache.lucene.mockfile.WindowsFS;
import org.apache.lucene.search.FieldDoc;
import org.apache.lucene.search.ScoreDoc;
import org.apache.lucene.search.TopDocs;
import org.apache.lucene.search.TotalHits;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.store.FilterDirectory;
import org.apache.lucene.store.IOContext;
import org.apache.lucene.store.NoLockFactory;
import org.junit.Assert;
/** General utility methods for Lucene unit tests. */
public final class TestUtil {
/** No instances: this class exposes static test helpers only. */
private TestUtil() {
//
}
/**
 * A comparator that compares UTF-16 strings / char sequences according to Unicode code point
 * order. This can be used to verify {@link BytesRef} order.
 *
 * <p><b>Warning:</b> This comparator is rather inefficient, because it converts the strings to a
 * {@code int[]} array on each invocation.
 */
public static final Comparator<CharSequence> STRING_CODEPOINT_COMPARATOR =
    (left, right) -> {
      final int[] lhs = left.codePoints().toArray();
      final int[] rhs = right.codePoints().toArray();
      final int limit = Math.min(lhs.length, rhs.length);
      for (int i = 0; i < limit; i++) {
        // Integer.compare yields exactly -1/0/1, matching the hand-rolled comparison
        final int cmp = Integer.compare(lhs[i], rhs[i]);
        if (cmp != 0) {
          return cmp;
        }
      }
      // common prefix equal: the shorter sequence sorts first
      return lhs.length - rhs.length;
    };
/**
 * Convenience method unzipping zipName into destDir. You must pass it a clean destDir.
 *
 * <p>Closes the given InputStream after extracting!
 *
 * @param in stream positioned at the start of a zip archive; always closed on return
 * @param destDir existing directory to extract into
 * @throws IOException if reading the archive or writing an extracted file fails
 */
public static void unzip(InputStream in, Path destDir) throws IOException {
  in = new BufferedInputStream(in);
  try (ZipInputStream zipInput = new ZipInputStream(in)) {
    ZipEntry entry;
    while ((entry = zipInput.getNextEntry()) != null) {
      Path targetFile = destDir.resolve(entry.getName());
      // be on the safe side: do not rely on that directories are always extracted
      // before their children (although this makes sense, but is it guaranteed?)
      Files.createDirectories(targetFile.getParent());
      if (!entry.isDirectory()) {
        // try-with-resources: the original leaked the OutputStream if a write threw
        try (OutputStream out = Files.newOutputStream(targetFile)) {
          // transferTo copies the remainder of the current entry
          zipInput.transferTo(out);
        }
      }
      zipInput.closeEntry();
    }
  }
}
/**
 * Checks that the provided iterator is well-formed.
 *
 * <ul>
 *   <li>is read-only: does not allow {@code remove}
 *   <li>returns {@code expectedSize} number of elements
 *   <li>does not return null elements, unless {@code allowNull} is true.
 *   <li>throws NoSuchElementException if {@code next} is called after {@code hasNext} returns
 *       false.
 * </ul>
 */
public static <T> void checkIterator(Iterator<T> iterator, long expectedSize, boolean allowNull) {
  boolean first = true;
  for (long remaining = expectedSize; remaining > 0; remaining--) {
    final boolean hasNext = iterator.hasNext();
    assert hasNext;
    final T element = iterator.next();
    assert allowNull || element != null;
    if (first) {
      // the very first element is enough to prove remove() is unsupported
      first = false;
      try {
        iterator.remove();
        throw new AssertionError("broken iterator (supports remove): " + iterator);
      } catch (
          @SuppressWarnings("unused")
          UnsupportedOperationException expected) {
        // read-only, as required
      }
    }
  }
  assert iterator.hasNext() == false;
  try {
    iterator.next();
    throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator);
  } catch (
      @SuppressWarnings("unused")
      NoSuchElementException expected) {
    // well-formed
  }
}
/**
 * Checks that the provided iterator is well-formed.
 *
 * <ul>
 *   <li>is read-only: does not allow {@code remove}
 *   <li>does not return null elements.
 *   <li>throws NoSuchElementException if {@code next} is called after {@code hasNext} returns
 *       false.
 * </ul>
 */
public static <T> void checkIterator(Iterator<T> iterator) {
  while (iterator.hasNext()) {
    final T element = iterator.next();
    assert element != null;
    // every single element must reject remove()
    try {
      iterator.remove();
      throw new AssertionError("broken iterator (supports remove): " + iterator);
    } catch (
        @SuppressWarnings("unused")
        UnsupportedOperationException expected) {
      // read-only, as required
    }
  }
  // once exhausted, next() must fail loudly
  try {
    iterator.next();
    throw new AssertionError("broken iterator (allows next() when hasNext==false) " + iterator);
  } catch (
      @SuppressWarnings("unused")
      NoSuchElementException expected) {
    // well-formed
  }
}
/**
 * Checks that the provided collection is read-only.
 *
 * @see #checkIterator(Iterator)
 */
public static <T> void checkReadOnly(Collection<T> coll) {
  // the iterator must agree with size()
  int iterated = 0;
  for (@SuppressWarnings("unused") Object element : coll) {
    iterated++;
  }
  if (iterated != coll.size()) {
    throw new AssertionError(
        "broken collection, reported size is "
            + coll.size()
            + " but iterator has "
            + iterated
            + " elements: "
            + coll);
  }
  // all mutators must be rejected
  if (!coll.isEmpty()) {
    try {
      coll.remove(coll.iterator().next());
      throw new AssertionError("broken collection (supports remove): " + coll);
    } catch (
        @SuppressWarnings("unused")
        UnsupportedOperationException e) {
      // ok
    }
  }
  try {
    coll.add(null);
    throw new AssertionError("broken collection (supports add): " + coll);
  } catch (
      @SuppressWarnings("unused")
      UnsupportedOperationException e) {
    // ok
  }
  try {
    coll.addAll(Collections.singleton(null));
    throw new AssertionError("broken collection (supports addAll): " + coll);
  } catch (
      @SuppressWarnings("unused")
      UnsupportedOperationException e) {
    // ok
  }
  checkIterator(coll.iterator());
}
/** Blocks until all merges scheduled by the given writer's merge scheduler have finished. */
public static void syncConcurrentMerges(IndexWriter writer) {
syncConcurrentMerges(writer.getConfig().getMergeScheduler());
}
/**
 * Blocks until all running merges of the given scheduler have finished; a no-op for
 * non-concurrent schedulers.
 */
public static void syncConcurrentMerges(MergeScheduler ms) {
  if (ms instanceof ConcurrentMergeScheduler) {
    ((ConcurrentMergeScheduler) ms).sync();
  }
}
/**
 * This runs the CheckIndex tool on the index in. If any issues are hit, a RuntimeException is
 * thrown; else, true is returned.
 *
 * <p>Equivalent to {@code checkIndex(dir, true)}: slow checks are enabled.
 */
public static CheckIndex.Status checkIndex(Directory dir) throws IOException {
return checkIndex(dir, true);
}
/**
 * Same as {@link #checkIndex(Directory)}, but lets the caller disable the more expensive slow
 * checks. Never fails fast and captures the tool output internally.
 */
public static CheckIndex.Status checkIndex(Directory dir, boolean doSlowChecks)
throws IOException {
return checkIndex(dir, doSlowChecks, false, null);
}
/**
 * If failFast is true, then throw the first exception when index corruption is hit, instead of
 * moving on to other fields/segments to look for any other corruption.
 */
public static CheckIndex.Status checkIndex(
    Directory dir, boolean doSlowChecks, boolean failFast, ByteArrayOutputStream output)
    throws IOException {
  if (output == null) {
    output = new ByteArrayOutputStream(1024);
  }
  // TODO: actually use the dir's locking, unless test uses a special method?
  // some tests e.g. exception tests become much more complicated if they have to close the writer
  try (CheckIndex checker =
      new CheckIndex(dir, NoLockFactory.INSTANCE.obtainLock(dir, "bogus"))) {
    checker.setDoSlowChecks(doSlowChecks);
    checker.setFailFast(failFast);
    checker.setInfoStream(new PrintStream(output, false, IOUtils.UTF_8), false);
    final CheckIndex.Status status = checker.checkIndex(null);
    final boolean clean = status != null && status.clean;
    if (!clean) {
      // dump the captured tool output so the failure is diagnosable
      System.out.println("CheckIndex failed");
      System.out.println(output.toString(IOUtils.UTF_8));
      throw new RuntimeException("CheckIndex failed");
    }
    if (LuceneTestCase.INFOSTREAM) {
      System.out.println(output.toString(IOUtils.UTF_8));
    }
    return status;
  }
}
/**
 * This runs the CheckIndex tool on the Reader. If any issues are hit, a RuntimeException is
 * thrown
 */
public static void checkReader(IndexReader reader) throws IOException {
// check every leaf individually, with slow checks enabled
for (LeafReaderContext context : reader.leaves()) {
checkReader(context.reader(), true);
}
}
/**
 * Runs all applicable CheckIndex part-checks against a single leaf reader, plus some sanity
 * checks on the reader API itself. Throws a RuntimeException on any problem.
 */
public static void checkReader(LeafReader reader, boolean doSlowChecks) throws IOException {
ByteArrayOutputStream bos = new ByteArrayOutputStream(1024);
PrintStream infoStream = new PrintStream(bos, false, IOUtils.UTF_8);
final CodecReader codecReader;
if (reader instanceof CodecReader) {
codecReader = (CodecReader) reader;
// a real codec reader can verify its checksums directly
reader.checkIntegrity();
} else {
// wrap anything else so the CheckIndex part-checks below can run against it
codecReader = SlowCodecReaderWrapper.wrap(reader);
}
CheckIndex.testLiveDocs(codecReader, infoStream, true);
CheckIndex.testFieldInfos(codecReader, infoStream, true);
CheckIndex.testFieldNorms(codecReader, infoStream, true);
CheckIndex.testPostings(codecReader, infoStream, false, doSlowChecks, true);
CheckIndex.testStoredFields(codecReader, infoStream, true);
CheckIndex.testTermVectors(codecReader, infoStream, false, doSlowChecks, true);
CheckIndex.testDocValues(codecReader, infoStream, true);
CheckIndex.testPoints(codecReader, infoStream, true);
// some checks really against the reader API
checkReaderSanity(reader);
if (LuceneTestCase.INFOSTREAM) {
System.out.println(bos.toString(IOUtils.UTF_8));
}
// FieldInfos should be cached at the reader and always return the same instance
if (reader.getFieldInfos() != reader.getFieldInfos()) {
throw new RuntimeException(
"getFieldInfos() returned different instances for class: " + reader.getClass());
}
}
// used by TestUtil.checkReader to check some things really unrelated to the index,
// just looking for bugs in indexreader implementations.
// For every field: the reader must only expose the one doc-values type declared in FieldInfos;
// all other per-field doc-values accessors must return null.
private static void checkReaderSanity(LeafReader reader) throws IOException {
  for (FieldInfo info : reader.getFieldInfos()) {
    // reader shouldn't return normValues if the field does not have them
    if (!info.hasNorms()) {
      if (reader.getNormValues(info.name) != null) {
        throw new RuntimeException("field: " + info.name + " should omit norms but has them!");
      }
    }
    // reader shouldn't return docValues if the field does not have them
    // reader shouldn't return multiple docvalues types for the same field.
    switch (info.getDocValuesType()) {
      case NONE:
        // bugfix: also verify sorted-numeric here; it was the only accessor this case skipped
        if (reader.getBinaryDocValues(info.name) != null
            || reader.getNumericDocValues(info.name) != null
            || reader.getSortedDocValues(info.name) != null
            || reader.getSortedNumericDocValues(info.name) != null
            || reader.getSortedSetDocValues(info.name) != null) {
          throw new RuntimeException(
              "field: " + info.name + " has docvalues but should omit them!");
        }
        break;
      case SORTED:
        if (reader.getBinaryDocValues(info.name) != null
            || reader.getNumericDocValues(info.name) != null
            || reader.getSortedNumericDocValues(info.name) != null
            || reader.getSortedSetDocValues(info.name) != null) {
          throw new RuntimeException(info.name + " returns multiple docvalues types!");
        }
        break;
      case SORTED_NUMERIC:
        if (reader.getBinaryDocValues(info.name) != null
            || reader.getNumericDocValues(info.name) != null
            || reader.getSortedSetDocValues(info.name) != null
            || reader.getSortedDocValues(info.name) != null) {
          throw new RuntimeException(info.name + " returns multiple docvalues types!");
        }
        break;
      case SORTED_SET:
        if (reader.getBinaryDocValues(info.name) != null
            || reader.getNumericDocValues(info.name) != null
            || reader.getSortedNumericDocValues(info.name) != null
            || reader.getSortedDocValues(info.name) != null) {
          throw new RuntimeException(info.name + " returns multiple docvalues types!");
        }
        break;
      case BINARY:
        if (reader.getNumericDocValues(info.name) != null
            || reader.getSortedDocValues(info.name) != null
            || reader.getSortedNumericDocValues(info.name) != null
            || reader.getSortedSetDocValues(info.name) != null) {
          throw new RuntimeException(info.name + " returns multiple docvalues types!");
        }
        break;
      case NUMERIC:
        if (reader.getBinaryDocValues(info.name) != null
            || reader.getSortedDocValues(info.name) != null
            || reader.getSortedNumericDocValues(info.name) != null
            || reader.getSortedSetDocValues(info.name) != null) {
          throw new RuntimeException(info.name + " returns multiple docvalues types!");
        }
        break;
      default:
        // FieldInfo guarantees a non-null DocValuesType, so this is unreachable
        throw new AssertionError();
    }
  }
}
/** start and end are BOTH inclusive */
public static int nextInt(Random r, int start, int end) {
// delegate to randomizedtesting's overflow-safe implementation
return RandomNumbers.randomIntBetween(r, start, end);
}
/** start and end are BOTH inclusive */
public static long nextLong(Random r, long start, long end) {
  assert end >= start : "start=" + start + ",end=" + end;
  // size of [start, end]; computed with BigInteger since it can overflow a long
  final BigInteger span =
      BigInteger.valueOf(end).add(BigInteger.valueOf(1)).subtract(BigInteger.valueOf(start));
  if (span.compareTo(BigInteger.valueOf(Integer.MAX_VALUE)) <= 0) {
    // small enough for a single uniform nextInt draw
    return start + r.nextInt(span.intValue());
  }
  // probably not evenly distributed when range is large, but OK for tests
  final BigInteger offset =
      new BigDecimal(span).multiply(new BigDecimal(r.nextDouble())).toBigInteger();
  final long result = BigInteger.valueOf(start).add(offset).longValue();
  assert result >= start;
  assert result <= end;
  return result;
}
/** Returns a randomish big integer with {@code 1 .. maxBytes} storage. */
public static BigInteger nextBigInteger(Random random, int maxBytes) {
  final int numBytes = TestUtil.nextInt(random, 1, maxBytes);
  final byte[] bits = new byte[numBytes];
  random.nextBytes(bits);
  // two's-complement interpretation: result may be negative
  return new BigInteger(bits);
}
/** Returns a random string of {@code [a-z]} characters, length 0 (inclusive) to maxLength. */
public static String randomSimpleString(Random r, int maxLength) {
return randomSimpleString(r, 0, maxLength);
}
/** Returns a random string of {@code [a-z]} characters with length in [minLength, maxLength]. */
public static String randomSimpleString(Random r, int minLength, int maxLength) {
  final int length = nextInt(r, minLength, maxLength);
  if (length == 0) {
    // allow 0 length
    return "";
  }
  final StringBuilder sb = new StringBuilder(length);
  for (int i = 0; i < length; i++) {
    sb.append((char) TestUtil.nextInt(r, 'a', 'z'));
  }
  return sb.toString();
}
/**
 * Returns a random string of length 0..maxLength whose characters are drawn uniformly from
 * [minChar, maxChar] (both inclusive).
 */
public static String randomSimpleStringRange(
    Random r, char minChar, char maxChar, int maxLength) {
  final int length = nextInt(r, 0, maxLength);
  if (length == 0) {
    // allow 0 length
    return "";
  }
  final StringBuilder sb = new StringBuilder(length);
  for (int i = 0; i < length; i++) {
    sb.append((char) TestUtil.nextInt(r, minChar, maxChar));
  }
  return sb.toString();
}
/** Returns a random string of {@code [a-z]} characters, length 0 to 10. */
public static String randomSimpleString(Random r) {
return randomSimpleString(r, 0, 10);
}
/** Returns random string, including full unicode range; up to 20 UTF-16 code units long. */
public static String randomUnicodeString(Random r) {
return randomUnicodeString(r, 20);
}
/** Returns a random string up to a certain length (in UTF-16 code units). */
public static String randomUnicodeString(Random r, int maxLength) {
  final int length = nextInt(r, 0, maxLength);
  if (length == 0) {
    // allow 0 length
    return "";
  }
  final char[] chars = new char[length];
  randomFixedLengthUnicodeString(r, chars, 0, chars.length);
  return new String(chars, 0, length);
}
/**
 * Fills provided char[] with valid random unicode code unit sequence: a mix of ASCII, 2- and
 * 3-byte BMP code points, and surrogate pairs. Exactly {@code length} chars starting at
 * {@code offset} are written.
 */
public static void randomFixedLengthUnicodeString(
    Random random, char[] chars, int offset, int length) {
  int i = offset;
  final int end = offset + length;
  while (i < end) {
    final int t = random.nextInt(5);
    // bugfix: the pair guard used to be "i < length - 1", comparing the offset-based cursor
    // against the length; with a non-zero offset that wrongly suppressed surrogate pairs.
    // A pair needs two remaining slots, i.e. i < end - 1. (Identical behavior for offset==0.)
    if (0 == t && i < end - 1) {
      // Make a surrogate pair
      // High surrogate
      chars[i++] = (char) nextInt(random, 0xd800, 0xdbff);
      // Low surrogate
      chars[i++] = (char) nextInt(random, 0xdc00, 0xdfff);
    } else if (t <= 1) {
      // ASCII
      chars[i++] = (char) random.nextInt(0x80);
    } else if (2 == t) {
      // 2-byte UTF-8 range
      chars[i++] = (char) nextInt(random, 0x80, 0x7ff);
    } else if (3 == t) {
      // 3-byte range below the surrogates
      chars[i++] = (char) nextInt(random, 0x800, 0xd7ff);
    } else if (4 == t) {
      // 3-byte range above the surrogates
      chars[i++] = (char) nextInt(random, 0xe000, 0xffff);
    }
  }
}
/**
 * Returns a String thats "regexpish" (contains lots of operators typically found in regular
 * expressions) If you call this enough times, you might get a valid regex!
 */
public static String randomRegexpishString(Random r) {
return randomRegexpishString(r, 20);
}
/**
 * Maximum recursion bound for '+' and '*' replacements in {@link #randomRegexpishString(Random,
 * int)}.
 */
private static final int maxRecursionBound = 5;

/** Operators for {@link #randomRegexpishString(Random, int)}. */
private static final List<String> ops =
    List.of(
        ".",
        "?",
        "{0," + maxRecursionBound + "}", // bounded replacement for '*'
        "{1," + maxRecursionBound + "}", // bounded replacement for '+'
        "(",
        ")",
        "-",
        "[",
        "]",
        "|");
/**
 * Returns a String thats "regexpish" (contains lots of operators typically found in regular
 * expressions) If you call this enough times, you might get a valid regex!
 *
 * <p>Note: to avoid practically endless backtracking patterns we replace asterisk and plus
 * operators with bounded repetitions. See LUCENE-4111 for more info.
 *
 * @param maxLength A hint about maximum length of the regexpish string. It may be exceeded by a
 *     few characters.
 */
public static String randomRegexpishString(Random r, int maxLength) {
  final StringBuilder out = new StringBuilder(maxLength);
  int remaining = nextInt(r, 0, maxLength);
  while (remaining-- > 0) {
    if (r.nextBoolean()) {
      // plain lowercase letter
      out.append((char) RandomNumbers.randomIntBetween(r, 'a', 'z'));
    } else {
      // one of the (bounded) regex operators
      out.append(RandomPicks.randomFrom(r, ops));
    }
  }
  return out.toString();
}
/** Named HTML character entities used by {@link #randomHtmlishString(Random, int)}. */
private static final String[] HTML_CHAR_ENTITIES = {
  "AElig", "Aacute", "Acirc", "Agrave", "Alpha", "AMP", "Aring", "Atilde", "Auml", "Beta",
  "COPY", "Ccedil", "Chi", "Dagger", "Delta", "ETH", "Eacute", "Ecirc", "Egrave", "Epsilon",
  "Eta", "Euml", "Gamma", "GT", "Iacute", "Icirc", "Igrave", "Iota", "Iuml", "Kappa",
  "Lambda", "LT", "Mu", "Ntilde", "Nu", "OElig", "Oacute", "Ocirc", "Ograve", "Omega",
  "Omicron", "Oslash", "Otilde", "Ouml", "Phi", "Pi", "Prime", "Psi", "QUOT", "REG",
  "Rho", "Scaron", "Sigma", "THORN", "Tau", "Theta", "Uacute", "Ucirc", "Ugrave", "Upsilon",
  "Uuml", "Xi", "Yacute", "Yuml", "Zeta", "aacute", "acirc", "acute", "aelig", "agrave",
  "alefsym", "alpha", "amp", "and", "ang", "apos", "aring", "asymp", "atilde", "auml",
  "bdquo", "beta", "brvbar", "bull", "cap", "ccedil", "cedil", "cent", "chi", "circ",
  "clubs", "cong", "copy", "crarr", "cup", "curren", "dArr", "dagger", "darr", "deg",
  "delta", "diams", "divide", "eacute", "ecirc", "egrave", "empty", "emsp", "ensp", "epsilon",
  "equiv", "eta", "eth", "euml", "euro", "exist", "fnof", "forall", "frac12", "frac14",
  "frac34", "frasl", "gamma", "ge", "gt", "hArr", "harr", "hearts", "hellip", "iacute",
  "icirc", "iexcl", "igrave", "image", "infin", "int", "iota", "iquest", "isin", "iuml",
  "kappa", "lArr", "lambda", "lang", "laquo", "larr", "lceil", "ldquo", "le", "lfloor",
  "lowast", "loz", "lrm", "lsaquo", "lsquo", "lt", "macr", "mdash", "micro", "middot",
  "minus", "mu", "nabla", "nbsp", "ndash", "ne", "ni", "not", "notin", "nsub",
  "ntilde", "nu", "oacute", "ocirc", "oelig", "ograve", "oline", "omega", "omicron", "oplus",
  "or", "ordf", "ordm", "oslash", "otilde", "otimes", "ouml", "para", "part", "permil",
  "perp", "phi", "pi", "piv", "plusmn", "pound", "prime", "prod", "prop", "psi",
  "quot", "rArr", "radic", "rang", "raquo", "rarr", "rceil", "rdquo", "real", "reg",
  "rfloor", "rho", "rlm", "rsaquo", "rsquo", "sbquo", "scaron", "sdot", "sect", "shy",
  "sigma", "sigmaf", "sim", "spades", "sub", "sube", "sum", "sup", "sup1", "sup2",
  "sup3", "supe", "szlig", "tau", "there4", "theta", "thetasym", "thinsp", "thorn", "tilde",
  "times", "trade", "uArr", "uacute", "uarr", "ucirc", "ugrave", "uml", "upsih", "upsilon",
  "uuml", "weierp", "xi", "yacute", "yen", "yuml", "zeta", "zwj", "zwnj"
};
/**
 * Returns random HTML-ish text: tags with random attributes, entities, comments, scripting
 * fragments and plain words, for stressing HTML-aware tokenizers and char filters.
 *
 * <p>Bugfix: the whitespace/padding string literals below were collapsed to a single space,
 * which made calls like {@code " ".substring(nextInt(random, 0, 4))} throw
 * StringIndexOutOfBoundsException; each literal is now at least as long as the largest
 * possible substring index.
 *
 * @param numElements upper bound (inclusive) on the number of fragments appended
 */
public static String randomHtmlishString(Random random, int numElements) {
  final int end = nextInt(random, 0, numElements);
  if (end == 0) {
    // allow 0 length
    return "";
  }
  StringBuilder sb = new StringBuilder();
  for (int i = 0; i < end; i++) {
    int val = random.nextInt(25);
    switch (val) {
      case 0:
        sb.append("<p>");
        break;
      case 1:
        {
          // open tag with a random name and random attributes, possibly left unclosed
          sb.append("<");
          sb.append("    ".substring(nextInt(random, 0, 4)));
          sb.append(randomSimpleString(random));
          for (int j = 0; j < nextInt(random, 0, 10); ++j) {
            sb.append(' ');
            sb.append(randomSimpleString(random));
            sb.append(" ".substring(nextInt(random, 0, 1)));
            sb.append('=');
            sb.append(" ".substring(nextInt(random, 0, 1)));
            sb.append("\"".substring(nextInt(random, 0, 1)));
            sb.append(randomSimpleString(random));
            sb.append("\"".substring(nextInt(random, 0, 1)));
          }
          sb.append("    ".substring(nextInt(random, 0, 4)));
          sb.append("/".substring(nextInt(random, 0, 1)));
          sb.append(">".substring(nextInt(random, 0, 1)));
          break;
        }
      case 2:
        {
          // close tag, possibly truncated
          sb.append("</");
          sb.append("    ".substring(nextInt(random, 0, 4)));
          sb.append(randomSimpleString(random));
          sb.append("    ".substring(nextInt(random, 0, 4)));
          sb.append(">".substring(nextInt(random, 0, 1)));
          break;
        }
      case 3:
        sb.append(">");
        break;
      case 4:
        sb.append("</p>");
        break;
      case 5:
        sb.append("<!--");
        break;
      case 6:
        sb.append("<!--#");
        break;
      case 7:
        sb.append("<script><!-- f('");
        break;
      case 8:
        sb.append("</script>");
        break;
      case 9:
        sb.append("<?");
        break;
      case 10:
        sb.append("?>");
        break;
      case 11:
        sb.append("\"");
        break;
      case 12:
        sb.append("\\\"");
        break;
      case 13:
        sb.append("'");
        break;
      case 14:
        sb.append("\\'");
        break;
      case 15:
        sb.append("-->");
        break;
      case 16:
        {
          // named or random character entity, semicolon sometimes omitted
          sb.append("&");
          switch (nextInt(random, 0, 2)) {
            case 0:
              sb.append(randomSimpleString(random));
              break;
            case 1:
              sb.append(HTML_CHAR_ENTITIES[random.nextInt(HTML_CHAR_ENTITIES.length)]);
              break;
          }
          sb.append(";".substring(nextInt(random, 0, 1)));
          break;
        }
      case 17:
        {
          // decimal numeric entity, possibly truncated right after "&#"
          sb.append("&#");
          if (0 == nextInt(random, 0, 1)) {
            sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
            sb.append(";".substring(nextInt(random, 0, 1)));
          }
          break;
        }
      case 18:
        {
          // hexadecimal numeric entity, possibly truncated right after "&#x"
          sb.append("&#x");
          if (0 == nextInt(random, 0, 1)) {
            sb.append(Integer.toString(nextInt(random, 0, Integer.MAX_VALUE - 1), 16));
            sb.append(";".substring(nextInt(random, 0, 1)));
          }
          break;
        }
      case 19:
        sb.append(";");
        break;
      case 20:
        sb.append(nextInt(random, 0, Integer.MAX_VALUE - 1));
        break;
      case 21:
        sb.append("\n");
        break;
      case 22:
        sb.append("          ".substring(nextInt(random, 0, 10)));
        break;
      case 23:
        {
          // randomly-cased script/style/br tag with random internal whitespace
          sb.append("<");
          if (0 == nextInt(random, 0, 3)) {
            sb.append("          ".substring(nextInt(random, 1, 10)));
          }
          if (0 == nextInt(random, 0, 1)) {
            sb.append("/");
            if (0 == nextInt(random, 0, 3)) {
              sb.append("          ".substring(nextInt(random, 1, 10)));
            }
          }
          switch (nextInt(random, 0, 3)) {
            case 0:
              sb.append(randomlyRecaseCodePoints(random, "script"));
              break;
            case 1:
              sb.append(randomlyRecaseCodePoints(random, "style"));
              break;
            case 2:
              sb.append(randomlyRecaseCodePoints(random, "br"));
              break;
              // default: append nothing
          }
          sb.append(">".substring(nextInt(random, 0, 1)));
          break;
        }
      default:
        sb.append(randomSimpleString(random));
    }
  }
  return sb.toString();
}
/** Randomly upcases, downcases, or leaves intact each code point in the given string */
public static String randomlyRecaseCodePoints(Random random, String str) {
  final StringBuilder result = new StringBuilder();
  for (int pos = 0; pos < str.length(); ) {
    final int cp = str.codePointAt(pos);
    // advance by 1 or 2 chars depending on whether cp is supplementary
    pos += Character.charCount(cp);
    final int choice = nextInt(random, 0, 2);
    if (choice == 0) {
      result.appendCodePoint(Character.toUpperCase(cp));
    } else if (choice == 1) {
      result.appendCodePoint(Character.toLowerCase(cp));
    } else {
      result.appendCodePoint(cp); // leave intact
    }
  }
  return result.toString();
}
// First code point of each Unicode block sampled by randomRealisticUnicodeString;
// parallel to blockEnds (same index = same block), in ascending order.
private static final int[] blockStarts = {
0x0000, 0x0080, 0x0100, 0x0180, 0x0250, 0x02B0, 0x0300, 0x0370, 0x0400, 0x0500, 0x0530, 0x0590,
0x0600, 0x0700, 0x0750, 0x0780, 0x07C0, 0x0800, 0x0900, 0x0980, 0x0A00, 0x0A80, 0x0B00, 0x0B80,
0x0C00, 0x0C80, 0x0D00, 0x0D80, 0x0E00, 0x0E80, 0x0F00, 0x1000, 0x10A0, 0x1100, 0x1200, 0x1380,
0x13A0, 0x1400, 0x1680, 0x16A0, 0x1700, 0x1720, 0x1740, 0x1760, 0x1780, 0x1800, 0x18B0, 0x1900,
0x1950, 0x1980, 0x19E0, 0x1A00, 0x1A20, 0x1B00, 0x1B80, 0x1C00, 0x1C50, 0x1CD0, 0x1D00, 0x1D80,
0x1DC0, 0x1E00, 0x1F00, 0x2000, 0x2070, 0x20A0, 0x20D0, 0x2100, 0x2150, 0x2190, 0x2200, 0x2300,
0x2400, 0x2440, 0x2460, 0x2500, 0x2580, 0x25A0, 0x2600, 0x2700, 0x27C0, 0x27F0, 0x2800, 0x2900,
0x2980, 0x2A00, 0x2B00, 0x2C00, 0x2C60, 0x2C80, 0x2D00, 0x2D30, 0x2D80, 0x2DE0, 0x2E00, 0x2E80,
0x2F00, 0x2FF0, 0x3000, 0x3040, 0x30A0, 0x3100, 0x3130, 0x3190, 0x31A0, 0x31C0, 0x31F0, 0x3200,
0x3300, 0x3400, 0x4DC0, 0x4E00, 0xA000, 0xA490, 0xA4D0, 0xA500, 0xA640, 0xA6A0, 0xA700, 0xA720,
0xA800, 0xA830, 0xA840, 0xA880, 0xA8E0, 0xA900, 0xA930, 0xA960, 0xA980, 0xAA00, 0xAA60, 0xAA80,
0xABC0, 0xAC00, 0xD7B0, 0xE000, 0xF900, 0xFB00, 0xFB50, 0xFE00, 0xFE10, 0xFE20, 0xFE30, 0xFE50,
0xFE70, 0xFF00, 0xFFF0, 0x10000, 0x10080, 0x10100, 0x10140, 0x10190, 0x101D0, 0x10280, 0x102A0,
0x10300, 0x10330, 0x10380, 0x103A0, 0x10400, 0x10450, 0x10480, 0x10800, 0x10840, 0x10900,
0x10920, 0x10A00, 0x10A60, 0x10B00, 0x10B40, 0x10B60, 0x10C00, 0x10E60, 0x11080, 0x12000,
0x12400, 0x13000, 0x1D000, 0x1D100, 0x1D200, 0x1D300, 0x1D360, 0x1D400, 0x1F000, 0x1F030,
0x1F100, 0x1F200, 0x20000, 0x2A700, 0x2F800, 0xE0000, 0xE0100, 0xF0000, 0x100000
};
// Last code point of each Unicode block; parallel to blockStarts (same index = same block).
private static final int[] blockEnds = {
0x007F, 0x00FF, 0x017F, 0x024F, 0x02AF, 0x02FF, 0x036F, 0x03FF, 0x04FF, 0x052F, 0x058F, 0x05FF,
0x06FF, 0x074F, 0x077F, 0x07BF, 0x07FF, 0x083F, 0x097F, 0x09FF, 0x0A7F, 0x0AFF, 0x0B7F, 0x0BFF,
0x0C7F, 0x0CFF, 0x0D7F, 0x0DFF, 0x0E7F, 0x0EFF, 0x0FFF, 0x109F, 0x10FF, 0x11FF, 0x137F, 0x139F,
0x13FF, 0x167F, 0x169F, 0x16FF, 0x171F, 0x173F, 0x175F, 0x177F, 0x17FF, 0x18AF, 0x18FF, 0x194F,
0x197F, 0x19DF, 0x19FF, 0x1A1F, 0x1AAF, 0x1B7F, 0x1BBF, 0x1C4F, 0x1C7F, 0x1CFF, 0x1D7F, 0x1DBF,
0x1DFF, 0x1EFF, 0x1FFF, 0x206F, 0x209F, 0x20CF, 0x20FF, 0x214F, 0x218F, 0x21FF, 0x22FF, 0x23FF,
0x243F, 0x245F, 0x24FF, 0x257F, 0x259F, 0x25FF, 0x26FF, 0x27BF, 0x27EF, 0x27FF, 0x28FF, 0x297F,
0x29FF, 0x2AFF, 0x2BFF, 0x2C5F, 0x2C7F, 0x2CFF, 0x2D2F, 0x2D7F, 0x2DDF, 0x2DFF, 0x2E7F, 0x2EFF,
0x2FDF, 0x2FFF, 0x303F, 0x309F, 0x30FF, 0x312F, 0x318F, 0x319F, 0x31BF, 0x31EF, 0x31FF, 0x32FF,
0x33FF, 0x4DBF, 0x4DFF, 0x9FFF, 0xA48F, 0xA4CF, 0xA4FF, 0xA63F, 0xA69F, 0xA6FF, 0xA71F, 0xA7FF,
0xA82F, 0xA83F, 0xA87F, 0xA8DF, 0xA8FF, 0xA92F, 0xA95F, 0xA97F, 0xA9DF, 0xAA5F, 0xAA7F, 0xAADF,
0xABFF, 0xD7AF, 0xD7FF, 0xF8FF, 0xFAFF, 0xFB4F, 0xFDFF, 0xFE0F, 0xFE1F, 0xFE2F, 0xFE4F, 0xFE6F,
0xFEFF, 0xFFEF, 0xFFFF, 0x1007F, 0x100FF, 0x1013F, 0x1018F, 0x101CF, 0x101FF, 0x1029F, 0x102DF,
0x1032F, 0x1034F, 0x1039F, 0x103DF, 0x1044F, 0x1047F, 0x104AF, 0x1083F, 0x1085F, 0x1091F,
0x1093F, 0x10A5F, 0x10A7F, 0x10B3F, 0x10B5F, 0x10B7F, 0x10C4F, 0x10E7F, 0x110CF, 0x123FF,
0x1247F, 0x1342F, 0x1D0FF, 0x1D1FF, 0x1D24F, 0x1D35F, 0x1D37F, 0x1D7FF, 0x1F02F, 0x1F09F,
0x1F1FF, 0x1F2FF, 0x2A6DF, 0x2B73F, 0x2FA1F, 0xE007F, 0xE01EF, 0xFFFFF, 0x10FFFF
};
/**
 * Returns random string of length between 0-20 codepoints, all codepoints within the same unicode
 * block.
 */
public static String randomRealisticUnicodeString(Random r) {
return randomRealisticUnicodeString(r, 20);
}
/**
 * Returns random string of length up to maxLength codepoints, all codepoints within the same
 * unicode block.
 */
public static String randomRealisticUnicodeString(Random r, int maxLength) {
return randomRealisticUnicodeString(r, 0, maxLength);
}
/**
 * Returns random string of length between min and max codepoints, all codepoints within the same
 * unicode block.
 */
public static String randomRealisticUnicodeString(Random r, int minLength, int maxLength) {
  final int numCodePoints = nextInt(r, minLength, maxLength);
  // pick one block and draw every code point from it
  final int block = r.nextInt(blockStarts.length);
  final StringBuilder sb = new StringBuilder();
  for (int i = 0; i < numCodePoints; i++) {
    sb.appendCodePoint(nextInt(r, blockStarts[block], blockEnds[block]));
  }
  return sb.toString();
}
/** Returns random string, with a given UTF-8 byte length */
public static String randomFixedByteLengthUnicodeString(Random r, int length) {
// worst case: every remaining byte becomes a 3-byte code point late in the loop,
// so 3*length chars is always enough room
final char[] buffer = new char[length * 3];
// remaining UTF-8 byte budget
int bytes = length;
int i = 0;
for (; i < buffer.length && bytes != 0; i++) {
int t;
// restrict the choice so the selected form always fits in the remaining budget:
// surrogate pair = 4 bytes, 3-byte forms need >= 3, 2-byte form needs >= 2
if (bytes >= 4) {
t = r.nextInt(5);
} else if (bytes >= 3) {
t = r.nextInt(4);
} else if (bytes >= 2) {
t = r.nextInt(2);
} else {
t = 0;
}
if (t == 0) {
// ASCII: 1 byte
buffer[i] = (char) r.nextInt(0x80);
bytes--;
} else if (1 == t) {
// 2-byte UTF-8 range
buffer[i] = (char) nextInt(r, 0x80, 0x7ff);
bytes -= 2;
} else if (2 == t) {
// 3-byte range below the surrogates
buffer[i] = (char) nextInt(r, 0x800, 0xd7ff);
bytes -= 3;
} else if (3 == t) {
// 3-byte range above the surrogates
buffer[i] = (char) nextInt(r, 0xe000, 0xffff);
bytes -= 3;
} else if (4 == t) {
// Make a surrogate pair
// High surrogate
buffer[i++] = (char) nextInt(r, 0xd800, 0xdbff);
// Low surrogate
buffer[i] = (char) nextInt(r, 0xdc00, 0xdfff);
bytes -= 4;
}
}
return new String(buffer, 0, i);
}
/** Returns a random binary term of 0-14 bytes. */
public static BytesRef randomBinaryTerm(Random r) {
  final int numBytes = r.nextInt(15);
  final BytesRef term = new BytesRef(numBytes);
  r.nextBytes(term.bytes);
  term.length = numBytes;
  return term;
}
/**
 * Return a Codec that can read any of the default codecs and formats, but always writes in the
 * specified format.
 */
public static Codec alwaysPostingsFormat(final PostingsFormat format) {
// TODO: we really need for postings impls etc to announce themselves
// (and maybe their params, too) to infostream on flush and merge.
// otherwise in a real debugging situation we won't know whats going on!
if (LuceneTestCase.VERBOSE) {
System.out.println("forcing postings format to:" + format);
}
// AssertingCodec additionally validates the postings contract during reads/writes
return new AssertingCodec() {
@Override
public PostingsFormat getPostingsFormatForField(String field) {
return format;
}
};
}
/**
 * Return a Codec that can read any of the default codecs and formats, but always writes in the
 * specified format.
 */
public static Codec alwaysDocValuesFormat(final DocValuesFormat format) {
// TODO: we really need for docvalues impls etc to announce themselves
// (and maybe their params, too) to infostream on flush and merge.
// otherwise in a real debugging situation we won't know whats going on!
if (LuceneTestCase.VERBOSE) {
System.out.println("TestUtil: forcing docvalues format to:" + format);
}
// AssertingCodec reads all default formats; we only override the per-field
// docvalues format used for writing.
return new AssertingCodec() {
@Override
public DocValuesFormat getDocValuesFormatForField(String field) {
return format;
}
};
}
/**
 * Returns the actual default codec (e.g. LuceneMNCodec) for this version of Lucene. This may be
 * different than {@link Codec#getDefault()} because that is randomized.
 */
public static Codec getDefaultCodec() {
// Must be updated when a new default codec is introduced for a new Lucene version.
return new Lucene90Codec();
}
/**
 * Returns the actual default postings format (e.g. LuceneMNPostingsFormat) for this version of
 * Lucene.
 */
public static PostingsFormat getDefaultPostingsFormat() {
return new Lucene90PostingsFormat();
}
/**
 * Returns the actual default postings format (e.g. LuceneMNPostingsFormat) for this version of
 * Lucene, with custom block-tree term dictionary block sizes.
 *
 * @param minItemsPerBlock minimum number of terms/sub-blocks per term dictionary block
 * @param maxItemsPerBlock maximum number of terms/sub-blocks per term dictionary block
 * @lucene.internal this may disappear at any time
 */
public static PostingsFormat getDefaultPostingsFormat(
int minItemsPerBlock, int maxItemsPerBlock) {
return new Lucene90PostingsFormat(minItemsPerBlock, maxItemsPerBlock);
}
/** Returns a random postings format that supports term ordinals */
public static PostingsFormat getPostingsFormatWithOrds(Random r) {
  final int choice = r.nextInt(2);
  if (choice == 0) {
    return new LuceneFixedGap();
  }
  if (choice == 1) {
    return new BlockTreeOrdsPostingsFormat();
  }
  // TODO: these don't actually support ords!
  // (FSTOrdPostingsFormat was once a candidate here but does not support ords)
  throw new AssertionError();
}
/**
 * Returns the actual default docvalues format (e.g. LuceneMNDocValuesFormat) for this version of
 * Lucene.
 */
public static DocValuesFormat getDefaultDocValuesFormat() {
return new Lucene90DocValuesFormat();
}
// TODO: generalize all 'test-checks-for-crazy-codecs' to
// annotations (LUCENE-3489)
/** Returns the name of the postings format used for {@code field} by the current default codec. */
public static String getPostingsFormat(String field) {
return getPostingsFormat(Codec.getDefault(), field);
}
/** Returns the name of the postings format {@code codec} uses for {@code field}. */
public static String getPostingsFormat(Codec codec, String field) {
  final PostingsFormat format = codec.postingsFormat();
  if (format instanceof PerFieldPostingsFormat) {
    // per-field codec: resolve the format actually assigned to this field
    return ((PerFieldPostingsFormat) format).getPostingsFormatForField(field).getName();
  }
  return format.getName();
}
/** Returns the name of the docvalues format used for {@code field} by the current default codec. */
public static String getDocValuesFormat(String field) {
return getDocValuesFormat(Codec.getDefault(), field);
}
/** Returns the name of the docvalues format {@code codec} uses for {@code field}. */
public static String getDocValuesFormat(Codec codec, String field) {
  final DocValuesFormat format = codec.docValuesFormat();
  if (format instanceof PerFieldDocValuesFormat) {
    // per-field codec: resolve the format actually assigned to this field
    return ((PerFieldDocValuesFormat) format).getDocValuesFormatForField(field).getName();
  }
  return format.getName();
}
// TODO: remove this, push this test to Lucene40/Lucene42 codec tests
/** Returns true unless {@code field} uses one of the legacy formats with a binary DV size limit. */
public static boolean fieldSupportsHugeBinaryDocValues(String field) {
  final String dvFormat = getDocValuesFormat(field);
  return !(dvFormat.equals("Lucene40") || dvFormat.equals("Lucene42"));
}
/**
 * Returns the actual default vector format (e.g. LuceneMNVectorFormat) for this version of
 * Lucene.
 */
public static VectorFormat getDefaultVectorFormat() {
return new Lucene90HnswVectorFormat();
}
/** Returns true if {@code dir} contains any file other than the {@code write.lock} file. */
public static boolean anyFilesExceptWriteLock(Directory dir) throws IOException {
  final String[] files = dir.listAll();
  if (files.length == 0) {
    return false;
  }
  // More than one file, or a single file that is not the write lock.
  return files.length > 1 || !files[0].equals("write.lock");
}
/** Adds the given readers' contents to {@code writer} via the (slow) CodecReader-merging path. */
public static void addIndexesSlowly(IndexWriter writer, DirectoryReader... readers)
    throws IOException {
  final List<CodecReader> wrapped = new ArrayList<>();
  for (DirectoryReader reader : readers) {
    for (LeafReaderContext ctx : reader.leaves()) {
      wrapped.add(SlowCodecReaderWrapper.wrap(ctx.reader()));
    }
  }
  writer.addIndexes(wrapped.toArray(new CodecReader[0]));
}
/** just tries to configure things to keep the open file count lowish */
public static void reduceOpenFiles(IndexWriter w) {
// keep number of open files lowish
MergePolicy mp = w.getConfig().getMergePolicy();
// force compound files so each segment contributes as few open files as possible
mp.setNoCFSRatio(1.0);
if (mp instanceof LogMergePolicy) {
LogMergePolicy lmp = (LogMergePolicy) mp;
// cap merge factor at 5 (never raise an already-lower setting)
lmp.setMergeFactor(Math.min(5, lmp.getMergeFactor()));
} else if (mp instanceof TieredMergePolicy) {
TieredMergePolicy tmp = (TieredMergePolicy) mp;
// likewise cap merge width and tier size at 5
tmp.setMaxMergeAtOnce(Math.min(5, tmp.getMaxMergeAtOnce()));
tmp.setSegmentsPerTier(Math.min(5, tmp.getSegmentsPerTier()));
}
MergeScheduler ms = w.getConfig().getMergeScheduler();
if (ms instanceof ConcurrentMergeScheduler) {
// wtf... shouldnt it be even lower since it's 1 by default?!?!
((ConcurrentMergeScheduler) ms).setMaxMergesAndThreads(3, 2);
}
}
/**
 * Checks some basic behaviour of an AttributeImpl
 *
 * @param reflectedValues contains a map with "AttributeClass#key" as values
 */
public static <T> void assertAttributeReflection(
    final AttributeImpl att, Map<String, T> reflectedValues) {
  final Map<String, Object> observed = new HashMap<>();
  // Collect everything the attribute reports via reflection into a map keyed
  // by "AttributeClass#key", mirroring the expected-map format.
  att.reflectWith(
      (attClass, key, value) -> observed.put(attClass.getName() + '#' + key, value));
  Assert.assertEquals("Reflection does not produce same map", reflectedValues, observed);
}
/** Assert that the given {@link TopDocs} have the same top docs and consistent hit counts. */
public static void assertConsistent(TopDocs expected, TopDocs actual) {
// both must agree on whether there were zero hits at all
Assert.assertEquals(
"wrong total hits", expected.totalHits.value == 0, actual.totalHits.value == 0);
if (expected.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
// both counts are exact: they must match exactly
Assert.assertEquals("wrong total hits", expected.totalHits.value, actual.totalHits.value);
} else {
// actual is a lower bound (GREATER_THAN_OR_EQUAL_TO): exact count must be >= it
Assert.assertTrue("wrong total hits", expected.totalHits.value >= actual.totalHits.value);
}
} else if (actual.totalHits.relation == TotalHits.Relation.EQUAL_TO) {
// expected is a lower bound: it must not exceed the exact actual count
Assert.assertTrue("wrong total hits", expected.totalHits.value <= actual.totalHits.value);
}
// (if both are lower bounds, no count relationship can be asserted)
Assert.assertEquals("wrong hit count", expected.scoreDocs.length, actual.scoreDocs.length);
for (int hitIDX = 0; hitIDX < expected.scoreDocs.length; hitIDX++) {
final ScoreDoc expectedSD = expected.scoreDocs[hitIDX];
final ScoreDoc actualSD = actual.scoreDocs[hitIDX];
Assert.assertEquals("wrong hit docID", expectedSD.doc, actualSD.doc);
// scores must match bit-for-bit (delta of 0.0)
Assert.assertEquals("wrong hit score", expectedSD.score, actualSD.score, 0.0);
if (expectedSD instanceof FieldDoc) {
// sorted search results: sort-field values must match too
Assert.assertTrue(actualSD instanceof FieldDoc);
Assert.assertArrayEquals(
"wrong sort field values",
((FieldDoc) expectedSD).fields,
((FieldDoc) actualSD).fields);
} else {
Assert.assertFalse(actualSD instanceof FieldDoc);
}
}
}
// NOTE: this is likely buggy, and cannot clone fields
// with tokenStreamValues, etc. Use at your own risk!!
// TODO: is there a pre-existing way to do this!!!
/** Best-effort shallow clone of {@code doc1}: copies docvalues, point, and stored-string fields. */
public static Document cloneDocument(Document doc1) {
final Document doc2 = new Document();
for (IndexableField f : doc1.getFields()) {
final Field field1 = (Field) f;
final Field field2;
final DocValuesType dvType = field1.fieldType().docValuesType();
final int dimCount = field1.fieldType().pointDimensionCount();
if (dvType != DocValuesType.NONE) {
// docvalues field: recreate the matching docvalues field type
switch (dvType) {
case NUMERIC:
field2 = new NumericDocValuesField(field1.name(), field1.numericValue().longValue());
break;
case BINARY:
field2 = new BinaryDocValuesField(field1.name(), field1.binaryValue());
break;
case SORTED:
field2 = new SortedDocValuesField(field1.name(), field1.binaryValue());
break;
case NONE:
case SORTED_SET:
case SORTED_NUMERIC:
default:
// SORTED_SET / SORTED_NUMERIC are not handled by this helper
throw new IllegalStateException("unknown Type: " + dvType);
}
} else if (dimCount != 0) {
// point field: deep-copy the binary value so the clone is independent
BytesRef br = field1.binaryValue();
byte[] bytes = new byte[br.length];
System.arraycopy(br.bytes, br.offset, bytes, 0, br.length);
field2 = new BinaryPoint(field1.name(), bytes, field1.fieldType());
} else {
// otherwise assume a stored/indexed string field (tokenStream values are lost)
field2 = new Field(field1.name(), field1.stringValue(), field1.fieldType());
}
doc2.add(field2);
}
return doc2;
}
// Returns a DocsEnum, but randomly sometimes uses a
// DocsAndFreqsEnum, DocsAndPositionsEnum. Returns null
// if field/term doesn't exist:
public static PostingsEnum docs(
    Random random, IndexReader r, String field, BytesRef term, PostingsEnum reuse, int flags)
    throws IOException {
  final Terms terms = MultiTerms.getTerms(r, field);
  if (terms == null) {
    // field does not exist
    return null;
  }
  final TermsEnum te = terms.iterator();
  // null when the term is absent from the field
  return te.seekExact(term) ? docs(random, te, reuse, flags) : null;
}
// Returns a PostingsEnum with random features available
public static PostingsEnum docs(Random random, TermsEnum termsEnum, PostingsEnum reuse, int flags)
    throws IOException {
  // TODO: simplify this method? it would be easier to randomly either use the flags passed, or do
  // the random selection,
  // FREQS should be part fo the random selection instead of outside on its own?
  if (random.nextBoolean()) {
    if (random.nextBoolean()) {
      // Ignore the caller's flags entirely and request a random positional feature set.
      final int choice = random.nextInt(4);
      final int posFlags;
      if (choice == 0) {
        posFlags = PostingsEnum.POSITIONS;
      } else if (choice == 1) {
        posFlags = PostingsEnum.OFFSETS;
      } else if (choice == 2) {
        posFlags = PostingsEnum.PAYLOADS;
      } else {
        posFlags = PostingsEnum.ALL;
      }
      return termsEnum.postings(null, posFlags);
    }
    // Keep the caller's flags but additionally request frequencies.
    flags |= PostingsEnum.FREQS;
  }
  return termsEnum.postings(reuse, flags);
}
/** Wraps {@code string} in a randomly chosen {@link CharSequence} implementation. */
public static CharSequence stringToCharSequence(String string, Random random) {
return bytesToCharSequence(new BytesRef(string), random);
}
/** Decodes {@code ref} as UTF-8 into a randomly chosen {@link CharSequence} implementation. */
public static CharSequence bytesToCharSequence(BytesRef ref, Random random) {
  final int pick = random.nextInt(5);
  if (pick == 4) {
    // decode into a CharsRef
    final char[] buf = new char[ref.length];
    final int written = UnicodeUtil.UTF8toUTF16(ref.bytes, ref.offset, ref.length, buf);
    return new CharsRef(buf, 0, written);
  }
  if (pick == 3) {
    // wrap the decoded String in a CharBuffer
    return CharBuffer.wrap(ref.utf8ToString());
  }
  // most of the time (picks 0-2), just return a plain String
  return ref.utf8ToString();
}
/**
 * Shuts down the given {@link ExecutorService} and waits up to one second for it to terminate.
 * Interruption while waiting is reported on stderr and the thread's interrupt status is restored.
 */
public static void shutdownExecutorService(ExecutorService ex) {
  if (ex != null) {
    try {
      ex.shutdown();
      ex.awaitTermination(1, TimeUnit.SECONDS);
    } catch (InterruptedException e) {
      // Restore the interrupt status so callers can still observe the interruption.
      Thread.currentThread().interrupt();
      // Just report it on the syserr.
      System.err.println("Could not properly close executor service.");
      e.printStackTrace(System.err);
    }
  }
}
/**
 * Returns a valid (compiling) Pattern instance with random stuff inside. Be careful when applying
 * random patterns to longer strings as certain types of patterns may explode into exponential
 * times in backtracking implementations (such as Java's).
 */
public static Pattern randomPattern(Random random) {
// contains a supplementary (non-BMP) character, encoded as a surrogate pair
final String nonBmpString = "AB\uD840\uDC00C";
// loop until we generate a random regexp that both compiles and behaves sanely
while (true) {
try {
Pattern p = Pattern.compile(TestUtil.randomRegexpishString(random));
String replacement = null;
// ignore bugs in Sun's regex impl
try {
replacement = p.matcher(nonBmpString).replaceAll("_");
} catch (
@SuppressWarnings("unused")
StringIndexOutOfBoundsException jdkBug) {
System.out.println("WARNING: your jdk is buggy!");
System.out.println(
"Pattern.compile(\""
+ p.pattern()
+ "\").matcher(\"AB\\uD840\\uDC00C\").replaceAll(\"_\"); should not throw IndexOutOfBounds!");
}
// Make sure the result of applying the pattern to a string with extended
// unicode characters is a valid utf16 string. See LUCENE-4078 for discussion.
if (replacement != null && UnicodeUtil.validUTF16String(replacement)) {
return p;
}
} catch (
@SuppressWarnings("unused")
PatternSyntaxException ignored) {
// Loop trying until we hit something that compiles.
}
}
}
/**
 * Returns a random string of at most {@code maxLength} chars, usually composed of space-separated
 * 'words' (via {@link #randomSubString}) so it plays well with MockTokenizer-based tests.
 */
public static String randomAnalysisString(Random random, int maxLength, boolean simple) {
  assert maxLength >= 0;
  // Random.nextInt(bound) requires a positive bound, so a maxLength of 0
  // (explicitly permitted by the assert above) must short-circuit here to
  // avoid an IllegalArgumentException below.
  if (maxLength == 0) {
    return "";
  }
  // sometimes just a purely random string
  if (random.nextInt(31) == 0) {
    return randomSubString(random, random.nextInt(maxLength), simple);
  }
  // otherwise, try to make it more realistic with 'words' since most tests use MockTokenizer
  // first decide how big the string will really be: 0..n
  maxLength = random.nextInt(maxLength);
  int avgWordLength = TestUtil.nextInt(random, 3, 8);
  StringBuilder sb = new StringBuilder();
  while (sb.length() < maxLength) {
    if (sb.length() > 0) {
      sb.append(' ');
    }
    int wordLength = -1;
    while (wordLength < 0) {
      // sample a word length from a gaussian centered on avgWordLength
      wordLength = (int) (random.nextGaussian() * 3 + avgWordLength);
    }
    // never overshoot the target length
    wordLength = Math.min(wordLength, maxLength - sb.length());
    sb.append(randomSubString(random, wordLength, simple));
  }
  return sb.toString();
}
/** Returns a random 'word' of exactly {@code wordLength} chars, of varying unicode 'evilness'. */
public static String randomSubString(Random random, int wordLength, boolean simple) {
if (wordLength == 0) {
return "";
}
// pick how 'evil' (unicode-wise) this word is allowed to be, once per word
int evilness = TestUtil.nextInt(random, 0, 20);
StringBuilder sb = new StringBuilder();
while (sb.length() < wordLength) {
if (simple) {
sb.append(
random.nextBoolean()
? TestUtil.randomSimpleString(random, wordLength)
: TestUtil.randomHtmlishString(random, wordLength));
} else {
if (evilness < 10) {
sb.append(TestUtil.randomSimpleString(random, wordLength));
} else if (evilness < 15) {
assert sb.length() == 0; // we should always get wordLength back!
sb.append(TestUtil.randomRealisticUnicodeString(random, wordLength, wordLength));
} else if (evilness == 16) {
sb.append(TestUtil.randomHtmlishString(random, wordLength));
} else if (evilness == 17) {
// gives a lot of punctuation
sb.append(TestUtil.randomRegexpishString(random, wordLength));
} else {
// evilness 15 and 18..20 fall through to fully random unicode
sb.append(TestUtil.randomUnicodeString(random, wordLength));
}
}
}
// the helpers above may return fewer or more chars than asked; trim to exact length
if (sb.length() > wordLength) {
sb.setLength(wordLength);
// never end on a dangling high surrogate (would be invalid UTF-16)
if (Character.isHighSurrogate(sb.charAt(wordLength - 1))) {
sb.setLength(wordLength - 1);
}
}
if (random.nextInt(17) == 0) {
// mix up case
String mixedUp = TestUtil.randomlyRecaseCodePoints(random, sb.toString());
assert mixedUp.length() == sb.length();
return mixedUp;
} else {
return sb.toString();
}
}
/**
 * For debugging: tries to include br.utf8ToString(), but if that fails (because it's not valid
 * utf8, which is fine!), just use ordinary toString.
 */
public static String bytesRefToString(BytesRef br) {
  if (br == null) {
    return "(null)";
  }
  try {
    return br.utf8ToString() + " " + br.toString();
  } catch (@SuppressWarnings("unused") AssertionError | IllegalArgumentException t) {
    // Not valid UTF-8 (or a prefix cut mid-codepoint): fall back to the hex form only.
    return br.toString();
  }
}
/** Returns a copy of the source directory, with file contents stored in RAM. */
public static Directory ramCopyOf(Directory dir) throws IOException {
  final Directory copy = new ByteBuffersDirectory();
  for (String name : dir.listAll()) {
    // only copy recognized index files (segments files and codec-named files)
    final boolean isIndexFile =
        name.startsWith(IndexFileNames.SEGMENTS)
            || IndexFileNames.CODEC_FILE_PATTERN.matcher(name).matches();
    if (isIndexFile) {
      copy.copyFrom(dir, name, name, IOContext.DEFAULT);
    }
  }
  return copy;
}
/** Returns true if {@code dir} is an FSDirectory whose filesystem chain includes a WindowsFS. */
public static boolean hasWindowsFS(Directory dir) {
  dir = FilterDirectory.unwrap(dir);
  if (dir instanceof FSDirectory) {
    // Delegate to the Path-based overload instead of duplicating the
    // filesystem-walking loop; mirrors hasVirusChecker(Directory).
    return hasWindowsFS(((FSDirectory) dir).getDirectory());
  }
  return false;
}
/** Returns true if {@code path}'s filesystem chain includes a WindowsFS wrapper. */
public static boolean hasWindowsFS(Path path) {
  // walk down through the chain of wrapping filesystems
  for (FileSystem fs = path.getFileSystem();
      fs instanceof FilterFileSystem;
      fs = ((FilterFileSystem) fs).getDelegate()) {
    if (((FilterFileSystem) fs).getParent() instanceof WindowsFS) {
      return true;
    }
  }
  return false;
}
/** Returns true if {@code dir} is an FSDirectory whose filesystem chain includes a VirusCheckingFS. */
public static boolean hasVirusChecker(Directory dir) {
  final Directory unwrapped = FilterDirectory.unwrap(dir);
  // non-FSDirectory implementations cannot have a virus checker
  return unwrapped instanceof FSDirectory
      && hasVirusChecker(((FSDirectory) unwrapped).getDirectory());
}
/** Returns true if {@code path}'s filesystem chain includes a VirusCheckingFS wrapper. */
public static boolean hasVirusChecker(Path path) {
  // walk down through the chain of wrapping filesystems
  for (FileSystem fs = path.getFileSystem();
      fs instanceof FilterFileSystem;
      fs = ((FilterFileSystem) fs).getDelegate()) {
    if (((FilterFileSystem) fs).getParent() instanceof VirusCheckingFS) {
      return true;
    }
  }
  return false;
}
/** Returns true if VirusCheckingFS is in use and was in fact already enabled */
public static boolean disableVirusChecker(Directory in) {
Directory dir = FilterDirectory.unwrap(in);
if (dir instanceof FSDirectory) {
FileSystem fs = ((FSDirectory) dir).getDirectory().getFileSystem();
// walk the chain of wrapping filesystems looking for a VirusCheckingFS
while (fs instanceof FilterFileSystem) {
FilterFileSystem ffs = (FilterFileSystem) fs;
if (ffs.getParent() instanceof VirusCheckingFS) {
VirusCheckingFS vfs = (VirusCheckingFS) ffs.getParent();
// capture prior state before disabling so callers can restore it later
boolean isEnabled = vfs.isEnabled();
vfs.disable();
return isEnabled;
}
fs = ffs.getDelegate();
}
}
// no VirusCheckingFS in the chain (or not an FSDirectory at all)
return false;
}
/** Re-enables the VirusCheckingFS wrapping {@code in}'s filesystem, if one is present. */
public static void enableVirusChecker(Directory in) {
Directory dir = FilterDirectory.unwrap(in);
if (dir instanceof FSDirectory) {
FileSystem fs = ((FSDirectory) dir).getDirectory().getFileSystem();
// walk the chain of wrapping filesystems looking for a VirusCheckingFS
while (fs instanceof FilterFileSystem) {
FilterFileSystem ffs = (FilterFileSystem) fs;
if (ffs.getParent() instanceof VirusCheckingFS) {
VirusCheckingFS vfs = (VirusCheckingFS) ffs.getParent();
vfs.enable();
return;
}
fs = ffs.getDelegate();
}
}
// silently a no-op when no VirusCheckingFS is in the chain
}
}