| package org.apache.lucene.index; |
| |
| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.PrintStream; |
| import java.util.Arrays; |
| import java.util.Collection; |
| import java.util.Collections; |
| import java.util.HashMap; |
| import java.util.HashSet; |
| import java.util.IdentityHashMap; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.Set; |
| |
| import org.apache.lucene.analysis.Analyzer; |
| import org.apache.lucene.analysis.MockAnalyzer; |
| import org.apache.lucene.codecs.Codec; |
| import org.apache.lucene.codecs.DocValuesConsumer; |
| import org.apache.lucene.codecs.DocValuesProducer; |
| import org.apache.lucene.codecs.FieldsConsumer; |
| import org.apache.lucene.codecs.FieldsProducer; |
| import org.apache.lucene.codecs.NormsConsumer; |
| import org.apache.lucene.codecs.NormsProducer; |
| import org.apache.lucene.codecs.StoredFieldsReader; |
| import org.apache.lucene.codecs.StoredFieldsWriter; |
| import org.apache.lucene.codecs.TermVectorsReader; |
| import org.apache.lucene.codecs.TermVectorsWriter; |
| import org.apache.lucene.codecs.mockrandom.MockRandomPostingsFormat; |
| import org.apache.lucene.document.Document; |
| import org.apache.lucene.document.Field; |
| import org.apache.lucene.document.FieldType; |
| import org.apache.lucene.document.NumericDocValuesField; |
| import org.apache.lucene.document.TextField; |
| import org.apache.lucene.store.AlreadyClosedException; |
| import org.apache.lucene.store.Directory; |
| import org.apache.lucene.store.FlushInfo; |
| import org.apache.lucene.store.IOContext; |
| import org.apache.lucene.store.IndexInput; |
| import org.apache.lucene.store.MockDirectoryWrapper; |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.CloseableThreadLocal; |
| import org.apache.lucene.util.IOUtils; |
| import org.apache.lucene.util.InfoStream; |
| import org.apache.lucene.util.LuceneTestCase; |
| import org.apache.lucene.util.RamUsageEstimator; |
| import org.apache.lucene.util.RamUsageTester; |
| import org.apache.lucene.util.Rethrow; |
| import org.apache.lucene.util.StringHelper; |
| import org.apache.lucene.util.TestUtil; |
| import org.apache.lucene.util.Version; |
| |
| /** |
| * Common tests to all index formats. |
| */ |
| abstract class BaseIndexFileFormatTestCase extends LuceneTestCase { |
| |
// Classes whose instances are skipped by Accumulator#accumulateObject
// (metadata or Directory-level objects). Membership is by class identity.
private static final Set<Class<?>> EXCLUDED_CLASSES = Collections.newSetFromMap(new IdentityHashMap<Class<?>,Boolean>());

static {
  final Class<?>[] excluded = {
    // Directory objects, don't take into account eg. the NIO buffers
    Directory.class,
    IndexInput.class,

    // used for thread management, not by the index
    CloseableThreadLocal.class,
    ThreadLocal.class,

    // don't follow references to the top-level reader
    IndexReader.class,
    IndexReaderContext.class,

    // usually small but can bump memory usage for
    // memory-efficient things like stored fields
    FieldInfos.class,
    SegmentInfo.class,
    SegmentCommitInfo.class,
    FieldInfo.class,

    // constant overhead is typically due to strings
    // TODO: can we remove this and still pass the test consistently
    String.class
  };
  for (Class<?> clazz : excluded) {
    EXCLUDED_CLASSES.add(clazz);
  }
}
| |
| static class Accumulator extends RamUsageTester.Accumulator { |
| |
| private final Object root; |
| |
| Accumulator(Object root) { |
| this.root = root; |
| } |
| |
| public long accumulateObject(Object o, long shallowSize, Map<java.lang.reflect.Field, Object> fieldValues, Collection<Object> queue) { |
| for (Class<?> clazz = o.getClass(); clazz != null; clazz = clazz.getSuperclass()) { |
| if (EXCLUDED_CLASSES.contains(clazz) && o != root) { |
| return 0; |
| } |
| } |
| // we have no way to estimate the size of these things in codecs although |
| // something like a Collections.newSetFromMap(new HashMap<>()) uses quite |
| // some memory... So for now the test ignores the overhead of such |
| // collections but can we do better? |
| long v; |
| if (o instanceof Collection) { |
| Collection<?> coll = (Collection<?>) o; |
| queue.addAll((Collection<?>) o); |
| v = (long) coll.size() * RamUsageEstimator.NUM_BYTES_OBJECT_REF; |
| } else if (o instanceof Map) { |
| final Map<?, ?> map = (Map<?,?>) o; |
| queue.addAll(map.keySet()); |
| queue.addAll(map.values()); |
| v = 2L * map.size() * RamUsageEstimator.NUM_BYTES_OBJECT_REF; |
| } else { |
| v = super.accumulateObject(o, shallowSize, fieldValues, queue); |
| } |
| // System.out.println(o.getClass() + "=" + v); |
| return v; |
| } |
| |
| @Override |
| public long accumulateArray(Object array, long shallowSize, |
| List<Object> values, Collection<Object> queue) { |
| long v = super.accumulateArray(array, shallowSize, values, queue); |
| // System.out.println(array.getClass() + "=" + v); |
| return v; |
| } |
| |
| }; |
| |
| /** Returns the codec to run tests against */ |
| protected abstract Codec getCodec(); |
| |
| private Codec savedCodec; |
| |
| public void setUp() throws Exception { |
| super.setUp(); |
| // set the default codec, so adding test cases to this isn't fragile |
| savedCodec = Codec.getDefault(); |
| Codec.setDefault(getCodec()); |
| } |
| |
| public void tearDown() throws Exception { |
| Codec.setDefault(savedCodec); // restore |
| super.tearDown(); |
| } |
| |
| /** Add random fields to the provided document. */ |
| protected abstract void addRandomFields(Document doc); |
| |
| private Map<String, Long> bytesUsedByExtension(Directory d) throws IOException { |
| Map<String, Long> bytesUsedByExtension = new HashMap<>(); |
| for (String file : d.listAll()) { |
| if (IndexFileNames.CODEC_FILE_PATTERN.matcher(file).matches()) { |
| final String ext = IndexFileNames.getExtension(file); |
| final long previousLength = bytesUsedByExtension.containsKey(ext) ? bytesUsedByExtension.get(ext) : 0; |
| bytesUsedByExtension.put(ext, previousLength + d.fileLength(file)); |
| } |
| } |
| bytesUsedByExtension.keySet().removeAll(excludedExtensionsFromByteCounts()); |
| |
| return bytesUsedByExtension; |
| } |
| |
| /** |
| * Return the list of extensions that should be excluded from byte counts when |
| * comparing indices that store the same content. |
| */ |
| protected Collection<String> excludedExtensionsFromByteCounts() { |
| return new HashSet<String>(Arrays.asList(new String[] { |
| // segment infos store various pieces of information that don't solely depend |
| // on the content of the index in the diagnostics (such as a timestamp) so we |
| // exclude this file from the bytes counts |
| "si", |
| // lock files are 0 bytes (one directory in the test could be RAMDir, the other FSDir) |
| "lock" })); |
| } |
| |
| /** The purpose of this test is to make sure that bulk merge doesn't accumulate useless data over runs. */ |
| public void testMergeStability() throws Exception { |
| Directory dir = newDirectory(); |
| if (dir instanceof MockDirectoryWrapper) { |
| // Else, the virus checker may prevent deletion of files and cause |
| // us to see too many bytes used by extension in the end: |
| ((MockDirectoryWrapper) dir).setEnableVirusScanner(false); |
| } |
| // do not use newMergePolicy that might return a MockMergePolicy that ignores the no-CFS ratio |
| // do not use RIW which will change things up! |
| MergePolicy mp = newTieredMergePolicy(); |
| mp.setNoCFSRatio(0); |
| IndexWriterConfig cfg = new IndexWriterConfig(new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp); |
| IndexWriter w = new IndexWriter(dir, cfg); |
| final int numDocs = atLeast(500); |
| for (int i = 0; i < numDocs; ++i) { |
| Document d = new Document(); |
| addRandomFields(d); |
| w.addDocument(d); |
| } |
| w.forceMerge(1); |
| w.commit(); |
| w.close(); |
| DirectoryReader reader = DirectoryReader.open(dir); |
| |
| Directory dir2 = newDirectory(); |
| if (dir2 instanceof MockDirectoryWrapper) { |
| // Else, the virus checker may prevent deletion of files and cause |
| // us to see too many bytes used by extension in the end: |
| ((MockDirectoryWrapper) dir2).setEnableVirusScanner(false); |
| } |
| mp = newTieredMergePolicy(); |
| mp.setNoCFSRatio(0); |
| cfg = new IndexWriterConfig(new MockAnalyzer(random())).setUseCompoundFile(false).setMergePolicy(mp); |
| w = new IndexWriter(dir2, cfg); |
| TestUtil.addIndexesSlowly(w, reader); |
| |
| w.commit(); |
| w.close(); |
| |
| assertEquals(bytesUsedByExtension(dir), bytesUsedByExtension(dir2)); |
| |
| reader.close(); |
| dir.close(); |
| dir2.close(); |
| } |
| |
| /** Test the accuracy of the ramBytesUsed estimations. */ |
| @Slow |
| public void testRamBytesUsed() throws IOException { |
| if (Codec.getDefault() instanceof RandomCodec) { |
| // this test relies on the fact that two segments will be written with |
| // the same codec so we need to disable MockRandomPF |
| final Set<String> avoidCodecs = new HashSet<>(((RandomCodec) Codec.getDefault()).avoidCodecs); |
| avoidCodecs.add(new MockRandomPostingsFormat().getName()); |
| Codec.setDefault(new RandomCodec(random(), avoidCodecs)); |
| } |
| Directory dir = newDirectory(); |
| IndexWriterConfig cfg = newIndexWriterConfig(new MockAnalyzer(random())); |
| IndexWriter w = new IndexWriter(dir, cfg); |
| // we need to index enough documents so that constant overhead doesn't dominate |
| final int numDocs = atLeast(10000); |
| LeafReader reader1 = null; |
| for (int i = 0; i < numDocs; ++i) { |
| Document d = new Document(); |
| addRandomFields(d); |
| w.addDocument(d); |
| if (i == 100) { |
| w.forceMerge(1); |
| w.commit(); |
| reader1 = getOnlySegmentReader(DirectoryReader.open(dir)); |
| } |
| } |
| w.forceMerge(1); |
| w.commit(); |
| w.close(); |
| |
| LeafReader reader2 = getOnlySegmentReader(DirectoryReader.open(dir)); |
| |
| for (LeafReader reader : Arrays.asList(reader1, reader2)) { |
| new SimpleMergedSegmentWarmer(InfoStream.NO_OUTPUT).warm(reader); |
| } |
| |
| final long actualBytes = RamUsageTester.sizeOf(reader2, new Accumulator(reader2)) - RamUsageTester.sizeOf(reader1, new Accumulator(reader1)); |
| final long expectedBytes = ((SegmentReader) reader2).ramBytesUsed() - ((SegmentReader) reader1).ramBytesUsed(); |
| final long absoluteError = actualBytes - expectedBytes; |
| final double relativeError = (double) absoluteError / actualBytes; |
| final String message = "Actual RAM usage " + actualBytes + ", but got " + expectedBytes + ", " + 100*relativeError + "% error"; |
| assertTrue(message, Math.abs(relativeError) < 0.20d || Math.abs(absoluteError) < 1000); |
| |
| reader1.close(); |
| reader2.close(); |
| dir.close(); |
| } |
| |
| /** Calls close multiple times on closeable codec apis */ |
| public void testMultiClose() throws IOException { |
| // first make a one doc index |
| Directory oneDocIndex = newDirectory(); |
| IndexWriter iw = new IndexWriter(oneDocIndex, new IndexWriterConfig(new MockAnalyzer(random()))); |
| Document oneDoc = new Document(); |
| FieldType customType = new FieldType(TextField.TYPE_STORED); |
| customType.setStoreTermVectors(true); |
| Field customField = new Field("field", "contents", customType); |
| oneDoc.add(customField); |
| oneDoc.add(new NumericDocValuesField("field", 5)); |
| iw.addDocument(oneDoc); |
| LeafReader oneDocReader = getOnlySegmentReader(DirectoryReader.open(iw, true)); |
| iw.close(); |
| |
| // now feed to codec apis manually |
| // we use FSDir, things like ramdir are not guaranteed to cause fails if you write to them after close(), etc |
| Directory dir = newFSDirectory(createTempDir("justSoYouGetSomeChannelErrors")); |
| Codec codec = getCodec(); |
| |
| SegmentInfo segmentInfo = new SegmentInfo(dir, Version.LATEST, "_0", 1, false, codec, Collections.emptyMap(), StringHelper.randomId(), new HashMap<>()); |
| FieldInfo proto = oneDocReader.getFieldInfos().fieldInfo("field"); |
| FieldInfo field = new FieldInfo(proto.name, proto.number, proto.hasVectors(), proto.omitsNorms(), proto.hasPayloads(), |
| proto.getIndexOptions(), proto.getDocValuesType(), proto.getDocValuesGen(), new HashMap<>()); |
| |
| FieldInfos fieldInfos = new FieldInfos(new FieldInfo[] { field } ); |
| |
| SegmentWriteState writeState = new SegmentWriteState(null, dir, |
| segmentInfo, fieldInfos, |
| null, new IOContext(new FlushInfo(1, 20))); |
| |
| SegmentReadState readState = new SegmentReadState(dir, segmentInfo, fieldInfos, IOContext.READ); |
| |
| // PostingsFormat |
| try (FieldsConsumer consumer = codec.postingsFormat().fieldsConsumer(writeState)) { |
| consumer.write(oneDocReader.fields()); |
| IOUtils.close(consumer); |
| IOUtils.close(consumer); |
| } |
| try (FieldsProducer producer = codec.postingsFormat().fieldsProducer(readState)) { |
| IOUtils.close(producer); |
| IOUtils.close(producer); |
| } |
| |
| // DocValuesFormat |
| try (DocValuesConsumer consumer = codec.docValuesFormat().fieldsConsumer(writeState)) { |
| consumer.addNumericField(field, Collections.singleton(5)); |
| IOUtils.close(consumer); |
| IOUtils.close(consumer); |
| } |
| try (DocValuesProducer producer = codec.docValuesFormat().fieldsProducer(readState)) { |
| IOUtils.close(producer); |
| IOUtils.close(producer); |
| } |
| |
| // NormsFormat |
| try (NormsConsumer consumer = codec.normsFormat().normsConsumer(writeState)) { |
| consumer.addNormsField(field, Collections.singleton(5)); |
| IOUtils.close(consumer); |
| IOUtils.close(consumer); |
| } |
| try (NormsProducer producer = codec.normsFormat().normsProducer(readState)) { |
| IOUtils.close(producer); |
| IOUtils.close(producer); |
| } |
| |
| // TermVectorsFormat |
| try (TermVectorsWriter consumer = codec.termVectorsFormat().vectorsWriter(dir, segmentInfo, writeState.context)) { |
| consumer.startDocument(1); |
| consumer.startField(field, 1, false, false, false); |
| consumer.startTerm(new BytesRef("testing"), 2); |
| consumer.finishTerm(); |
| consumer.finishField(); |
| consumer.finishDocument(); |
| consumer.finish(fieldInfos, 1); |
| IOUtils.close(consumer); |
| IOUtils.close(consumer); |
| } |
| try (TermVectorsReader producer = codec.termVectorsFormat().vectorsReader(dir, segmentInfo, fieldInfos, readState.context)) { |
| IOUtils.close(producer); |
| IOUtils.close(producer); |
| } |
| |
| // StoredFieldsFormat |
| try (StoredFieldsWriter consumer = codec.storedFieldsFormat().fieldsWriter(dir, segmentInfo, writeState.context)) { |
| consumer.startDocument(); |
| consumer.writeField(field, customField); |
| consumer.finishDocument(); |
| consumer.finish(fieldInfos, 1); |
| IOUtils.close(consumer); |
| IOUtils.close(consumer); |
| } |
| try (StoredFieldsReader producer = codec.storedFieldsFormat().fieldsReader(dir, segmentInfo, fieldInfos, readState.context)) { |
| IOUtils.close(producer); |
| IOUtils.close(producer); |
| } |
| |
| IOUtils.close(oneDocReader, oneDocIndex, dir); |
| } |
| |
| /** Tests exception handling on write and openInput/createOutput */ |
| // TODO: this is really not ideal. each BaseXXXTestCase should have unit tests doing this. |
| // but we use this shotgun approach to prevent bugs in the meantime: it just ensures the |
| // codec does not corrupt the index or leak file handles. |
| public void testRandomExceptions() throws Exception { |
| // disable slow things: we don't rely upon sleeps here. |
| MockDirectoryWrapper dir = newMockDirectory(); |
| dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER); |
| dir.setUseSlowOpenClosers(false); |
| dir.setPreventDoubleWrite(false); |
| dir.setRandomIOExceptionRate(0.001); // more rare |
| |
| // log all exceptions we hit, in case we fail (for debugging) |
| ByteArrayOutputStream exceptionLog = new ByteArrayOutputStream(); |
| PrintStream exceptionStream = new PrintStream(exceptionLog, true, "UTF-8"); |
| //PrintStream exceptionStream = System.out; |
| |
| Analyzer analyzer = new MockAnalyzer(random()); |
| |
| IndexWriterConfig conf = newIndexWriterConfig(analyzer); |
| // just for now, try to keep this test reproducible |
| conf.setMergeScheduler(new SerialMergeScheduler()); |
| conf.setCodec(getCodec()); |
| |
| int numDocs = atLeast(500); |
| |
| IndexWriter iw = new IndexWriter(dir, conf); |
| try { |
| boolean allowAlreadyClosed = false; |
| for (int i = 0; i < numDocs; i++) { |
| dir.setRandomIOExceptionRateOnOpen(0.02); // turn on exceptions for openInput/createOutput |
| |
| Document doc = new Document(); |
| doc.add(newStringField("id", Integer.toString(i), Field.Store.NO)); |
| addRandomFields(doc); |
| |
| // single doc |
| try { |
| iw.addDocument(doc); |
| // we made it, sometimes delete our doc |
| iw.deleteDocuments(new Term("id", Integer.toString(i))); |
| } catch (AlreadyClosedException ace) { |
| // OK: writer was closed by abort; we just reopen now: |
| dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration |
| assertTrue(iw.deleter.isClosed()); |
| assertTrue(allowAlreadyClosed); |
| allowAlreadyClosed = false; |
| conf = newIndexWriterConfig(analyzer); |
| // just for now, try to keep this test reproducible |
| conf.setMergeScheduler(new SerialMergeScheduler()); |
| conf.setCodec(getCodec()); |
| iw = new IndexWriter(dir, conf); |
| } catch (Exception e) { |
| if (e.getMessage() != null && e.getMessage().startsWith("a random IOException")) { |
| exceptionStream.println("\nTEST: got expected fake exc:" + e.getMessage()); |
| e.printStackTrace(exceptionStream); |
| allowAlreadyClosed = true; |
| } else { |
| Rethrow.rethrow(e); |
| } |
| } |
| |
| if (random().nextInt(10) == 0) { |
| // trigger flush: |
| try { |
| if (random().nextBoolean()) { |
| DirectoryReader ir = null; |
| try { |
| ir = DirectoryReader.open(iw, random().nextBoolean()); |
| dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration |
| TestUtil.checkReader(ir); |
| } finally { |
| IOUtils.closeWhileHandlingException(ir); |
| } |
| } else { |
| dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration: |
| // or we make slowExists angry and trip a scarier assert! |
| iw.commit(); |
| } |
| if (DirectoryReader.indexExists(dir)) { |
| TestUtil.checkIndex(dir); |
| } |
| } catch (AlreadyClosedException ace) { |
| // OK: writer was closed by abort; we just reopen now: |
| dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration |
| assertTrue(iw.deleter.isClosed()); |
| assertTrue(allowAlreadyClosed); |
| allowAlreadyClosed = false; |
| conf = newIndexWriterConfig(analyzer); |
| // just for now, try to keep this test reproducible |
| conf.setMergeScheduler(new SerialMergeScheduler()); |
| conf.setCodec(getCodec()); |
| iw = new IndexWriter(dir, conf); |
| } catch (Exception e) { |
| if (e.getMessage() != null && e.getMessage().startsWith("a random IOException")) { |
| exceptionStream.println("\nTEST: got expected fake exc:" + e.getMessage()); |
| e.printStackTrace(exceptionStream); |
| allowAlreadyClosed = true; |
| } else { |
| Rethrow.rethrow(e); |
| } |
| } |
| } |
| } |
| |
| try { |
| dir.setRandomIOExceptionRateOnOpen(0.0); // disable exceptions on openInput until next iteration: |
| // or we make slowExists angry and trip a scarier assert! |
| iw.close(); |
| } catch (Exception e) { |
| if (e.getMessage() != null && e.getMessage().startsWith("a random IOException")) { |
| exceptionStream.println("\nTEST: got expected fake exc:" + e.getMessage()); |
| e.printStackTrace(exceptionStream); |
| try { |
| iw.rollback(); |
| } catch (Throwable t) {} |
| } else { |
| Rethrow.rethrow(e); |
| } |
| } |
| dir.close(); |
| } catch (Throwable t) { |
| System.out.println("Unexpected exception: dumping fake-exception-log:..."); |
| exceptionStream.flush(); |
| System.out.println(exceptionLog.toString("UTF-8")); |
| System.out.flush(); |
| Rethrow.rethrow(t); |
| } |
| |
| if (VERBOSE) { |
| System.out.println("TEST PASSED: dumping fake-exception-log:..."); |
| System.out.println(exceptionLog.toString("UTF-8")); |
| } |
| } |
| } |