/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import java.io.IOException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.Collections;
import java.util.List;
import java.util.stream.Collectors;
import org.apache.lucene.store.ByteBuffersDataOutput;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests {@link FSTDictionary}: building it from block keys and block file pointers, looking up
 * blocks through its browser, and writing it out and reading it back.
 */
public class TestFSTDictionary extends LuceneTestCase {

  /** The empty term is accepted as a block key and can be looked up afterwards. */
  public void testEmptyTermSupported() throws Exception {
    FSTDictionary indexDictionary =
        createFSTDictionary(Collections.singletonList(new BytesRef()), new int[] {588});
    assertEquals(588, indexDictionary.browser().seekBlock(new BytesRef()));
  }

  /**
   * Adding the same term twice must be rejected with exactly an {@link
   * UnsupportedOperationException}, both for the empty term and for a non-empty one.
   */
  public void testRepeatedTermNotAllowed() {
    for (BytesRef term : new BytesRef[] {new BytesRef(), new BytesRef("a")}) {
      try {
        createFSTDictionary(Arrays.asList(term, term), new int[] {0, 1});
        // fail() raises an AssertionError, which is not an Exception, so it is not swallowed by
        // the catch clause below.
        fail("Expected exception not thrown");
      } catch (Exception e) {
        // Compare the classes by identity: a subclass or any other exception type is a failure.
        assertSame(UnsupportedOperationException.class, e.getClass());
      }
    }
  }

  /** Two distinct terms may be mapped to the same block file pointer. */
  public void testRepeatedOutputAllowed() throws Exception {
    BytesRef[] terms = {new BytesRef("a"), new BytesRef("b")};
    FSTDictionary indexDictionary = createFSTDictionary(Arrays.asList(terms), new int[] {588, 588});
    assertEquals(588, indexDictionary.browser().seekBlock(new BytesRef("a")));
    assertEquals(588, indexDictionary.browser().seekBlock(new BytesRef("b")));
  }

  /**
   * Writes a dictionary and reads it back, with and without a block encoder, then checks that the
   * reloaded instance still resolves terms to the block file pointers they were added with.
   */
  public void testSerialization() throws IOException {
    List<String> vocab = Arrays.asList("aswoon", "asyl", "asyla", "asyllabic");
    for (boolean shouldEncode : new boolean[] {false, true}) {
      FSTDictionary srcDictionary = createFSTDictionary(vocab);
      FSTDictionary fstDictionary = serializeAndReadDictionary(srcDictionary, shouldEncode);
      assertNotSame(srcDictionary, fstDictionary);

      // The empty term sorts before every key, so no block is expected for it.
      assertEquals(-1L, fstDictionary.browser().seekBlock(new BytesRef()));

      // Compare values, not references: assertNotSame on two long arguments would box them and
      // test object identity, which only depends on the Long cache and not on the numbers.
      // createFSTDictionary(List<String>) uses each term's list index as its block file pointer,
      // so the first key maps to 0 and a term sorting after every key resolves to the last
      // block (index 3).
      assertEquals(0L, fstDictionary.browser().seekBlock(new BytesRef("aswoon")));
      assertEquals(3L, fstDictionary.browser().seekBlock(new BytesRef("z")));
    }
  }

  /**
   * Same round trip as {@link #testSerialization()}, for a dictionary whose only key is the empty
   * term mapped to block file pointer 0.
   */
  public void testSerializationEmptyTerm() throws IOException {
    for (boolean shouldEncode : new boolean[] {false, true}) {
      // new int[1] is {0}: the single empty key is mapped to block file pointer 0.
      FSTDictionary srcDictionary =
          createFSTDictionary(Collections.singletonList(new BytesRef()), new int[1]);
      FSTDictionary fstDictionary = serializeAndReadDictionary(srcDictionary, shouldEncode);
      assertNotSame(srcDictionary, fstDictionary);
      assertEquals(0, fstDictionary.browser().seekBlock(new BytesRef()));
    }
  }

  /**
   * Builds a dictionary from sorted keys that share long common prefixes and checks lookups for
   * exact keys, for a term after the last key, for a term before the first key, and for a term
   * that falls between two keys.
   */
  public void testCommonPrefixes() throws Exception {
    List<String> vocab = new ArrayList<>();
    Collections.addAll(
        vocab,
        "aswoon",
        "asyl",
        "asyla",
        "asyllabic",
        "asylum",
        "asylums",
        "asymmetric",
        "asymmetrical",
        "asymmetrically",
        "asymmetries",
        "asymmetry",
        "asymptomatic",
        "asymptomatically",
        "asymptote",
        "asymptotes",
        "asymptotic",
        "asymptotical",
        "asymptotically",
        "asynapses",
        "asynapsis");

    // Each key's block file pointer is simply its position in the vocabulary.
    int[] blockFPs = new int[vocab.size()];
    for (int i = 0; i < blockFPs.length; i++) {
      blockFPs[i] = i;
    }
    List<BytesRef> blockKeys = vocab.stream().map(BytesRef::new).collect(Collectors.toList());
    FSTDictionary indexDictionary = createFSTDictionary(blockKeys, blockFPs);
    IndexDictionary.Browser browser = indexDictionary.browser();

    // Every exact key resolves to its own block.
    for (int i = 0; i < vocab.size(); i++) {
      assertEquals(blockFPs[i], browser.seekBlock(blockKeys.get(i)));
    }
    // A term after the last key resolves to the last block.
    assertEquals(blockFPs[vocab.size() - 1], browser.seekBlock(new BytesRef("zoo")));
    // A term before the first key has no block.
    assertEquals(-1, browser.seekBlock(new BytesRef("A")));
    // "asymmetriesz" lies between "asymmetries" (index 9) and "asymmetry": the preceding key's
    // block is expected.
    assertEquals(blockFPs[9], browser.seekBlock(new BytesRef("asymmetriesz")));
  }

  /**
   * Builds an {@link FSTDictionary} by adding each block key with the block file pointer found at
   * the same index.
   *
   * @param blockKeys the block keys, added in list order
   * @param blockFPs the block file pointers; {@code blockFPs[i]} is paired with {@code
   *     blockKeys.get(i)}
   * @return the built dictionary
   * @throws IOException if adding an entry or building the dictionary fails
   */
  private static FSTDictionary createFSTDictionary(List<BytesRef> blockKeys, int[] blockFPs)
      throws IOException {
    FSTDictionary.Builder builder = new FSTDictionary.Builder();
    for (int i = 0; i < blockKeys.size(); i++) {
      builder.add(blockKeys.get(i), blockFPs[i]);
    }
    return builder.build();
  }

  /**
   * Builds an {@link FSTDictionary} from a vocabulary, using each term's index in the list as its
   * block file pointer.
   *
   * @param vocab the terms, added in list order
   * @return the built dictionary
   * @throws IOException if adding an entry or building the dictionary fails
   */
  private static FSTDictionary createFSTDictionary(List<String> vocab) throws IOException {
    FSTDictionary.Builder builder = new FSTDictionary.Builder();
    for (int i = 0; i < vocab.size(); i++) {
      builder.add(new BytesRef(vocab.get(i)), i);
    }
    return builder.build();
  }

  /**
   * Writes the given dictionary to an in-memory output and reads a new instance back from it.
   *
   * @param srcDictionary the dictionary to write
   * @param shouldEncode when {@code true}, the block encoder and decoder supplied by {@code
   *     Rot13CypherTestUtil} are used for writing and reading; when {@code false}, {@code null} is
   *     passed for both
   * @return the dictionary read back from the written bytes
   * @throws IOException if writing or reading fails
   */
  private static FSTDictionary serializeAndReadDictionary(
      FSTDictionary srcDictionary, boolean shouldEncode) throws IOException {
    ByteBuffersDataOutput output = ByteBuffersDataOutput.newResettableInstance();
    srcDictionary.write(output, shouldEncode ? Rot13CypherTestUtil.getBlockEncoder() : null);
    // We must load the FST on-heap since we use a ByteBuffersDataInput which is not an instance of
    // IndexInput.
    return FSTDictionary.read(
        output.toDataInput(), shouldEncode ? Rot13CypherTestUtil.getBlockDecoder() : null, true);
  }
}