blob: d5c0402d5e7a6f3f755e66f2048025baf504165d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.lucene.codecs.uniformsplit;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Map;
import java.util.stream.Collectors;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.LuceneTestCase;
/**
* Tests {@link TermBytes}.
*/
public class TestTermBytes extends LuceneTestCase {
public void testMDPA() {
validateExpectedMDP(new String[][]{
{"aa", "a"},
{"abbreviator", "ab"},
{"abidingly", "abi"},
{"aboiteaus", "abo"},
{"abranchiates", "abr"},
{"absentminded", "abs"},
});
}
public void testIncrementalA() {
validateExpectedSuffix(new String[][]{
{"aa", "0aa"},
{"abbreviator", "1bbreviator"},
{"abidingly", "2idingly"},
{"aboiteaus", "2oiteaus"},
{"abranchiates", "2ranchiates"},
{"absentminded", "2sentminded"},
{"rodriguez", "0rodriguez"},
{"romero", "2mero"},
});
}
public void testMDPMIX2() {
validateExpectedMDP(new String[][]{
{"abaco", "a"},
{"amigo", "am"},
{"bloom", "b"},
{"break", "br"},
{"can", "c"},
{"car", "car"},
{"carmagedon", "carm"},
{"danger", "d"},
{"lala", "l"},
{"literature", "li"},
{"lucene", "lu"},
{"nature", "n"},
{"naval", "nav"},
{"rico", "r"},
{"weird", "w"},
{"zoo", "z"},
});
}
public void testMDP() {
validateExpectedMDP(new String[][]{
{"abaco", "a"},
{"amigo", "am"},
{"arco", "ar"},
{"bloom", "b"},
{"frien", "f"},
{"frienchies", "frienc"},
{"friend", "friend"},
{"friendalan", "frienda"},
{"friende", "friende"},
});
}
public void testIncremental() {
validateExpectedSuffix(new String[][]{
{"abaco", "0abaco"},
{"amigo", "1migo"},
{"arco", "1rco"},
{"bloom", "0bloom"},
{"frien", "0frien"},
{"frienchies", "5chies"},
{"friend", "5d"},
{"friendalan", "6alan"},
{"friende", "6e"},
});
}
public void testIncrementalSimple() {
validateExpectedSuffix(new String[][]{
{"abaco", "0abaco"},
{"rodriguez", "0rodriguez"},
{"roma", "2ma"},
{"romero", "3ero"},
});
}
public void testMDPSimple() {
validateExpectedMDP(new String[][]{
{"abaco", "a"},
{"rodriguez", "r"},
{"romero", "rom"},
});
}
public void testMDPMIX() {
validateExpectedMDP(new String[][]{
{"aaab", "a"},
{"arco", "ar"},
{"busqueda", "b"},
{"trabajo", "t"},
{"zufix", "z"},
{"zzfix", "zz"},
});
}
private void validateExpectedSuffix(String[][] vocab) {
Map<String, String> vocabMap = toMap(vocab);
validateExpectedSuffix(vocabMap);
validateIncrementalDecoding(vocabMap);
}
private void validateExpectedSuffix(Map<String, String> vocab) {
List<BytesRef> src = vocab.keySet().stream().sorted().map(BytesRef::new).collect(Collectors.toList());
List<TermBytes> output = compressPrefixes(src);
validateMapList(vocab,
src.stream().map(BytesRef::utf8ToString).collect(Collectors.toList()),
output.stream().map(e -> e.getSuffixOffset() + createSuffixBytes(e).utf8ToString()).collect(Collectors.toList()));
}
private BytesRef createSuffixBytes(TermBytes termBytes) {
return new BytesRef(termBytes.getTerm().bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength());
}
private void validateExpectedMDP(String[][] vocab) {
Map<String, String> vocabMap = toMap(vocab);
validateExpectedMDP(vocabMap);
validateIncrementalDecoding(vocabMap);
}
private void validateExpectedMDP(Map<String, String> vocab) {
List<BytesRef> src = vocab.keySet().stream().sorted().map(BytesRef::new).collect(Collectors.toList());
List<TermBytes> output = compressPrefixes(src);
validateMapList(vocab,
src.stream().map(BytesRef::utf8ToString).collect(Collectors.toList()),
output.stream().map(e -> new BytesRef(e.getTerm().bytes, 0, e.getMdpLength()).utf8ToString())
.collect(Collectors.toList()));
}
private void validateIncrementalDecoding(Map<String, String> vocab) {
BytesRef previous = new BytesRef(80);
List<BytesRef> src = vocab.keySet().stream().sorted().map(BytesRef::new).collect(Collectors.toList());
List<TermBytes> output = compressPrefixes(src);
for (int i = 0; i < src.size(); i++) {
copyBytes(BytesRef.deepCopyOf(createSuffixBytes(output.get(i))), previous, output.get(i).getSuffixOffset());
assertEquals("Error in line " + i, src.get(i).utf8ToString(), previous.utf8ToString());
}
}
private void validateMapList(Map<String, String> expectedMap, List<String> src, List<String> result) {
for (int i = 0; i < src.size(); i++) {
assertEquals("Error in line " + i, expectedMap.get(src.get(i)), result.get(i));
}
}
private static List<TermBytes> compressPrefixes(List<BytesRef> vocab) {
List<TermBytes> termBytes = new ArrayList<>(vocab.size());
BytesRef last = null;
TermBytes term;
int mdp;
for (BytesRef current : vocab) {
mdp = TermBytes.computeMdpLength(last, current);
term = new TermBytes(mdp, current);
termBytes.add(term);
last = current;
}
return termBytes;
}
private static void copyBytes(BytesRef source, BytesRef target, int targetOffset) {
assert target.offset == 0;
assert source.offset == 0;
int newLength = targetOffset + source.length;
if (newLength > target.bytes.length) {
byte[] copy = new byte[newLength];
System.arraycopy(target.bytes, 0, copy, 0, targetOffset);
target.bytes = copy;
}
target.length = newLength;
System.arraycopy(source.bytes, 0, target.bytes, targetOffset, source.length);
}
private static Map<String, String> toMap(String[][] src) {
assert src.length > 0 : "insert at least one row";
assert src[0].length == 2 : "two columns are mandatory";
return Arrays.stream(src).collect(Collectors.toMap(kv -> kv[0], kv -> kv[1]));
}
}