| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.lucene.codecs.uniformsplit; |
| |
| import java.util.ArrayList; |
| import java.util.Arrays; |
| import java.util.List; |
| import java.util.Map; |
| import java.util.stream.Collectors; |
| |
| import org.apache.lucene.util.BytesRef; |
| import org.apache.lucene.util.LuceneTestCase; |
| |
| /** |
| * Tests {@link TermBytes}. |
| */ |
| public class TestTermBytes extends LuceneTestCase { |
| |
| public void testMDPA() { |
| validateExpectedMDP(new String[][]{ |
| {"aa", "a"}, |
| {"abbreviator", "ab"}, |
| {"abidingly", "abi"}, |
| {"aboiteaus", "abo"}, |
| {"abranchiates", "abr"}, |
| {"absentminded", "abs"}, |
| }); |
| } |
| |
| public void testIncrementalA() { |
| validateExpectedSuffix(new String[][]{ |
| {"aa", "0aa"}, |
| {"abbreviator", "1bbreviator"}, |
| {"abidingly", "2idingly"}, |
| {"aboiteaus", "2oiteaus"}, |
| {"abranchiates", "2ranchiates"}, |
| {"absentminded", "2sentminded"}, |
| {"rodriguez", "0rodriguez"}, |
| {"romero", "2mero"}, |
| }); |
| } |
| |
| public void testMDPMIX2() { |
| validateExpectedMDP(new String[][]{ |
| {"abaco", "a"}, |
| {"amigo", "am"}, |
| {"bloom", "b"}, |
| {"break", "br"}, |
| {"can", "c"}, |
| {"car", "car"}, |
| {"carmagedon", "carm"}, |
| {"danger", "d"}, |
| {"lala", "l"}, |
| {"literature", "li"}, |
| {"lucene", "lu"}, |
| {"nature", "n"}, |
| {"naval", "nav"}, |
| {"rico", "r"}, |
| {"weird", "w"}, |
| {"zoo", "z"}, |
| }); |
| } |
| |
| public void testMDP() { |
| validateExpectedMDP(new String[][]{ |
| {"abaco", "a"}, |
| {"amigo", "am"}, |
| {"arco", "ar"}, |
| {"bloom", "b"}, |
| {"frien", "f"}, |
| {"frienchies", "frienc"}, |
| {"friend", "friend"}, |
| {"friendalan", "frienda"}, |
| {"friende", "friende"}, |
| }); |
| } |
| |
| public void testIncremental() { |
| validateExpectedSuffix(new String[][]{ |
| {"abaco", "0abaco"}, |
| {"amigo", "1migo"}, |
| {"arco", "1rco"}, |
| {"bloom", "0bloom"}, |
| {"frien", "0frien"}, |
| {"frienchies", "5chies"}, |
| {"friend", "5d"}, |
| {"friendalan", "6alan"}, |
| {"friende", "6e"}, |
| }); |
| } |
| |
| public void testIncrementalSimple() { |
| validateExpectedSuffix(new String[][]{ |
| {"abaco", "0abaco"}, |
| {"rodriguez", "0rodriguez"}, |
| {"roma", "2ma"}, |
| {"romero", "3ero"}, |
| }); |
| } |
| |
| public void testMDPSimple() { |
| validateExpectedMDP(new String[][]{ |
| {"abaco", "a"}, |
| {"rodriguez", "r"}, |
| {"romero", "rom"}, |
| }); |
| } |
| |
| public void testMDPMIX() { |
| validateExpectedMDP(new String[][]{ |
| {"aaab", "a"}, |
| {"arco", "ar"}, |
| {"busqueda", "b"}, |
| {"trabajo", "t"}, |
| {"zufix", "z"}, |
| {"zzfix", "zz"}, |
| }); |
| } |
| |
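  /**
   * Verifies both the expected incremental suffix encoding
   * ({@code <shared prefix length><suffix>}) and the incremental decoding of the given
   * {term, expected encoding} pairs.
   */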
| private void validateExpectedSuffix(String[][] vocab) { |
| Map<String, String> vocabMap = toMap(vocab); |
| validateExpectedSuffix(vocabMap); |
| validateIncrementalDecoding(vocabMap); |
| } |
| |
| private void validateExpectedSuffix(Map<String, String> vocab) { |
| List<BytesRef> src = vocab.keySet().stream().sorted().map(BytesRef::new).collect(Collectors.toList()); |
| List<TermBytes> output = compressPrefixes(src); |
| validateMapList(vocab, |
| src.stream().map(BytesRef::utf8ToString).collect(Collectors.toList()), |
| output.stream().map(e -> e.getSuffixOffset() + createSuffixBytes(e).utf8ToString()).collect(Collectors.toList())); |
| } |
| |
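  /** Returns a {@link BytesRef} view of the suffix bytes of the term, i.e. the bytes after the shared prefix. */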
| private BytesRef createSuffixBytes(TermBytes termBytes) { |
| return new BytesRef(termBytes.getTerm().bytes, termBytes.getSuffixOffset(), termBytes.getSuffixLength()); |
| } |
| |
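  /**
   * Verifies both the expected MDP (minimal distinguishing prefix) and the incremental
   * decoding of the given {term, expected MDP} pairs.
   */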
| private void validateExpectedMDP(String[][] vocab) { |
| Map<String, String> vocabMap = toMap(vocab); |
| validateExpectedMDP(vocabMap); |
| validateIncrementalDecoding(vocabMap); |
| } |
| |
| private void validateExpectedMDP(Map<String, String> vocab) { |
| List<BytesRef> src = vocab.keySet().stream().sorted().map(BytesRef::new).collect(Collectors.toList()); |
| List<TermBytes> output = compressPrefixes(src); |
| validateMapList(vocab, |
| src.stream().map(BytesRef::utf8ToString).collect(Collectors.toList()), |
| output.stream().map(e -> new BytesRef(e.getTerm().bytes, 0, e.getMdpLength()).utf8ToString()) |
| .collect(Collectors.toList())); |
| } |
| |
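  /**
   * Decodes each term incrementally by copying its suffix over the previously decoded term,
   * keeping the shared prefix, and verifies that the original term is rebuilt.
   */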
| private void validateIncrementalDecoding(Map<String, String> vocab) { |
    BytesRef previous = new BytesRef(80); // Initial capacity; copyBytes() grows it if needed.
| List<BytesRef> src = vocab.keySet().stream().sorted().map(BytesRef::new).collect(Collectors.toList()); |
| List<TermBytes> output = compressPrefixes(src); |
| |
| for (int i = 0; i < src.size(); i++) { |
| copyBytes(BytesRef.deepCopyOf(createSuffixBytes(output.get(i))), previous, output.get(i).getSuffixOffset()); |
| assertEquals("Error in line " + i, src.get(i).utf8ToString(), previous.utf8ToString()); |
| } |
| } |
| |
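  /** Asserts that each result equals the value expected for the corresponding source term. */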
| private void validateMapList(Map<String, String> expectedMap, List<String> src, List<String> result) { |
| for (int i = 0; i < src.size(); i++) { |
| assertEquals("Error in line " + i, expectedMap.get(src.get(i)), result.get(i)); |
| } |
| } |
| |
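  /**
   * Builds the {@link TermBytes} of each term of the sorted vocabulary; the MDP length of a
   * term only depends on the term immediately preceding it.
   */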
  private static List<TermBytes> compressPrefixes(List<BytesRef> vocab) {
    List<TermBytes> termBytes = new ArrayList<>(vocab.size());
    BytesRef last = null;
    for (BytesRef current : vocab) {
      int mdp = TermBytes.computeMdpLength(last, current);
      termBytes.add(new TermBytes(mdp, current));
      last = current;
    }
    return termBytes;
  }
| |
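  /**
   * Copies {@code source} into {@code target} at {@code targetOffset}, keeping the target
   * bytes before the offset (the shared prefix) and growing the target byte array if needed.
   */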
| private static void copyBytes(BytesRef source, BytesRef target, int targetOffset) { |
| assert target.offset == 0; |
| assert source.offset == 0; |
| int newLength = targetOffset + source.length; |
| if (newLength > target.bytes.length) { |
| byte[] copy = new byte[newLength]; |
| System.arraycopy(target.bytes, 0, copy, 0, targetOffset); |
| target.bytes = copy; |
| } |
| target.length = newLength; |
| System.arraycopy(source.bytes, 0, target.bytes, targetOffset, source.length); |
| } |
| |
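  /** Converts {term, expected value} rows to a map keyed by term. */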
  private static Map<String, String> toMap(String[][] src) {
    assert src.length > 0 : "insert at least one row";
    assert Arrays.stream(src).allMatch(row -> row.length == 2) : "two columns are mandatory";
    return Arrays.stream(src).collect(Collectors.toMap(kv -> kv[0], kv -> kv[1]));
  }
| } |