// processing/src/test/java/org/apache/druid/segment/IndexMergerNullHandlingTest.java - druid - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.druid.segment;

 import com.google.common.collect.ImmutableList;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.ImmutableMultiset;
 import com.google.common.collect.Sets;
 import org.apache.druid.collections.bitmap.ImmutableBitmap;
 import org.apache.druid.common.config.NullHandling;
 import org.apache.druid.data.input.MapBasedInputRow;
 import org.apache.druid.java.util.common.ISE;
 import org.apache.druid.java.util.common.guava.Comparators;
 import org.apache.druid.query.aggregation.AggregatorFactory;
 import org.apache.druid.segment.column.BitmapIndex;
 import org.apache.druid.segment.column.ColumnHolder;
 import org.apache.druid.segment.column.DictionaryEncodedColumn;
 import org.apache.druid.segment.data.IncrementalIndexTest;
 import org.apache.druid.segment.incremental.IncrementalIndex;
 import org.apache.druid.segment.writeout.OffHeapMemorySegmentWriteOutMediumFactory;
 import org.junit.Assert;
 import org.junit.Before;
 import org.junit.Rule;
 import org.junit.Test;
 import org.junit.rules.TemporaryFolder;
 import org.roaringbitmap.IntIterator;

 import java.io.File;
 import java.util.ArrayList;
 import java.util.Collections;
 import java.util.HashMap;
 import java.util.HashSet;
 import java.util.List;
 import java.util.Map;
 import java.util.Set;
 import java.util.stream.Collectors;
 import java.util.stream.IntStream;

 public class IndexMergerNullHandlingTest
 {
   private IndexMerger indexMerger;
   private IndexIO indexIO;
   private IndexSpec indexSpec;

   @Rule
   public TemporaryFolder temporaryFolder = new TemporaryFolder();

   @Before
   public void setUp()
   {
     indexMerger = TestHelper.getTestIndexMergerV9(OffHeapMemorySegmentWriteOutMediumFactory.instance());
     indexIO = TestHelper.getTestIndexIO();
     indexSpec = new IndexSpec();
   }

   @Test
   public void testStringColumnNullHandling() throws Exception
   {
     List<Map<String, Object>> nonNullFlavors = new ArrayList<>();
     nonNullFlavors.add(ImmutableMap.of("d", "a"));
     nonNullFlavors.add(ImmutableMap.of("d", ImmutableList.of("a", "b")));

     List<Map<String, Object>> nullFlavors = new ArrayList<>();
     Map<String, Object> mMissing = ImmutableMap.of();
     Map<String, Object> mEmptyList = ImmutableMap.of("d", Collections.emptyList());
     Map<String, Object> mNull = new HashMap<>();
     mNull.put("d", null);
     Map<String, Object> mEmptyString = ImmutableMap.of("d", "");
     Map<String, Object> mListOfNull = ImmutableMap.of("d", Collections.singletonList(null));
     Map<String, Object> mListOfEmptyString = ImmutableMap.of("d", Collections.singletonList(""));

     nullFlavors.add(mMissing);
     nullFlavors.add(mEmptyList);
     nullFlavors.add(mNull);
     nullFlavors.add(mListOfNull);

     if (NullHandling.replaceWithDefault()) {
       nullFlavors.add(mEmptyString);
       nullFlavors.add(mListOfEmptyString);
     } else {
       nonNullFlavors.add(mEmptyString);
       nonNullFlavors.add(mListOfEmptyString);
     }

     Set<Map<String, Object>> allValues = new HashSet<>();
     allValues.addAll(nonNullFlavors);
     allValues.addAll(nullFlavors);

     for (Set<Map<String, Object>> subset : Sets.powerSet(allValues)) {
       if (subset.isEmpty()) {
         continue;
       }

       final List<Map<String, Object>> subsetList = new ArrayList<>(subset);

       IncrementalIndex toPersist = IncrementalIndexTest.createIndex(new AggregatorFactory[]{});
       for (Map<String, Object> m : subsetList) {
         toPersist.add(new MapBasedInputRow(0L, ImmutableList.of("d"), m));
       }

       final File tempDir = temporaryFolder.newFolder();
       try (QueryableIndex index = indexIO.loadIndex(indexMerger.persist(toPersist, tempDir, indexSpec, null))) {
         final ColumnHolder columnHolder = index.getColumnHolder("d");

         if (nullFlavors.containsAll(subsetList)) {
           // all null -> should be missing
           Assert.assertNull(subsetList.toString(), columnHolder);
         } else {
           Assert.assertNotNull(subsetList.toString(), columnHolder);

           // The column has multiple values if there are any lists with > 1 element in the input set.
           final boolean hasMultipleValues = subsetList.stream()
                                                       .anyMatch(m -> m.get("d") instanceof List
                                                                      && (((List) m.get("d")).size() > 1));

           // Compute all unique values, the same way that IndexMerger is expected to do it.
           final Set<String> uniqueValues = new HashSet<>();
           for (Map<String, Object> m : subsetList) {
             final List<String> dValues = normalize(m.get("d"), hasMultipleValues);
             uniqueValues.addAll(dValues);

             if (nullFlavors.contains(m)) {
               uniqueValues.add(null);
             }
           }

           try (final DictionaryEncodedColumn<String> dictionaryColumn =
                    (DictionaryEncodedColumn<String>) columnHolder.getColumn()) {
             // Verify unique values against the dictionary.
             Assert.assertEquals(
                 subsetList.toString(),
                 uniqueValues.stream().sorted(Comparators.naturalNullsFirst()).collect(Collectors.toList()),
                 IntStream.range(0, dictionaryColumn.getCardinality())
                          .mapToObj(dictionaryColumn::lookupName)
                          .collect(Collectors.toList())
             );

             Assert.assertEquals(subsetList.toString(), hasMultipleValues, dictionaryColumn.hasMultipleValues());
             Assert.assertEquals(subsetList.toString(), uniqueValues.size(), dictionaryColumn.getCardinality());

             // Verify the expected set of rows was indexed, ignoring order.
             Assert.assertEquals(
                 subsetList.toString(),
                 ImmutableMultiset.copyOf(
                     subsetList.stream()
                               .map(m -> normalize(m.get("d"), hasMultipleValues))
                               .distinct() // Distinct values only, because we expect rollup.
                               .collect(Collectors.toList())
                 ),
                 ImmutableMultiset.copyOf(
                     IntStream.range(0, index.getNumRows())
                              .mapToObj(rowNumber -> getRow(dictionaryColumn, rowNumber))
                              // The "distinct" shouldn't be necessary, but it is, because [{}, {d=}, {d=a}]
                              // yields [[null] x 2, [a]] (arguably a bug).
                              .distinct()
                              .collect(Collectors.toList())
                 )
             );

             // Verify that the bitmap index for null is correct.
             final BitmapIndex bitmapIndex = columnHolder.getBitmapIndex();

             // Read through the column to find all the rows that should match null.
             final List<Integer> expectedNullRows = new ArrayList<>();
             for (int i = 0; i < index.getNumRows(); i++) {
               final List<String> row = getRow(dictionaryColumn, i);
               if (row.isEmpty() || row.stream().anyMatch(NullHandling::isNullOrEquivalent)) {
                 expectedNullRows.add(i);
               }
             }

             Assert.assertEquals(subsetList.toString(), expectedNullRows.size() > 0, bitmapIndex.hasNulls());

             if (expectedNullRows.size() > 0) {
               Assert.assertEquals(subsetList.toString(), 0, bitmapIndex.getIndex(null));

               final ImmutableBitmap nullBitmap = bitmapIndex.getBitmap(bitmapIndex.getIndex(null));
               final List<Integer> actualNullRows = new ArrayList<>();
               final IntIterator iterator = nullBitmap.iterator();
               while (iterator.hasNext()) {
                 actualNullRows.add(iterator.next());
               }

               Assert.assertEquals(subsetList.toString(), expectedNullRows, actualNullRows);
             } else {
               Assert.assertEquals(-1, bitmapIndex.getIndex(null));
             }
           }
         }
       }
     }
   }

   /**
    * Normalize an input value the same way that IndexMerger is expected to do it.
    */
   private static List<String> normalize(final Object value, final boolean hasMultipleValues)
   {
     final List<String> retVal = new ArrayList<>();

     if (value == null) {
       retVal.add(null);
     } else if (value instanceof String) {
       retVal.add(NullHandling.emptyToNullIfNeeded(((String) value)));
     } else if (value instanceof List) {
       final List<String> list = (List<String>) value;
       if (list.isEmpty() && !hasMultipleValues) {
         // empty lists become nulls in single valued columns
         retVal.add(NullHandling.emptyToNullIfNeeded(null));
       } else {
         retVal.addAll(list.stream().map(NullHandling::emptyToNullIfNeeded).collect(Collectors.toList()));
       }
     } else {
       throw new ISE("didn't expect class[%s]", value.getClass());
     }

     return retVal;
   }

   /**
    * Get a particular row from a column, exactly as reported by the column.
    */
   private static List<String> getRow(final DictionaryEncodedColumn<String> column, final int rowNumber)
   {
     final List<String> retVal = new ArrayList<>();

     if (column.hasMultipleValues()) {
       column.getMultiValueRow(rowNumber).forEach(i -> retVal.add(column.lookupName(i)));
     } else {
       retVal.add(column.lookupName(column.getSingleValueRow(rowNumber)));
     }

     return retVal;
   }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.druid.segment;

	import com.google.common.collect.ImmutableList;
	import com.google.common.collect.ImmutableMap;
	import com.google.common.collect.ImmutableMultiset;
	import com.google.common.collect.Sets;
	import org.apache.druid.collections.bitmap.ImmutableBitmap;
	import org.apache.druid.common.config.NullHandling;
	import org.apache.druid.data.input.MapBasedInputRow;
	import org.apache.druid.java.util.common.ISE;
	import org.apache.druid.java.util.common.guava.Comparators;
	import org.apache.druid.query.aggregation.AggregatorFactory;
	import org.apache.druid.segment.column.BitmapIndex;
	import org.apache.druid.segment.column.ColumnHolder;
	import org.apache.druid.segment.column.DictionaryEncodedColumn;
	import org.apache.druid.segment.data.IncrementalIndexTest;
	import org.apache.druid.segment.incremental.IncrementalIndex;
	import org.apache.druid.segment.writeout.OffHeapMemorySegmentWriteOutMediumFactory;
	import org.junit.Assert;
	import org.junit.Before;
	import org.junit.Rule;
	import org.junit.Test;
	import org.junit.rules.TemporaryFolder;
	import org.roaringbitmap.IntIterator;

	import java.io.File;
	import java.util.ArrayList;
	import java.util.Collections;
	import java.util.HashMap;
	import java.util.HashSet;
	import java.util.List;
	import java.util.Map;
	import java.util.Set;
	import java.util.stream.Collectors;
	import java.util.stream.IntStream;

	public class IndexMergerNullHandlingTest
	{
	private IndexMerger indexMerger;
	private IndexIO indexIO;
	private IndexSpec indexSpec;

	@Rule
	public TemporaryFolder temporaryFolder = new TemporaryFolder();

	@Before
	public void setUp()
	{
	indexMerger = TestHelper.getTestIndexMergerV9(OffHeapMemorySegmentWriteOutMediumFactory.instance());
	indexIO = TestHelper.getTestIndexIO();
	indexSpec = new IndexSpec();
	}

	@Test
	public void testStringColumnNullHandling() throws Exception
	{
	List<Map<String, Object>> nonNullFlavors = new ArrayList<>();
	nonNullFlavors.add(ImmutableMap.of("d", "a"));
	nonNullFlavors.add(ImmutableMap.of("d", ImmutableList.of("a", "b")));

	List<Map<String, Object>> nullFlavors = new ArrayList<>();
	Map<String, Object> mMissing = ImmutableMap.of();
	Map<String, Object> mEmptyList = ImmutableMap.of("d", Collections.emptyList());
	Map<String, Object> mNull = new HashMap<>();
	mNull.put("d", null);
	Map<String, Object> mEmptyString = ImmutableMap.of("d", "");
	Map<String, Object> mListOfNull = ImmutableMap.of("d", Collections.singletonList(null));
	Map<String, Object> mListOfEmptyString = ImmutableMap.of("d", Collections.singletonList(""));

	nullFlavors.add(mMissing);
	nullFlavors.add(mEmptyList);
	nullFlavors.add(mNull);
	nullFlavors.add(mListOfNull);

	if (NullHandling.replaceWithDefault()) {
	nullFlavors.add(mEmptyString);
	nullFlavors.add(mListOfEmptyString);
	} else {
	nonNullFlavors.add(mEmptyString);
	nonNullFlavors.add(mListOfEmptyString);
	}

	Set<Map<String, Object>> allValues = new HashSet<>();
	allValues.addAll(nonNullFlavors);
	allValues.addAll(nullFlavors);

	for (Set<Map<String, Object>> subset : Sets.powerSet(allValues)) {
	if (subset.isEmpty()) {
	continue;
	}

	final List<Map<String, Object>> subsetList = new ArrayList<>(subset);

	IncrementalIndex toPersist = IncrementalIndexTest.createIndex(new AggregatorFactory[]{});
	for (Map<String, Object> m : subsetList) {
	toPersist.add(new MapBasedInputRow(0L, ImmutableList.of("d"), m));
	}

	final File tempDir = temporaryFolder.newFolder();
	try (QueryableIndex index = indexIO.loadIndex(indexMerger.persist(toPersist, tempDir, indexSpec, null))) {
	final ColumnHolder columnHolder = index.getColumnHolder("d");

	if (nullFlavors.containsAll(subsetList)) {
	// all null -> should be missing
	Assert.assertNull(subsetList.toString(), columnHolder);
	} else {
	Assert.assertNotNull(subsetList.toString(), columnHolder);

	// The column has multiple values if there are any lists with > 1 element in the input set.
	final boolean hasMultipleValues = subsetList.stream()
	.anyMatch(m -> m.get("d") instanceof List
	&& (((List) m.get("d")).size() > 1));

	// Compute all unique values, the same way that IndexMerger is expected to do it.
	final Set<String> uniqueValues = new HashSet<>();
	for (Map<String, Object> m : subsetList) {
	final List<String> dValues = normalize(m.get("d"), hasMultipleValues);
	uniqueValues.addAll(dValues);

	if (nullFlavors.contains(m)) {
	uniqueValues.add(null);
	}
	}

	try (final DictionaryEncodedColumn<String> dictionaryColumn =
	(DictionaryEncodedColumn<String>) columnHolder.getColumn()) {
	// Verify unique values against the dictionary.
	Assert.assertEquals(
	subsetList.toString(),
	uniqueValues.stream().sorted(Comparators.naturalNullsFirst()).collect(Collectors.toList()),
	IntStream.range(0, dictionaryColumn.getCardinality())
	.mapToObj(dictionaryColumn::lookupName)
	.collect(Collectors.toList())
	);

	Assert.assertEquals(subsetList.toString(), hasMultipleValues, dictionaryColumn.hasMultipleValues());
	Assert.assertEquals(subsetList.toString(), uniqueValues.size(), dictionaryColumn.getCardinality());

	// Verify the expected set of rows was indexed, ignoring order.
	Assert.assertEquals(
	subsetList.toString(),
	ImmutableMultiset.copyOf(
	subsetList.stream()
	.map(m -> normalize(m.get("d"), hasMultipleValues))
	.distinct() // Distinct values only, because we expect rollup.
	.collect(Collectors.toList())
	),
	ImmutableMultiset.copyOf(
	IntStream.range(0, index.getNumRows())
	.mapToObj(rowNumber -> getRow(dictionaryColumn, rowNumber))
	// The "distinct" shouldn't be necessary, but it is, because [{}, {d=}, {d=a}]
	// yields [[null] x 2, [a]] (arguably a bug).
	.distinct()
	.collect(Collectors.toList())
	)
	);

	// Verify that the bitmap index for null is correct.
	final BitmapIndex bitmapIndex = columnHolder.getBitmapIndex();

	// Read through the column to find all the rows that should match null.
	final List<Integer> expectedNullRows = new ArrayList<>();
	for (int i = 0; i < index.getNumRows(); i++) {
	final List<String> row = getRow(dictionaryColumn, i);
	if (row.isEmpty() \|\| row.stream().anyMatch(NullHandling::isNullOrEquivalent)) {
	expectedNullRows.add(i);
	}
	}

	Assert.assertEquals(subsetList.toString(), expectedNullRows.size() > 0, bitmapIndex.hasNulls());

	if (expectedNullRows.size() > 0) {
	Assert.assertEquals(subsetList.toString(), 0, bitmapIndex.getIndex(null));

	final ImmutableBitmap nullBitmap = bitmapIndex.getBitmap(bitmapIndex.getIndex(null));
	final List<Integer> actualNullRows = new ArrayList<>();
	final IntIterator iterator = nullBitmap.iterator();
	while (iterator.hasNext()) {
	actualNullRows.add(iterator.next());
	}

	Assert.assertEquals(subsetList.toString(), expectedNullRows, actualNullRows);
	} else {
	Assert.assertEquals(-1, bitmapIndex.getIndex(null));
	}
	}
	}
	}
	}
	}

	/**
	* Normalize an input value the same way that IndexMerger is expected to do it.
	*/
	private static List<String> normalize(final Object value, final boolean hasMultipleValues)
	{
	final List<String> retVal = new ArrayList<>();

	if (value == null) {
	retVal.add(null);
	} else if (value instanceof String) {
	retVal.add(NullHandling.emptyToNullIfNeeded(((String) value)));
	} else if (value instanceof List) {
	final List<String> list = (List<String>) value;
	if (list.isEmpty() && !hasMultipleValues) {
	// empty lists become nulls in single valued columns
	retVal.add(NullHandling.emptyToNullIfNeeded(null));
	} else {
	retVal.addAll(list.stream().map(NullHandling::emptyToNullIfNeeded).collect(Collectors.toList()));
	}
	} else {
	throw new ISE("didn't expect class[%s]", value.getClass());
	}

	return retVal;
	}

	/**
	* Get a particular row from a column, exactly as reported by the column.
	*/
	private static List<String> getRow(final DictionaryEncodedColumn<String> column, final int rowNumber)
	{
	final List<String> retVal = new ArrayList<>();

	if (column.hasMultipleValues()) {
	column.getMultiValueRow(rowNumber).forEach(i -> retVal.add(column.lookupName(i)));
	} else {
	retVal.add(column.lookupName(column.getSingleValueRow(rowNumber)));
	}

	return retVal;
	}
	}