processing/src/main/java/org/apache/druid/segment/DimensionMerger.java - druid - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 package org.apache.druid.segment;

 import javax.annotation.Nullable;
 import java.io.IOException;
 import java.nio.IntBuffer;
 import java.util.List;

 /**
  * Processing related interface
  *
  * A DimensionMerger is a per-dimension stateful object that encapsulates type-specific operations and data structures
  * used during the segment merging process (i.e., work done by {@link IndexMerger}).
  *
  * This object is responsible for:
  *   - merging encoding dictionaries, if present
  *   - writing the merged column data and any merged indexing structures (e.g., dictionaries, bitmaps) to disk
  *
  * At a high level, the index merging process can be broken down into the following steps:
  *   - Merge segment's encoding dictionaries. These need to be merged across segments into a shared space of dictionary
  *     mappings: {@link #writeMergedValueDictionary(List)}.
  *
  *   - Merge the rows across segments into a common sequence of rows. Done outside of scope of this interface,
  *     currently in {@link IndexMergerV9}.
  *
  *   - After constructing the merged sequence of rows, process each individual row via {@link #processMergedRow},
  *     potentially continuing updating the internal structures.
  *
  *   - Write the value representation metadata (dictionary, bitmaps), the sequence of row values,
  *     and index structures to a merged segment: {@link #writeIndexes}
  *
  * A class implementing this interface is expected to be highly stateful, updating its internal state as these
  * functions are called.
  */
 public interface DimensionMerger
 {
   /**
    * Given a list of segment adapters:
    * - Read _sorted order_ (e. g. see {@link
    *   org.apache.druid.segment.incremental.IncrementalIndexAdapter#getDimValueLookup(String)}) dictionary encoding information
    *   from the adapters
    * - Merge those sorted order dictionary into a one big sorted order dictionary and write this merged dictionary.
    *
    * The implementer should maintain knowledge of the "index number" of the adapters in the input list,
    * i.e., the position of each adapter in the input list.
    *
    * This "index number" will be used to refer to specific segments later
    * in {@link #convertSortedSegmentRowValuesToMergedRowValues}.
    *
    * @param adapters List of adapters to be merged.
    * @see DimensionIndexer#convertUnsortedValuesToSorted
    */
   void writeMergedValueDictionary(List<IndexableAdapter> adapters) throws IOException;

   /**
    * Creates a value selector, which converts values with per-segment, _sorted order_ (see {@link
    * DimensionIndexer#convertUnsortedValuesToSorted}) encoding from the given selector to their equivalent
    * representation in the merged set of rows.
    *
    * This method is used by the index merging process to build the merged sequence of rows.
    *
    * The implementing class is expected to use the merged value metadata constructed
    * during {@link #writeMergedValueDictionary(List)}, if applicable.
    *
    * For example, an implementation of this function for a dictionary-encoded String column would convert the
    * segment-specific, sorted order dictionary values within the row to the common merged dictionary values
    * determined during {@link #writeMergedValueDictionary(List)}.
    *
    * @param segmentIndex indicates which segment the row originated from, in the order established in
    * {@link #writeMergedValueDictionary(List)}
    * @param source the selector from which to take values to convert
    * @return a selector with converted values
    */
   ColumnValueSelector convertSortedSegmentRowValuesToMergedRowValues(int segmentIndex, ColumnValueSelector source);

   /**
    * Process a column value(s) (potentially multi-value) of a row from the given selector and update the
    * DimensionMerger's internal state.
    *
    * After constructing a merged sequence of rows across segments, the index merging process will iterate through these
    * rows and on each iteration, for each column, pass the column value selector to the corresponding DimensionMerger.
    *
    * This allows each DimensionMerger to build its internal view of the sequence of merged rows, to be
    * written out to a segment later.
    */
   void processMergedRow(ColumnValueSelector selector) throws IOException;

   /**
    * Internally construct any index structures relevant to this DimensionMerger.
    *
    * After receiving the sequence of merged rows via iterated {@link #processMergedRow} calls, the DimensionMerger
    * can now build any index structures it needs.
    *
    * For example, a dictionary encoded String implementation would create its bitmap indexes
    * for the merged segment during this step.
    *
    * The index merger will provide a list of row number conversion IntBuffer objects.
    * Each IntBuffer is associated with one of the segments being merged; the position of the IntBuffer in the list
    * corresponds to the position of segment adapters within the input list of {@link #writeMergedValueDictionary(List)}.
    *
    * For example, suppose there are two segments A and B.
    * Row 24 from segment A maps to row 99 in the merged sequence of rows,
    * The IntBuffer for segment A would have a mapping of 24 -> 99.
    *
    * @param segmentRowNumConversions A list of row number conversion IntBuffer objects.
    */
   void writeIndexes(@Nullable List<IntBuffer> segmentRowNumConversions) throws IOException;

   /**
    * Returns true if this dimension has no data besides nulls.
    * See {@link org.apache.druid.segment.serde.NullColumnPartSerde} for how null-only columns are stored in the segment.
    */
   boolean hasOnlyNulls();
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	package org.apache.druid.segment;

	import javax.annotation.Nullable;
	import java.io.IOException;
	import java.nio.IntBuffer;
	import java.util.List;

	/**
	* Processing related interface
	*
	* A DimensionMerger is a per-dimension stateful object that encapsulates type-specific operations and data structures
	* used during the segment merging process (i.e., work done by {@link IndexMerger}).
	*
	* This object is responsible for:
	* - merging encoding dictionaries, if present
	* - writing the merged column data and any merged indexing structures (e.g., dictionaries, bitmaps) to disk
	*
	* At a high level, the index merging process can be broken down into the following steps:
	* - Merge segment's encoding dictionaries. These need to be merged across segments into a shared space of dictionary
	* mappings: {@link #writeMergedValueDictionary(List)}.
	*
	* - Merge the rows across segments into a common sequence of rows. Done outside of scope of this interface,
	* currently in {@link IndexMergerV9}.
	*
	* - After constructing the merged sequence of rows, process each individual row via {@link #processMergedRow},
	* potentially continuing updating the internal structures.
	*
	* - Write the value representation metadata (dictionary, bitmaps), the sequence of row values,
	* and index structures to a merged segment: {@link #writeIndexes}
	*
	* A class implementing this interface is expected to be highly stateful, updating its internal state as these
	* functions are called.
	*/
	public interface DimensionMerger
	{
	/**
	* Given a list of segment adapters:
	* - Read _sorted order_ (e. g. see {@link
	* org.apache.druid.segment.incremental.IncrementalIndexAdapter#getDimValueLookup(String)}) dictionary encoding information
	* from the adapters
	* - Merge those sorted order dictionary into a one big sorted order dictionary and write this merged dictionary.
	*
	* The implementer should maintain knowledge of the "index number" of the adapters in the input list,
	* i.e., the position of each adapter in the input list.
	*
	* This "index number" will be used to refer to specific segments later
	* in {@link #convertSortedSegmentRowValuesToMergedRowValues}.
	*
	* @param adapters List of adapters to be merged.
	* @see DimensionIndexer#convertUnsortedValuesToSorted
	*/
	void writeMergedValueDictionary(List<IndexableAdapter> adapters) throws IOException;

	/**
	* Creates a value selector, which converts values with per-segment, _sorted order_ (see {@link
	* DimensionIndexer#convertUnsortedValuesToSorted}) encoding from the given selector to their equivalent
	* representation in the merged set of rows.
	*
	* This method is used by the index merging process to build the merged sequence of rows.
	*
	* The implementing class is expected to use the merged value metadata constructed
	* during {@link #writeMergedValueDictionary(List)}, if applicable.
	*
	* For example, an implementation of this function for a dictionary-encoded String column would convert the
	* segment-specific, sorted order dictionary values within the row to the common merged dictionary values
	* determined during {@link #writeMergedValueDictionary(List)}.
	*
	* @param segmentIndex indicates which segment the row originated from, in the order established in
	* {@link #writeMergedValueDictionary(List)}
	* @param source the selector from which to take values to convert
	* @return a selector with converted values
	*/
	ColumnValueSelector convertSortedSegmentRowValuesToMergedRowValues(int segmentIndex, ColumnValueSelector source);

	/**
	* Process a column value(s) (potentially multi-value) of a row from the given selector and update the
	* DimensionMerger's internal state.
	*
	* After constructing a merged sequence of rows across segments, the index merging process will iterate through these
	* rows and on each iteration, for each column, pass the column value selector to the corresponding DimensionMerger.
	*
	* This allows each DimensionMerger to build its internal view of the sequence of merged rows, to be
	* written out to a segment later.
	*/
	void processMergedRow(ColumnValueSelector selector) throws IOException;

	/**
	* Internally construct any index structures relevant to this DimensionMerger.
	*
	* After receiving the sequence of merged rows via iterated {@link #processMergedRow} calls, the DimensionMerger
	* can now build any index structures it needs.
	*
	* For example, a dictionary encoded String implementation would create its bitmap indexes
	* for the merged segment during this step.
	*
	* The index merger will provide a list of row number conversion IntBuffer objects.
	* Each IntBuffer is associated with one of the segments being merged; the position of the IntBuffer in the list
	* corresponds to the position of segment adapters within the input list of {@link #writeMergedValueDictionary(List)}.
	*
	* For example, suppose there are two segments A and B.
	* Row 24 from segment A maps to row 99 in the merged sequence of rows,
	* The IntBuffer for segment A would have a mapping of 24 -> 99.
	*
	* @param segmentRowNumConversions A list of row number conversion IntBuffer objects.
	*/
	void writeIndexes(@Nullable List<IntBuffer> segmentRowNumConversions) throws IOException;

	/**
	* Returns true if this dimension has no data besides nulls.
	* See {@link org.apache.druid.segment.serde.NullColumnPartSerde} for how null-only columns are stored in the segment.
	*/
	boolean hasOnlyNulls();
	}