web-console/src/druid-models/compaction-config.tsx - druid - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 import { Code } from '@blueprintjs/core';
 import React from 'react';

 import { Field } from '../components';
 import { deepGet, deepSet, oneOf } from '../utils';

 /**
  * Compaction config object, typed loosely as a string-keyed record of arbitrary values.
  * It is the model type parameter of the `Field` descriptors in `COMPACTION_CONFIG_FIELDS`,
  * whose predicates read nested values from it by dotted path via `deepGet`.
  */
 export type CompactionConfig = Record<string, any>;

 /**
  * Field descriptors (`Field<CompactionConfig>`) for the compaction config.
  *
  * Each entry's `name` is a dotted path into the config object. The entries under
  * `tuningConfig.partitionsSpec` carry a `defined` predicate keyed on
  * `tuningConfig.partitionsSpec.type` (`dynamic`, `hashed` or `single_dim`), which is why the
  * same `name` can appear more than once with a different predicate per partitioning type.
  *
  * Within the `hashed` and `single_dim` groups the row-count / shard-count fields are mutually
  * exclusive: each one's `defined` predicate turns false as soon as one of its alternatives
  * holds a truthy value.
  */
 export const COMPACTION_CONFIG_FIELDS: Field<CompactionConfig>[] = [
   {
     name: 'skipOffsetFromLatest',
     type: 'string',
     defaultValue: 'P1D',
     suggestions: ['PT0H', 'PT1H', 'P1D', 'P3D'],
     info: (
       <p>
         The offset for searching segments to be compacted. Strongly recommended to set for realtime
         dataSources.
       </p>
     ),
   },
   {
     name: 'tuningConfig.partitionsSpec.type',
     label: 'Partitioning type',
     type: 'string',
     suggestions: ['dynamic', 'hashed', 'single_dim'],
     info: (
       <p>
         For perfect rollup, you should use either <Code>hashed</Code> (partitioning based on the
         hash of dimensions in each row) or <Code>single_dim</Code> (based on ranges of a single
         dimension). For best-effort rollup, you should use <Code>dynamic</Code>.
       </p>
     ),
   },
   // partitionsSpec type: dynamic
   {
     name: 'tuningConfig.partitionsSpec.maxRowsPerSegment',
     type: 'number',
     defaultValue: 5000000,
     defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'dynamic',
     info: <>Determines how many rows are in each segment.</>,
   },
   {
     name: 'tuningConfig.partitionsSpec.maxTotalRows',
     type: 'number',
     defaultValue: 20000000,
     defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'dynamic',
     info: <>Total number of rows in segments waiting for being pushed.</>,
   },
   // partitionsSpec type: hashed
   {
     name: 'tuningConfig.partitionsSpec.targetRowsPerSegment',
     type: 'number',
     zeroMeansUndefined: true,
     // Must agree with the "5 million" default stated in the info text below
     // (the placeholder previously read 500000, i.e. one zero short).
     placeholder: '(defaults to 5000000)',
     defined: t =>
       deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed' &&
       !deepGet(t, 'tuningConfig.partitionsSpec.numShards') &&
       !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
     info: (
       <>
         <p>
           If the segments generated are a sub-optimal size for the requested partition dimensions,
           consider setting this field.
         </p>
         <p>
           A target row count for each partition. Each partition will have a row count close to the
           target assuming evenly distributed keys. Defaults to 5 million if numShards is null.
         </p>
       </>
     ),
   },
   {
     name: 'tuningConfig.partitionsSpec.maxRowsPerSegment',
     type: 'number',
     zeroMeansUndefined: true,
     defined: t =>
       deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed' &&
       !deepGet(t, 'tuningConfig.partitionsSpec.numShards') &&
       !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment'),
     info: (
       <>
         <p>
           Target number of rows to include in a partition, should be a number that targets segments
           of 500MB~1GB.
         </p>
         <p>
           <Code>maxRowsPerSegment</Code> is an alias for <Code>targetRowsPerSegment</Code>. Only one
           of these properties can be used.
         </p>
       </>
     ),
   },
   {
     name: 'tuningConfig.partitionsSpec.numShards',
     type: 'number',
     zeroMeansUndefined: true,
     defined: t =>
       deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed' &&
       !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment') &&
       !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment'),
     info: (
       <>
         <p>
           If you know the optimal number of shards and want to speed up the time it takes for
           compaction to run, set this field.
         </p>
         <p>
           Directly specify the number of shards to create. If this is specified and
           &apos;intervals&apos; is specified in the granularitySpec, the index task can skip the
           determine intervals/partitions pass through the data.
         </p>
       </>
     ),
   },
   {
     name: 'tuningConfig.partitionsSpec.partitionDimensions',
     type: 'string-array',
     placeholder: '(all dimensions)',
     defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed',
     info: <p>The dimensions to partition on. Leave blank to select all dimensions.</p>,
   },
   // partitionsSpec type: single_dim
   {
     name: 'tuningConfig.partitionsSpec.partitionDimension',
     type: 'string',
     defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim',
     required: true,
     info: <p>The dimension to partition on.</p>,
   },
   {
     name: 'tuningConfig.partitionsSpec.targetRowsPerSegment',
     type: 'number',
     zeroMeansUndefined: true,
     defined: t =>
       deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim' &&
       !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
     // Required only while neither of the two alternative row-count fields has a value.
     required: (t: CompactionConfig) =>
       !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment') &&
       !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
     info: (
       <p>
         Target number of rows to include in a partition, should be a number that targets segments of
         500MB~1GB.
       </p>
     ),
   },
   {
     name: 'tuningConfig.partitionsSpec.maxRowsPerSegment',
     type: 'number',
     zeroMeansUndefined: true,
     defined: t =>
       deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim' &&
       !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment'),
     // Required only while neither of the two alternative row-count fields has a value.
     required: (t: CompactionConfig) =>
       !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment') &&
       !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
     info: <p>Maximum number of rows to include in a partition.</p>,
   },
   {
     name: 'tuningConfig.partitionsSpec.assumeGrouped',
     type: 'boolean',
     defaultValue: false,
     defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim',
     info: (
       <p>
         Assume that input data has already been grouped on time and dimensions. Ingestion will run
         faster, but may choose sub-optimal partitions if this assumption is violated.
       </p>
     ),
   },
   {
     name: 'inputSegmentSizeBytes',
     type: 'number',
     defaultValue: 419430400,
     info: (
       <p>
         Maximum number of total segment bytes processed per compaction task. Since a time chunk must
         be processed in its entirety, if the segments for a particular time chunk have a total size
         in bytes greater than this parameter, compaction will not run for that time chunk. Because
         each compaction task runs with a single thread, setting this value too far above 1–2GB will
         result in compaction tasks taking an excessive amount of time.
       </p>
     ),
   },
   {
     name: 'tuningConfig.maxNumConcurrentSubTasks',
     type: 'number',
     defaultValue: 1,
     min: 1,
     info: (
       <>
         Maximum number of tasks which can be run at the same time. The supervisor task would spawn
         worker tasks up to maxNumConcurrentSubTasks regardless of the available task slots. If this
         value is set to 1, the supervisor task processes data ingestion on its own instead of
         spawning worker tasks. If this value is set to too large, too many worker tasks can be
         created which might block other ingestion.
       </>
     ),
   },
   {
     name: 'tuningConfig.totalNumMergeTasks',
     type: 'number',
     defaultValue: 10,
     min: 1,
     defined: t => oneOf(deepGet(t, 'tuningConfig.partitionsSpec.type'), 'hashed', 'single_dim'),
     info: <>Maximum number of merge tasks which can be run at the same time.</>,
   },
   {
     name: 'tuningConfig.splitHintSpec.maxSplitSize',
     type: 'number',
     defaultValue: 1073741824,
     min: 1000000,
     hideInMore: true,
     // Writes the accompanying splitHintSpec.type ('maxSize') into the config.
     adjustment: (t: CompactionConfig) => deepSet(t, 'tuningConfig.splitHintSpec.type', 'maxSize'),
     info: (
       <>
         Maximum number of bytes of input segments to process in a single task. If a single segment
         is larger than this number, it will be processed by itself in a single task (input segments
         are never split across tasks).
       </>
     ),
   },
   {
     name: 'tuningConfig.splitHintSpec.maxNumFiles',
     label: 'Max num files (segments)',
     type: 'number',
     defaultValue: 1000,
     min: 1,
     hideInMore: true,
     // Writes the accompanying splitHintSpec.type ('maxSize') into the config.
     adjustment: (t: CompactionConfig) => deepSet(t, 'tuningConfig.splitHintSpec.type', 'maxSize'),
     info: (
       <>
         Maximum number of input segments to process in a single subtask. This limit is to avoid task
         failures when the ingestion spec is too long. There are two known limits on the max size of
         serialized ingestion spec, i.e., the max ZNode size in ZooKeeper (
         <Code>jute.maxbuffer</Code>) and the max packet size in MySQL (
         <Code>max_allowed_packet</Code>). These can make ingestion tasks fail if the serialized
         ingestion spec size hits one of them.
       </>
     ),
   },
 ];
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	import { Code } from '@blueprintjs/core';
	import React from 'react';

	import { Field } from '../components';
	import { deepGet, deepSet, oneOf } from '../utils';

	/**
	 * Compaction config object, typed loosely as a string-keyed record of arbitrary values.
	 * It is the model type parameter of the `Field` descriptors in `COMPACTION_CONFIG_FIELDS`,
	 * whose predicates read nested values from it by dotted path via `deepGet`.
	 */
	export type CompactionConfig = Record<string, any>;

	/**
	 * Field descriptors (`Field<CompactionConfig>`) for the compaction config.
	 *
	 * Each entry's `name` is a dotted path into the config object. The entries under
	 * `tuningConfig.partitionsSpec` carry a `defined` predicate keyed on
	 * `tuningConfig.partitionsSpec.type` (`dynamic`, `hashed` or `single_dim`), which is why the
	 * same `name` can appear more than once with a different predicate per partitioning type.
	 *
	 * Within the `hashed` and `single_dim` groups the row-count / shard-count fields are mutually
	 * exclusive: each one's `defined` predicate turns false as soon as one of its alternatives
	 * holds a truthy value.
	 */
	export const COMPACTION_CONFIG_FIELDS: Field<CompactionConfig>[] = [
	  {
	    name: 'skipOffsetFromLatest',
	    type: 'string',
	    defaultValue: 'P1D',
	    suggestions: ['PT0H', 'PT1H', 'P1D', 'P3D'],
	    info: (
	      <p>
	        The offset for searching segments to be compacted. Strongly recommended to set for realtime
	        dataSources.
	      </p>
	    ),
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.type',
	    label: 'Partitioning type',
	    type: 'string',
	    suggestions: ['dynamic', 'hashed', 'single_dim'],
	    info: (
	      <p>
	        For perfect rollup, you should use either <Code>hashed</Code> (partitioning based on the
	        hash of dimensions in each row) or <Code>single_dim</Code> (based on ranges of a single
	        dimension). For best-effort rollup, you should use <Code>dynamic</Code>.
	      </p>
	    ),
	  },
	  // partitionsSpec type: dynamic
	  {
	    name: 'tuningConfig.partitionsSpec.maxRowsPerSegment',
	    type: 'number',
	    defaultValue: 5000000,
	    defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'dynamic',
	    info: <>Determines how many rows are in each segment.</>,
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.maxTotalRows',
	    type: 'number',
	    defaultValue: 20000000,
	    defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'dynamic',
	    info: <>Total number of rows in segments waiting for being pushed.</>,
	  },
	  // partitionsSpec type: hashed
	  {
	    name: 'tuningConfig.partitionsSpec.targetRowsPerSegment',
	    type: 'number',
	    zeroMeansUndefined: true,
	    // Must agree with the "5 million" default stated in the info text below
	    // (the placeholder previously read 500000, i.e. one zero short).
	    placeholder: '(defaults to 5000000)',
	    defined: t =>
	      deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed' &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.numShards') &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
	    info: (
	      <>
	        <p>
	          If the segments generated are a sub-optimal size for the requested partition dimensions,
	          consider setting this field.
	        </p>
	        <p>
	          A target row count for each partition. Each partition will have a row count close to the
	          target assuming evenly distributed keys. Defaults to 5 million if numShards is null.
	        </p>
	      </>
	    ),
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.maxRowsPerSegment',
	    type: 'number',
	    zeroMeansUndefined: true,
	    defined: t =>
	      deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed' &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.numShards') &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment'),
	    info: (
	      <>
	        <p>
	          Target number of rows to include in a partition, should be a number that targets segments
	          of 500MB~1GB.
	        </p>
	        <p>
	          <Code>maxRowsPerSegment</Code> is an alias for <Code>targetRowsPerSegment</Code>. Only one
	          of these properties can be used.
	        </p>
	      </>
	    ),
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.numShards',
	    type: 'number',
	    zeroMeansUndefined: true,
	    defined: t =>
	      deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed' &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment') &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment'),
	    info: (
	      <>
	        <p>
	          If you know the optimal number of shards and want to speed up the time it takes for
	          compaction to run, set this field.
	        </p>
	        <p>
	          Directly specify the number of shards to create. If this is specified and
	          &apos;intervals&apos; is specified in the granularitySpec, the index task can skip the
	          determine intervals/partitions pass through the data.
	        </p>
	      </>
	    ),
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.partitionDimensions',
	    type: 'string-array',
	    placeholder: '(all dimensions)',
	    defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'hashed',
	    info: <p>The dimensions to partition on. Leave blank to select all dimensions.</p>,
	  },
	  // partitionsSpec type: single_dim
	  {
	    name: 'tuningConfig.partitionsSpec.partitionDimension',
	    type: 'string',
	    defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim',
	    required: true,
	    info: <p>The dimension to partition on.</p>,
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.targetRowsPerSegment',
	    type: 'number',
	    zeroMeansUndefined: true,
	    defined: t =>
	      deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim' &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
	    // Required only while neither of the two alternative row-count fields has a value.
	    required: (t: CompactionConfig) =>
	      !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment') &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
	    info: (
	      <p>
	        Target number of rows to include in a partition, should be a number that targets segments of
	        500MB~1GB.
	      </p>
	    ),
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.maxRowsPerSegment',
	    type: 'number',
	    zeroMeansUndefined: true,
	    defined: t =>
	      deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim' &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment'),
	    // Required only while neither of the two alternative row-count fields has a value.
	    required: (t: CompactionConfig) =>
	      !deepGet(t, 'tuningConfig.partitionsSpec.targetRowsPerSegment') &&
	      !deepGet(t, 'tuningConfig.partitionsSpec.maxRowsPerSegment'),
	    info: <p>Maximum number of rows to include in a partition.</p>,
	  },
	  {
	    name: 'tuningConfig.partitionsSpec.assumeGrouped',
	    type: 'boolean',
	    defaultValue: false,
	    defined: t => deepGet(t, 'tuningConfig.partitionsSpec.type') === 'single_dim',
	    info: (
	      <p>
	        Assume that input data has already been grouped on time and dimensions. Ingestion will run
	        faster, but may choose sub-optimal partitions if this assumption is violated.
	      </p>
	    ),
	  },
	  {
	    name: 'inputSegmentSizeBytes',
	    type: 'number',
	    defaultValue: 419430400,
	    info: (
	      <p>
	        Maximum number of total segment bytes processed per compaction task. Since a time chunk must
	        be processed in its entirety, if the segments for a particular time chunk have a total size
	        in bytes greater than this parameter, compaction will not run for that time chunk. Because
	        each compaction task runs with a single thread, setting this value too far above 1–2GB will
	        result in compaction tasks taking an excessive amount of time.
	      </p>
	    ),
	  },
	  {
	    name: 'tuningConfig.maxNumConcurrentSubTasks',
	    type: 'number',
	    defaultValue: 1,
	    min: 1,
	    info: (
	      <>
	        Maximum number of tasks which can be run at the same time. The supervisor task would spawn
	        worker tasks up to maxNumConcurrentSubTasks regardless of the available task slots. If this
	        value is set to 1, the supervisor task processes data ingestion on its own instead of
	        spawning worker tasks. If this value is set to too large, too many worker tasks can be
	        created which might block other ingestion.
	      </>
	    ),
	  },
	  {
	    name: 'tuningConfig.totalNumMergeTasks',
	    type: 'number',
	    defaultValue: 10,
	    min: 1,
	    defined: t => oneOf(deepGet(t, 'tuningConfig.partitionsSpec.type'), 'hashed', 'single_dim'),
	    info: <>Maximum number of merge tasks which can be run at the same time.</>,
	  },
	  {
	    name: 'tuningConfig.splitHintSpec.maxSplitSize',
	    type: 'number',
	    defaultValue: 1073741824,
	    min: 1000000,
	    hideInMore: true,
	    // Writes the accompanying splitHintSpec.type ('maxSize') into the config.
	    adjustment: (t: CompactionConfig) => deepSet(t, 'tuningConfig.splitHintSpec.type', 'maxSize'),
	    info: (
	      <>
	        Maximum number of bytes of input segments to process in a single task. If a single segment
	        is larger than this number, it will be processed by itself in a single task (input segments
	        are never split across tasks).
	      </>
	    ),
	  },
	  {
	    name: 'tuningConfig.splitHintSpec.maxNumFiles',
	    label: 'Max num files (segments)',
	    type: 'number',
	    defaultValue: 1000,
	    min: 1,
	    hideInMore: true,
	    // Writes the accompanying splitHintSpec.type ('maxSize') into the config.
	    adjustment: (t: CompactionConfig) => deepSet(t, 'tuningConfig.splitHintSpec.type', 'maxSize'),
	    info: (
	      <>
	        Maximum number of input segments to process in a single subtask. This limit is to avoid task
	        failures when the ingestion spec is too long. There are two known limits on the max size of
	        serialized ingestion spec, i.e., the max ZNode size in ZooKeeper (
	        <Code>jute.maxbuffer</Code>) and the max packet size in MySQL (
	        <Code>max_allowed_packet</Code>). These can make ingestion tasks fail if the serialized
	        ingestion spec size hits one of them.
	      </>
	    ),
	  },
	];