| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
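| // These checks are run by the web console "doctor" to diagnose common cluster configuration problems. |
| // Each check reports its findings through the CheckControls callbacks. |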
| import type { CompactionConfigs } from '../../druid-models'; |
| import { Api } from '../../singletons'; |
| import { deepGet, pluralIfNeeded, queryDruidSql } from '../../utils'; |
| import { postToSampler } from '../../utils/sampler'; |
| |
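| /** |
| * Callbacks a doctor check uses to report what it finds. `addSuggestion` records a |
| * non-critical recommendation, `addIssue` records a problem, and `terminateChecks` |
| * stops any further checks from running. |
| */ |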
| export interface CheckControls { |
| addSuggestion: (message: string) => void; |
| addIssue: (message: string) => void; |
| terminateChecks: () => void; |
| } |
| |
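| /** |
| * A single named diagnostic check; `check` runs asynchronously and reports only through |
| * the provided controls. |
| */ |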
| export interface DoctorCheck { |
| name: string; |
| check: (controls: CheckControls) => Promise<void>; |
| } |
| |
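| // One row of the Historical fill query below: the Historical's name and how full it is as a percentage of its max_size. |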
| interface HistoricalFill { |
| historical: string; |
| fill: number; |
| } |
| |
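| // These properties are compared across the Router, Coordinator, and Overlord; any mismatch is reported as an issue. |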
| const RUNTIME_PROPERTIES_ALL_NODES_MUST_AGREE_ON: string[] = [ |
| 'user.timezone', |
| 'druid.zk.service.host', |
| ]; |
| |
| // In the future (when we can query other services) it will also be cool to check: |
| // 'druid.storage.type' <=> historicals, overlords, mm |
| // 'druid.indexer.logs.type' <=> overlord, mm, + peons |
| |
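| // Metadata store properties that the Coordinator and Overlord should share; a mismatch is reported as a suggestion. |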
| const RUNTIME_PROPERTIES_MASTER_NODES_SHOULD_AGREE_ON: string[] = [ |
| 'druid.metadata.storage.type', // overlord + coordinator |
| 'druid.metadata.storage.connector.connectURI', |
| ]; |
| |
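| /** |
| * The checks run by the doctor, in order. The first checks validate the Router itself, |
| * then the Coordinator and Overlord, then the sampler and SQL endpoints. |
| */ |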
| export const DOCTOR_CHECKS: DoctorCheck[] = [ |
| // ------------------------------------- |
| // Self (router) checks |
| // ------------------------------------- |
| { |
| name: 'Verify own status', |
| check: async controls => { |
| // Make sure that the Router responds to /status and gives some valid info back |
| let status: any; |
| try { |
| status = (await Api.instance.get(`/status`)).data; |
| } catch (e) { |
| controls.addIssue( |
| `Did not get a /status response from the Router service. Try confirming that it is running and accessible. Got: ${e.message}`, |
| ); |
| controls.terminateChecks(); |
| return; |
| } |
| |
| if (typeof status.version !== 'string') { |
| controls.addIssue('Could not get a valid /status response from the Router.'); |
| } |
| }, |
| }, |
| { |
| name: 'Verify own runtime properties', |
| check: async controls => { |
| // Make sure that everything in /status/properties is above board |
| let properties: Record<string, string>; |
| try { |
| properties = (await Api.instance.get(`/status/properties`)).data; |
| } catch (e) { |
| controls.addIssue( |
| `Did not get a /status/properties response from the Router service. Got: ${e.message}`, |
| ); |
| return; |
| } |
| |
| // Check that the management proxy is on; it really should be on for someone to access this console in the first place, but anything could happen |
| if (properties['druid.router.managementProxy.enabled'] !== 'true') { |
| controls.addIssue( |
| `The Router's "druid.router.managementProxy.enabled" is not reported as "true". This means that the Coordinator and Overlord will not be accessible from the Router (and this console).`, |
| ); |
| } |
| |
| // Check for Java 8u92+, 11, or 17 |
| if ( |
| properties['java.specification.version'] && |
| properties['java.specification.version'] !== '1.8' && |
| properties['java.specification.version'] !== '11' && |
| properties['java.specification.version'] !== '17' |
| ) { |
| controls.addSuggestion( |
| `It looks like you are running Java ${properties['java.runtime.version']}. Druid officially supports Java 8u92+, 11, or 17.`, |
| ); |
| } |
| |
| // Check "file.encoding" |
| if (properties['file.encoding'] && properties['file.encoding'] !== 'UTF-8') { |
| controls.addSuggestion( |
| `It looks like "file.encoding" is set to ${properties['file.encoding']}, it is recommended to set this to "UTF-8"`, |
| ); |
| } |
| |
| // Check "user.timezone" |
| if (properties['user.timezone'] && properties['user.timezone'] !== 'UTC') { |
| controls.addSuggestion( |
| `It looks like "user.timezone" is set to ${properties['user.timezone']}, it is recommended to set this to "UTC"`, |
| ); |
| } |
| }, |
| }, |
| |
| // ------------------------------------- |
| // Coordinator and Overlord |
| // ------------------------------------- |
| { |
| name: 'Verify the Coordinator and Overlord status', |
| check: async controls => { |
| // Make sure the Coordinator and Overlord respond to /status and are on the same Druid version as the Router |
| let myStatus: any; |
| try { |
| myStatus = (await Api.instance.get(`/status`)).data; |
| } catch { |
| return; |
| } |
| |
| let coordinatorStatus: any; |
| try { |
| coordinatorStatus = (await Api.instance.get(`/proxy/coordinator/status`)).data; |
| } catch (e) { |
| controls.addIssue( |
| 'Did not get a /status response from the Coordinator service. Try confirming that it is running and accessible.', |
| ); |
| return; |
| } |
| |
| let overlordStatus: any; |
| try { |
| overlordStatus = (await Api.instance.get(`/proxy/overlord/status`)).data; |
| } catch (e) { |
| controls.addIssue( |
| 'Did not get a /status response from the Overlord service. Try confirming that it is running and accessible.', |
| ); |
| return; |
| } |
| |
| if (myStatus.version !== coordinatorStatus.version) { |
| controls.addSuggestion( |
| `It looks like the Router and Coordinator services are on different versions of Druid. This may indicate a problem if you are not in the middle of a rolling upgrade.`, |
| ); |
| } |
| |
| if (myStatus.version !== overlordStatus.version) { |
| controls.addSuggestion( |
| `It looks like the Router and Overlord services are on different versions of Druid. This may indicate a problem if you are not in the middle of a rolling upgrade.`, |
| ); |
| } |
| }, |
| }, |
| { |
| name: 'Verify the Coordinator and Overlord runtime properties', |
| check: async controls => { |
| // Make sure that the Coordinator and Overlord /status/properties are good and match the Router where needed |
| let myProperties: Record<string, string>; |
| try { |
| myProperties = (await Api.instance.get(`/status/properties`)).data; |
| } catch { |
| return; |
| } |
| |
| let coordinatorProperties: Record<string, string>; |
| try { |
| coordinatorProperties = (await Api.instance.get(`/proxy/coordinator/status/properties`)) |
| .data; |
| } catch (e) { |
| controls.addIssue( |
| 'Did not get a /status/properties response from the Coordinator service. Try confirming that it is running and accessible.', |
| ); |
| return; |
| } |
| |
| let overlordProperties: Record<string, string>; |
| try { |
| overlordProperties = (await Api.instance.get(`/proxy/overlord/status/properties`)).data; |
| } catch (e) { |
| controls.addIssue( |
| 'Did not get a /status/properties response from the Overlord service. Try confirming that it is running and accessible.', |
| ); |
| return; |
| } |
| |
| for (const prop of RUNTIME_PROPERTIES_ALL_NODES_MUST_AGREE_ON) { |
| if (myProperties[prop] !== coordinatorProperties[prop]) { |
| controls.addIssue( |
| `The Router and Coordinator do not agree on the "${prop}" runtime property ("${myProperties[prop]}" vs "${coordinatorProperties[prop]}")`, |
| ); |
| } |
| if (myProperties[prop] !== overlordProperties[prop]) { |
| controls.addIssue( |
| `The Router and Overlord do not agree on the "${prop}" runtime property ("${myProperties[prop]}" vs "${overlordProperties[prop]}")`, |
| ); |
| } |
| } |
| |
| for (const prop of RUNTIME_PROPERTIES_MASTER_NODES_SHOULD_AGREE_ON) { |
| if (coordinatorProperties[prop] !== overlordProperties[prop]) { |
| controls.addSuggestion( |
| `The Coordinator and Overlord do not agree on the "${prop}" runtime property ("${coordinatorProperties[prop]}" vs "${overlordProperties[prop]}")`, |
| ); |
| } |
| } |
| }, |
| }, |
| |
| // ------------------------------------- |
| // Check sampler |
| // ------------------------------------- |
| { |
| name: 'Verify that the sampler works', |
| check: async controls => { |
| // Make sure that the sampler can parse a trivial inline JSON row |
| let testSampledData: any; |
| try { |
| testSampledData = await postToSampler( |
| { |
| type: 'index_parallel', |
| spec: { |
| ioConfig: { |
| type: 'index_parallel', |
| inputSource: { type: 'inline', data: '{"test":"Data"}' }, |
| inputFormat: { type: 'json' }, |
| }, |
| dataSchema: { |
| dataSource: 'sample', |
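| // The timestamp column deliberately does not exist in the test row so that missingValue supplies a constant timestamp |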
| timestampSpec: { |
| column: '!!!_no_such_column_!!!', |
| missingValue: '2010-01-01T00:00:00Z', |
| }, |
| dimensionsSpec: { dimensions: ['test'] }, |
| transformSpec: {}, |
| metricsSpec: [], |
| granularitySpec: { queryGranularity: 'NONE' }, |
| }, |
| }, |
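| // Keep the sample small and fast; a tiny row limit and short timeout are enough to prove the sampler endpoint works |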
| samplerConfig: { |
| numRows: 50, |
| timeoutMs: 1000, |
| }, |
| }, |
| 'doctor', |
| ); |
| } catch { |
| controls.addIssue(`Could not use the sampler.`); |
| return; |
| } |
| |
| if (deepGet(testSampledData, 'data.0.parsed.test') !== 'Data') { |
| controls.addIssue(`Sampler returned incorrect data.`); |
| } |
| }, |
| }, |
| |
| // ------------------------------------- |
| // Check SQL |
| // ------------------------------------- |
| { |
| name: 'Verify that SQL works', |
| check: async controls => { |
| // Make sure that we can run the simplest query |
| let sqlResult: any[]; |
| try { |
| sqlResult = await queryDruidSql({ query: `SELECT 1 + 1 AS "two"` }); |
| } catch (e) { |
| controls.addIssue( |
| `Could not query SQL. Ensure that "druid.sql.enable" is set to "true" and that there is a Broker service running. Got: ${e.message}`, |
| ); |
| controls.terminateChecks(); |
| return; |
| } |
| |
| if (sqlResult.length !== 1 || sqlResult[0]['two'] !== 2) { |
| controls.addIssue(`Got incorrect results from a basic SQL query.`); |
| } |
| }, |
| }, |
| { |
| name: 'Verify that there are historicals and they are not too full', |
| check: async controls => { |
| // Make sure that there are Historical services and that none of them are reported to be over 95% full |
| let historicalFills: HistoricalFill[]; |
| try { |
| historicalFills = await queryDruidSql({ |
| query: `SELECT |
| "server" AS "historical", |
| "curr_size" * 100.0 / "max_size" AS "fill" |
| FROM sys.servers |
| WHERE "server_type" = 'historical' |
| ORDER BY "fill" DESC`, |
| }); |
| // Note: for some reason adding ` AND "curr_size" * 100.0 / "max_size" > 90` to the WHERE clause does not work as of this writing (Apr 8, 2024) |
| } catch (e) { |
| controls.addIssue(`Could not run a sys.servers query. Got: ${e.message}`); |
| return; |
| } |
| |
| if (!historicalFills.length) { |
| controls.addIssue(`There do not appear to be any Historical services.`); |
| return; |
| } |
| |
| function formatFill(historicalFill: HistoricalFill): string { |
| return historicalFill.fill.toFixed(2); |
| } |
| |
| for (const historicalFill of historicalFills) { |
| if (historicalFill.fill > 95) { |
| controls.addIssue( |
| `Historical "${historicalFill.historical}" appears to be over 95% full (is ${formatFill( |
| historicalFill, |
| )}%). Increase capacity.`, |
| ); |
| } else if (historicalFill.fill > 90) { |
| controls.addSuggestion( |
| `Historical "${historicalFill.historical}" appears to be over 90% full (is ${formatFill( |
| historicalFill, |
| )}%)`, |
| ); |
| } |
| } |
| }, |
| }, |
| { |
| name: 'Look for time chunks that could benefit from compaction', |
| check: async controls => { |
| // Check for any time chunks where there is more than 1 segment and avg segment size is less than 100MB |
| const dayAgo = new Date(Date.now() - 24 * 60 * 60 * 1000).toISOString(); |
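| // The inner query groups published segments by time chunk (datasource, start, end) and flags chunks older than a day |
| // that contain multiple segments averaging under 100MB; the outer query counts those chunks per datasource. |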
| let sqlResult: any[]; |
| try { |
| sqlResult = await queryDruidSql({ |
| query: `SELECT |
| "datasource", |
| COUNT(*) AS "num_bad_time_chunks" |
| FROM ( |
| SELECT |
| "datasource", "start", "end", |
| AVG("size") AS "avg_segment_size_in_time_chunk", |
| SUM("size") AS "total_size", |
| COUNT(*) AS "num_segments" |
| FROM sys.segments |
| WHERE is_published = 1 AND "start" < '${dayAgo}' |
| GROUP BY 1, 2, 3 |
| HAVING "num_segments" > 1 AND "total_size" > 1 AND "avg_segment_size_in_time_chunk" < 100000000 |
| ) |
| GROUP BY 1 |
| ORDER BY "num_bad_time_chunks"`, |
| }); |
| } catch (e) { |
| return; |
| } |
| |
| if (sqlResult.length) { |
| // Grab the auto-compaction definitions and ignore dataSources that already have auto-compaction |
| let compactionResult: CompactionConfigs; |
| try { |
| compactionResult = ( |
| await Api.instance.get('/druid/indexer/v1/compaction/config/datasources') |
| ).data; |
| } catch (e) { |
| controls.addIssue(`Could not get compaction config. Something is wrong.`); |
| return; |
| } |
| |
| if (!compactionResult.compactionConfigs) return; |
| |
| if (!Array.isArray(compactionResult.compactionConfigs)) { |
| controls.addIssue(`Got invalid value from compaction config. Something is wrong.`); |
| return; |
| } |
| |
| const dataSourcesWithCompaction = compactionResult.compactionConfigs.map( |
| (d: any) => d.dataSource, |
| ); |
| |
| sqlResult = sqlResult.filter(d => !dataSourcesWithCompaction.includes(d['datasource'])); |
| |
| for (const datasource of sqlResult) { |
| controls.addSuggestion( |
| `Datasource "${ |
| datasource['datasource'] |
| }" could benefit from auto-compaction as it has ${pluralIfNeeded( |
| datasource['num_bad_time_chunks'], |
| 'time chunk', |
| )} that have multiple small segments that could be compacted.`, |
| ); |
| } |
| } |
| }, |
| }, |
| ]; |