| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| import { CSV_SAMPLE, JSON_SAMPLE } from '../../utils/sampler.mock'; |
| |
| import type { IngestionSpec } from './ingestion-spec'; |
| import { |
| adjustId, |
| cleanSpec, |
| DEFAULT_FORCE_SEGMENT_SORT_BY_TIME, |
| guessColumnTypeFromInput, |
| guessColumnTypeFromSampleResponse, |
| guessKafkaInputFormat, |
| guessSimpleInputFormat, |
| updateSchemaWithSample, |
| upgradeSpec, |
| } from './ingestion-spec'; |
| |
| describe('ingestion-spec', () => { |
| it('upgrades / downgrades task spec 1', () => { |
| const oldTaskSpec = { |
| type: 'index_parallel', |
| spec: { |
| ioConfig: { |
| type: 'index_parallel', |
| firehose: { |
| type: 'http', |
| uris: ['https://website.com/wikipedia.json.gz'], |
| }, |
| }, |
| tuningConfig: { |
| type: 'index_parallel', |
| }, |
| dataSchema: { |
| dataSource: 'wikipedia', |
| granularitySpec: { |
| segmentGranularity: 'day', |
| queryGranularity: 'hour', |
| rollup: true, |
| }, |
| parser: { |
| type: 'string', |
| parseSpec: { |
| format: 'json', |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'iso', |
| }, |
| dimensionsSpec: { |
| dimensions: ['channel', 'cityName', 'comment'], |
| }, |
| flattenSpec: { |
| fields: [ |
| { |
| type: 'path', |
| name: 'cityNameAlt', |
| expr: '$.cityName', |
| }, |
| ], |
| }, |
| }, |
| }, |
| transformSpec: { |
| transforms: [ |
| { |
| type: 'expression', |
| name: 'channel', |
| expression: 'concat("channel", \'lol\')', |
| }, |
| ], |
| filter: { |
| type: 'selector', |
| dimension: 'commentLength', |
| value: '35', |
| }, |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| name: 'sum_added', |
| type: 'longSum', |
| fieldName: 'added', |
| }, |
| ], |
| }, |
| }, |
| }; |
| |
| expect(upgradeSpec(oldTaskSpec)).toEqual({ |
| spec: { |
| dataSchema: { |
| dataSource: 'wikipedia', |
| dimensionsSpec: { |
| dimensions: ['channel', 'cityName', 'comment'], |
| }, |
| granularitySpec: { |
| queryGranularity: 'hour', |
| rollup: true, |
| segmentGranularity: 'day', |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| fieldName: 'added', |
| name: 'sum_added', |
| type: 'longSum', |
| }, |
| ], |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'iso', |
| }, |
| transformSpec: { |
| filter: { |
| dimension: 'commentLength', |
| type: 'selector', |
| value: '35', |
| }, |
| transforms: [ |
| { |
| expression: 'concat("channel", \'lol\')', |
| name: 'channel', |
| type: 'expression', |
| }, |
| ], |
| }, |
| }, |
| ioConfig: { |
| inputFormat: { |
| flattenSpec: { |
| fields: [ |
| { |
| expr: '$.cityName', |
| name: 'cityNameAlt', |
| type: 'path', |
| }, |
| ], |
| }, |
| type: 'json', |
| }, |
| inputSource: { |
| type: 'http', |
| uris: ['https://website.com/wikipedia.json.gz'], |
| }, |
| type: 'index_parallel', |
| }, |
| tuningConfig: { |
| type: 'index_parallel', |
| }, |
| }, |
| type: 'index_parallel', |
| }); |
| }); |
| |
| it('does not mangle a custom parser', () => { |
| expect(() => |
| upgradeSpec({ |
| type: 'index_parallel', |
| spec: { |
| ioConfig: { |
| type: 'index_parallel', |
| firehose: { |
| type: 'http', |
| uris: ['https://website.com/wikipedia.json.gz'], |
| }, |
| }, |
| tuningConfig: { |
| type: 'index_parallel', |
| }, |
| dataSchema: { |
| dataSource: 'wikipedia', |
| granularitySpec: { |
| segmentGranularity: 'day', |
| queryGranularity: 'hour', |
| rollup: true, |
| }, |
| parser: { |
| type: 'super_cool_custom_parser', |
| }, |
| }, |
| }, |
| }), |
| ).toThrow( |
| "Can not rewrite parser of type 'super_cool_custom_parser', only 'string' is supported", |
| ); |
| }); |
| |
| it('upgrades / downgrades task spec (without parser)', () => { |
| const oldTaskSpec = { |
| type: 'index_parallel', |
| ioConfig: { |
| type: 'index_parallel', |
| firehose: { type: 'http', uris: ['https://website.com/wikipedia.json.gz'] }, |
| }, |
| tuningConfig: { type: 'index_parallel' }, |
| dataSchema: { |
| dataSource: 'new-data-source', |
| granularitySpec: { type: 'uniform', segmentGranularity: 'DAY', queryGranularity: 'HOUR' }, |
| }, |
| }; |
| |
| expect(upgradeSpec(oldTaskSpec)).toEqual({ |
| spec: { |
| dataSchema: { |
| dataSource: 'new-data-source', |
| granularitySpec: { |
| queryGranularity: 'HOUR', |
| segmentGranularity: 'DAY', |
| type: 'uniform', |
| }, |
| }, |
| ioConfig: { |
| inputSource: { |
| type: 'http', |
| uris: ['https://website.com/wikipedia.json.gz'], |
| }, |
| type: 'index_parallel', |
| }, |
| tuningConfig: { |
| type: 'index_parallel', |
| }, |
| }, |
| type: 'index_parallel', |
| }); |
| }); |
| |
| it('upgrades / downgrades supervisor spec', () => { |
| const oldSupervisorSpec = { |
| type: 'kafka', |
| dataSchema: { |
| dataSource: 'metrics-kafka', |
| parser: { |
| type: 'string', |
| parseSpec: { |
| format: 'json', |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'auto', |
| }, |
| dimensionsSpec: { |
| dimensions: [], |
| dimensionExclusions: ['timestamp', 'value'], |
| }, |
| }, |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| name: 'value_sum', |
| fieldName: 'value', |
| type: 'doubleSum', |
| }, |
| { |
| name: 'value_min', |
| fieldName: 'value', |
| type: 'doubleMin', |
| }, |
| { |
| name: 'value_max', |
| fieldName: 'value', |
| type: 'doubleMax', |
| }, |
| ], |
| granularitySpec: { |
| type: 'uniform', |
| segmentGranularity: 'HOUR', |
| queryGranularity: 'NONE', |
| }, |
| }, |
| tuningConfig: { |
| type: 'kafka', |
| maxRowsPerSegment: 5000000, |
| }, |
| ioConfig: { |
| topic: 'metrics', |
| consumerProperties: { |
| 'bootstrap.servers': 'localhost:9092', |
| }, |
| taskCount: 1, |
| replicas: 1, |
| taskDuration: 'PT1H', |
| }, |
| }; |
| |
| expect(upgradeSpec(oldSupervisorSpec)).toEqual({ |
| spec: { |
| dataSchema: { |
| dataSource: 'metrics-kafka', |
| dimensionsSpec: { |
| dimensionExclusions: ['timestamp', 'value'], |
| dimensions: [], |
| }, |
| granularitySpec: { |
| queryGranularity: 'NONE', |
| segmentGranularity: 'HOUR', |
| type: 'uniform', |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| fieldName: 'value', |
| name: 'value_sum', |
| type: 'doubleSum', |
| }, |
| { |
| fieldName: 'value', |
| name: 'value_min', |
| type: 'doubleMin', |
| }, |
| { |
| fieldName: 'value', |
| name: 'value_max', |
| type: 'doubleMax', |
| }, |
| ], |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'auto', |
| }, |
| }, |
| ioConfig: { |
| consumerProperties: { |
| 'bootstrap.servers': 'localhost:9092', |
| }, |
| inputFormat: { |
| type: 'json', |
| }, |
| replicas: 1, |
| taskCount: 1, |
| taskDuration: 'PT1H', |
| topic: 'metrics', |
| }, |
| tuningConfig: { |
| maxRowsPerSegment: 5000000, |
| type: 'kafka', |
| }, |
| }, |
| type: 'kafka', |
| }); |
| }); |
| |
| it('upgrades / downgrades back compat supervisor spec', () => { |
| const backCompatSupervisorSpec = { |
| type: 'kafka', |
| id: 'metrics-kafka', |
| spec: { |
| dataSchema: { |
| dataSource: 'metrics-kafka', |
| parser: { |
| type: 'string', |
| parseSpec: { |
| format: 'json', |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'auto', |
| }, |
| dimensionsSpec: { |
| dimensions: [], |
| dimensionExclusions: ['timestamp', 'value'], |
| }, |
| }, |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| name: 'value_sum', |
| fieldName: 'value', |
| type: 'doubleSum', |
| }, |
| { |
| name: 'value_min', |
| fieldName: 'value', |
| type: 'doubleMin', |
| }, |
| { |
| name: 'value_max', |
| fieldName: 'value', |
| type: 'doubleMax', |
| }, |
| ], |
| granularitySpec: { |
| type: 'uniform', |
| segmentGranularity: 'HOUR', |
| queryGranularity: 'NONE', |
| }, |
| }, |
| tuningConfig: { |
| type: 'kafka', |
| maxRowsPerSegment: 5000000, |
| }, |
| ioConfig: { |
| topic: 'metrics', |
| consumerProperties: { |
| 'bootstrap.servers': 'localhost:9092', |
| }, |
| taskCount: 1, |
| replicas: 1, |
| taskDuration: 'PT1H', |
| }, |
| }, |
| dataSchema: { |
| dataSource: 'metrics-kafka', |
| parser: { |
| type: 'string', |
| parseSpec: { |
| format: 'json', |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'auto', |
| }, |
| dimensionsSpec: { |
| dimensions: [], |
| dimensionExclusions: ['timestamp', 'value'], |
| }, |
| }, |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| name: 'value_sum', |
| fieldName: 'value', |
| type: 'doubleSum', |
| }, |
| { |
| name: 'value_min', |
| fieldName: 'value', |
| type: 'doubleMin', |
| }, |
| { |
| name: 'value_max', |
| fieldName: 'value', |
| type: 'doubleMax', |
| }, |
| ], |
| granularitySpec: { |
| type: 'uniform', |
| segmentGranularity: 'HOUR', |
| queryGranularity: 'NONE', |
| }, |
| }, |
| tuningConfig: { |
| type: 'kafka', |
| maxRowsPerSegment: 5000000, |
| }, |
| ioConfig: { |
| topic: 'metrics', |
| consumerProperties: { |
| 'bootstrap.servers': 'localhost:9092', |
| }, |
| taskCount: 1, |
| replicas: 1, |
| taskDuration: 'PT1H', |
| }, |
| }; |
| |
| expect(cleanSpec(upgradeSpec(backCompatSupervisorSpec))).toEqual({ |
| id: 'metrics-kafka', |
| spec: { |
| dataSchema: { |
| dataSource: 'metrics-kafka', |
| dimensionsSpec: { |
| dimensionExclusions: ['timestamp', 'value'], |
| dimensions: [], |
| }, |
| granularitySpec: { |
| queryGranularity: 'NONE', |
| segmentGranularity: 'HOUR', |
| type: 'uniform', |
| }, |
| metricsSpec: [ |
| { |
| name: 'count', |
| type: 'count', |
| }, |
| { |
| fieldName: 'value', |
| name: 'value_sum', |
| type: 'doubleSum', |
| }, |
| { |
| fieldName: 'value', |
| name: 'value_min', |
| type: 'doubleMin', |
| }, |
| { |
| fieldName: 'value', |
| name: 'value_max', |
| type: 'doubleMax', |
| }, |
| ], |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'auto', |
| }, |
| }, |
| ioConfig: { |
| consumerProperties: { |
| 'bootstrap.servers': 'localhost:9092', |
| }, |
| inputFormat: { |
| type: 'json', |
| }, |
| replicas: 1, |
| taskCount: 1, |
| taskDuration: 'PT1H', |
| topic: 'metrics', |
| }, |
| tuningConfig: { |
| maxRowsPerSegment: 5000000, |
| type: 'kafka', |
| }, |
| }, |
| type: 'kafka', |
| }); |
| }); |
| |
| it('cleanSpec', () => { |
| expect( |
| cleanSpec({ |
| type: 'index_parallel', |
| id: 'index_parallel_coronavirus_hamlcmea_2020-03-19T00:56:12.175Z', |
| groupId: 'index_parallel_coronavirus_hamlcmea_2020-03-19T00:56:12.175Z', |
| resource: { |
| availabilityGroup: 'index_parallel_coronavirus_hamlcmea_2020-03-19T00:56:12.175Z', |
| requiredCapacity: 1, |
| }, |
| spec: { |
| dataSchema: {}, |
| }, |
| } as any), |
| ).toEqual({ |
| id: 'index_parallel_coronavirus_hamlcmea_2020-03-19T00:56:12.175Z', |
| type: 'index_parallel', |
| spec: { |
| dataSchema: {}, |
| }, |
| }); |
| }); |
| |
| describe('guessSimpleInputFormat', () => { |
| it('works for parquet', () => { |
| expect(guessSimpleInputFormat(['PAR1lol']).type).toEqual('parquet'); |
| }); |
| |
| it('works for orc', () => { |
| expect(guessSimpleInputFormat(['ORClol']).type).toEqual('orc'); |
| }); |
| |
| it('works for AVRO', () => { |
| expect(guessSimpleInputFormat(['Obj\x01lol']).type).toEqual('avro_ocf'); |
| expect(guessSimpleInputFormat(['Obj1lol']).type).toEqual('regex'); |
| }); |
| |
| it('works for JSON (strict)', () => { |
| expect(guessSimpleInputFormat(['{"a":1}'])).toEqual({ type: 'json' }); |
| }); |
| |
| it('works for JSON (lax)', () => { |
| expect(guessSimpleInputFormat([`{hello:'world'}`])).toEqual({ |
| type: 'json', |
| featureSpec: { |
| ALLOW_BACKSLASH_ESCAPING_ANY_CHARACTER: true, |
| ALLOW_COMMENTS: true, |
| ALLOW_MISSING_VALUES: true, |
| ALLOW_NON_NUMERIC_NUMBERS: true, |
| ALLOW_NUMERIC_LEADING_ZEROS: true, |
| ALLOW_SINGLE_QUOTES: true, |
| ALLOW_TRAILING_COMMA: true, |
| ALLOW_UNQUOTED_CONTROL_CHARS: true, |
| ALLOW_UNQUOTED_FIELD_NAMES: true, |
| ALLOW_YAML_COMMENTS: true, |
| }, |
| }); |
| }); |
| |
| it('works for CSV (with header)', () => { |
| expect(guessSimpleInputFormat(['A,B,"X,1",Y'])).toEqual({ |
| type: 'csv', |
| findColumnsFromHeader: true, |
| }); |
| }); |
| |
| it('works for CSV (no header)', () => { |
| expect(guessSimpleInputFormat(['"A,1","B,2",1,2'])).toEqual({ |
| type: 'csv', |
| findColumnsFromHeader: false, |
| columns: ['column1', 'column2', 'column3', 'column4'], |
| }); |
| }); |
| |
| it('works for TSV (with header)', () => { |
| expect(guessSimpleInputFormat(['A\tB\tX\tY'])).toEqual({ |
| type: 'tsv', |
| findColumnsFromHeader: true, |
| }); |
| }); |
| |
| it('works for TSV (no header)', () => { |
| expect(guessSimpleInputFormat(['A\tB\t1\t2\t3\t4\t5\t6\t7\t8\t9'])).toEqual({ |
| type: 'tsv', |
| findColumnsFromHeader: false, |
| columns: [ |
| 'column01', |
| 'column02', |
| 'column03', |
| 'column04', |
| 'column05', |
| 'column06', |
| 'column07', |
| 'column08', |
| 'column09', |
| 'column10', |
| 'column11', |
| ], |
| }); |
| }); |
| |
| it('works for TSV with ;', () => { |
| const inputFormat = guessSimpleInputFormat(['A;B;X;Y']); |
| expect(inputFormat).toEqual({ |
| type: 'tsv', |
| delimiter: ';', |
| findColumnsFromHeader: true, |
| }); |
| }); |
| |
| it('works for TSV with |', () => { |
| const inputFormat = guessSimpleInputFormat(['A|B|X|Y']); |
| expect(inputFormat).toEqual({ |
| type: 'tsv', |
| delimiter: '|', |
| findColumnsFromHeader: true, |
| }); |
| }); |
| |
| it('works for regex', () => { |
| expect(guessSimpleInputFormat(['A/B/X/Y'])).toEqual({ |
| type: 'regex', |
| pattern: '([\\s\\S]*)', |
| columns: ['line'], |
| }); |
| }); |
| }); |
| |
| describe('guessKafkaInputFormat', () => { |
| const sample = [ |
| { |
| 'kafka.timestamp': 1710962988515, |
| 'kafka.topic': 'kttm2', |
| 'raw': |
| '{"timestamp":"2019-08-25T00:00:00.031Z","session":"S56194838","number":"16","event":{"type":"PercentClear","percentage":55},"agent":{"type":"Browser","category":"Personal computer","browser":"Chrome","browser_version":"76.0.3809.100","os":"Windows 7","platform":"Windows"},"client_ip":"181.13.41.82","geo_ip":{"continent":"South America","country":"Argentina","region":"Santa Fe","city":"Rosario"},"language":["es","es-419"],"adblock_list":"NoAdblock","app_version":"1.9.6","path":"http://www.koalastothemax.com/","loaded_image":"http://www.koalastothemax.com/img/koalas2.jpg","referrer":"Direct","referrer_host":"Direct","server_ip":"172.31.57.89","screen":"1680x1050","window":"1680x939","session_length":76261,"timezone":"N/A","timezone_offset":"180"}', |
| }, |
| { |
| 'kafka.timestamp': 1710962988518, |
| 'kafka.topic': 'kttm2', |
| 'raw': |
| '{"timestamp":"2019-08-25T00:00:00.059Z","session":"S46093731","number":"24","event":{"type":"PercentClear","percentage":85},"agent":{"type":"Mobile Browser","category":"Smartphone","browser":"Chrome Mobile","browser_version":"50.0.2661.89","os":"Android","platform":"Android"},"client_ip":"177.242.100.0","geo_ip":{"continent":"North America","country":"Mexico","region":"Chihuahua","city":"Nuevo Casas Grandes"},"language":["en","es","es-419","es-MX"],"adblock_list":"NoAdblock","app_version":"1.9.6","path":"https://koalastothemax.com/","loaded_image":"https://koalastothemax.com/img/koalas1.jpg","referrer":"https://www.google.com/","referrer_host":"www.google.com","server_ip":"172.31.11.5","screen":"320x570","window":"540x743","session_length":252689,"timezone":"CDT","timezone_offset":"300"}', |
| }, |
| ]; |
| |
| it('works when single topic', () => { |
| expect(guessKafkaInputFormat(sample, false)).toEqual({ type: 'json' }); |
| }); |
| |
| it('works when multi-topic', () => { |
| expect(guessKafkaInputFormat(sample, true)).toEqual({ |
| type: 'kafka', |
| valueFormat: { |
| type: 'json', |
| }, |
| }); |
| }); |
| }); |
| }); |
| |
| describe('spec utils', () => { |
| const ingestionSpec: IngestionSpec = { |
| type: 'index_parallel', |
| spec: { |
| ioConfig: { |
| type: 'index_parallel', |
| inputSource: { |
| type: 'http', |
| uris: ['https://website.com/wikipedia.json.gz'], |
| }, |
| inputFormat: { |
| type: 'json', |
| }, |
| }, |
| tuningConfig: { |
| type: 'index_parallel', |
| }, |
| dataSchema: { |
| dataSource: 'wikipedia', |
| granularitySpec: { |
| segmentGranularity: 'day', |
| queryGranularity: 'hour', |
| }, |
| timestampSpec: { |
| column: 'timestamp', |
| format: 'iso', |
| }, |
| dimensionsSpec: {}, |
| }, |
| }, |
| }; |
| |
| describe('guessColumnTypeFromInput', () => { |
| it('works for empty', () => { |
| expect(guessColumnTypeFromInput([], false)).toEqual('string'); |
| }); |
| |
| it('works for long', () => { |
| expect(guessColumnTypeFromInput([null, 1, 2, 3], false)).toEqual('long'); |
| expect(guessColumnTypeFromInput([null, 1, 2, 3], true)).toEqual('long'); |
| expect(guessColumnTypeFromInput([null, '1', '2', '3'], false)).toEqual('string'); |
| expect(guessColumnTypeFromInput([null, '1', '2', '3'], true)).toEqual('long'); |
| }); |
| |
| it('works for double', () => { |
| expect(guessColumnTypeFromInput([null, 1, 2.1, 3], false)).toEqual('double'); |
| expect(guessColumnTypeFromInput([null, 1, 2.1, 3], true)).toEqual('double'); |
| expect(guessColumnTypeFromInput([null, '1', '2.1', '3'], false)).toEqual('string'); |
| expect(guessColumnTypeFromInput([null, '1', '2.1', '3'], true)).toEqual('double'); |
| expect(guessColumnTypeFromInput([null, '1.0', '2.0', '3.0'], true)).toEqual('double'); |
| }); |
| |
| it('works for ARRAY<string>', () => { |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| ['A', 'B'], |
| ['A', 'C'], |
| ], |
| false, |
| ), |
| ).toEqual('ARRAY<string>'); |
| }); |
| |
| it('works for ARRAY<long>', () => { |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| [1, 2], |
| [3, 4], |
| ], |
| false, |
| ), |
| ).toEqual('ARRAY<long>'); |
| |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| ['1', '2'], |
| ['3', '4'], |
| ], |
| false, |
| ), |
| ).toEqual('ARRAY<string>'); |
| |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| ['1', '2'], |
| ['3', '4'], |
| ], |
| true, |
| ), |
| ).toEqual('ARRAY<long>'); |
| }); |
| |
| it('works for ARRAY<double>', () => { |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| [1.1, 2.2], |
| [3.3, 4.4], |
| ], |
| false, |
| ), |
| ).toEqual('ARRAY<double>'); |
| |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| ['1.1', '2.2'], |
| ['3.3', '4.4'], |
| ], |
| false, |
| ), |
| ).toEqual('ARRAY<string>'); |
| |
| expect( |
| guessColumnTypeFromInput( |
| [ |
| ['1.1', '2.2'], |
| ['3.3', '4.4'], |
| ], |
| true, |
| ), |
| ).toEqual('ARRAY<double>'); |
| }); |
| |
| it('works for complex arrays', () => { |
| expect(guessColumnTypeFromInput([{ type: 'Dogs' }, { type: 'JavaScript' }], false)).toEqual( |
| 'COMPLEX<json>', |
| ); |
| }); |
| |
| it('works for strange json', () => { |
| expect(guessColumnTypeFromInput([1, { hello: 'world' }, 3], false)).toEqual('COMPLEX<json>'); |
| }); |
| |
| it('works for strange input (object with no prototype)', () => { |
| expect(guessColumnTypeFromInput([1, Object.create(null), 3], false)).toEqual('COMPLEX<json>'); |
| }); |
| }); |
| |
| describe('guessColumnTypeFromSampleResponse', () => { |
| it('works for generic dataset', () => { |
| expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'user', false)).toEqual('string'); |
| expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'followers', false)).toEqual('string'); |
| expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'followers', true)).toEqual('long'); |
| expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'spend', true)).toEqual('double'); |
| expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'nums', false)).toEqual('ARRAY<string>'); |
| expect(guessColumnTypeFromSampleResponse(CSV_SAMPLE, 'nums', true)).toEqual('ARRAY<long>'); |
| }); |
| }); |
| |
| describe('updateSchemaWithSample', () => { |
| it('works with when not forcing time, arrays', () => { |
| const updateSpec = updateSchemaWithSample( |
| ingestionSpec, |
| JSON_SAMPLE, |
| false, |
| 'fixed', |
| 'array', |
| true, |
| ); |
| expect(updateSpec.spec).toMatchInlineSnapshot(` |
| { |
| "dataSchema": { |
| "dataSource": "wikipedia", |
| "dimensionsSpec": { |
| "dimensions": [ |
| { |
| "name": "__time", |
| "type": "long", |
| }, |
| "user", |
| "id", |
| { |
| "castToType": "ARRAY<STRING>", |
| "name": "tags", |
| "type": "auto", |
| }, |
| { |
| "castToType": "ARRAY<LONG>", |
| "name": "nums", |
| "type": "auto", |
| }, |
| ], |
| "forceSegmentSortByTime": false, |
| }, |
| "granularitySpec": { |
| "queryGranularity": "hour", |
| "rollup": true, |
| "segmentGranularity": "day", |
| }, |
| "metricsSpec": [ |
| { |
| "name": "count", |
| "type": "count", |
| }, |
| { |
| "fieldName": "followers", |
| "name": "sum_followers", |
| "type": "longSum", |
| }, |
| { |
| "fieldName": "spend", |
| "name": "sum_spend", |
| "type": "doubleSum", |
| }, |
| ], |
| "timestampSpec": { |
| "column": "timestamp", |
| "format": "iso", |
| }, |
| }, |
| "ioConfig": { |
| "inputFormat": { |
| "type": "json", |
| }, |
| "inputSource": { |
| "type": "http", |
| "uris": [ |
| "https://website.com/wikipedia.json.gz", |
| ], |
| }, |
| "type": "index_parallel", |
| }, |
| "tuningConfig": { |
| "forceGuaranteedRollup": true, |
| "partitionsSpec": { |
| "type": "hashed", |
| }, |
| "type": "index_parallel", |
| }, |
| } |
| `); |
| }); |
| |
| it('works with rollup, arrays', () => { |
| const updateSpec = updateSchemaWithSample( |
| ingestionSpec, |
| JSON_SAMPLE, |
| DEFAULT_FORCE_SEGMENT_SORT_BY_TIME, |
| 'fixed', |
| 'array', |
| true, |
| ); |
| expect(updateSpec.spec).toMatchInlineSnapshot(` |
| { |
| "dataSchema": { |
| "dataSource": "wikipedia", |
| "dimensionsSpec": { |
| "dimensions": [ |
| "user", |
| "id", |
| { |
| "castToType": "ARRAY<STRING>", |
| "name": "tags", |
| "type": "auto", |
| }, |
| { |
| "castToType": "ARRAY<LONG>", |
| "name": "nums", |
| "type": "auto", |
| }, |
| ], |
| }, |
| "granularitySpec": { |
| "queryGranularity": "hour", |
| "rollup": true, |
| "segmentGranularity": "day", |
| }, |
| "metricsSpec": [ |
| { |
| "name": "count", |
| "type": "count", |
| }, |
| { |
| "fieldName": "followers", |
| "name": "sum_followers", |
| "type": "longSum", |
| }, |
| { |
| "fieldName": "spend", |
| "name": "sum_spend", |
| "type": "doubleSum", |
| }, |
| ], |
| "timestampSpec": { |
| "column": "timestamp", |
| "format": "iso", |
| }, |
| }, |
| "ioConfig": { |
| "inputFormat": { |
| "type": "json", |
| }, |
| "inputSource": { |
| "type": "http", |
| "uris": [ |
| "https://website.com/wikipedia.json.gz", |
| ], |
| }, |
| "type": "index_parallel", |
| }, |
| "tuningConfig": { |
| "forceGuaranteedRollup": true, |
| "partitionsSpec": { |
| "type": "hashed", |
| }, |
| "type": "index_parallel", |
| }, |
| } |
| `); |
| }); |
| |
| it('works with rollup, MVDs', () => { |
| const updateSpec = updateSchemaWithSample( |
| ingestionSpec, |
| JSON_SAMPLE, |
| DEFAULT_FORCE_SEGMENT_SORT_BY_TIME, |
| 'fixed', |
| 'mvd', |
| true, |
| ); |
| expect(updateSpec.spec).toMatchInlineSnapshot(` |
| { |
| "dataSchema": { |
| "dataSource": "wikipedia", |
| "dimensionsSpec": { |
| "dimensions": [ |
| "user", |
| "id", |
| { |
| "multiValueHandling": "SORTED_ARRAY", |
| "name": "tags", |
| "type": "string", |
| }, |
| { |
| "multiValueHandling": "SORTED_ARRAY", |
| "name": "nums", |
| "type": "string", |
| }, |
| ], |
| }, |
| "granularitySpec": { |
| "queryGranularity": "hour", |
| "rollup": true, |
| "segmentGranularity": "day", |
| }, |
| "metricsSpec": [ |
| { |
| "name": "count", |
| "type": "count", |
| }, |
| { |
| "fieldName": "followers", |
| "name": "sum_followers", |
| "type": "longSum", |
| }, |
| { |
| "fieldName": "spend", |
| "name": "sum_spend", |
| "type": "doubleSum", |
| }, |
| ], |
| "timestampSpec": { |
| "column": "timestamp", |
| "format": "iso", |
| }, |
| }, |
| "ioConfig": { |
| "inputFormat": { |
| "type": "json", |
| }, |
| "inputSource": { |
| "type": "http", |
| "uris": [ |
| "https://website.com/wikipedia.json.gz", |
| ], |
| }, |
| "type": "index_parallel", |
| }, |
| "tuningConfig": { |
| "forceGuaranteedRollup": true, |
| "partitionsSpec": { |
| "type": "hashed", |
| }, |
| "type": "index_parallel", |
| }, |
| } |
| `); |
| }); |
| |
| it('works without rollup, arrays', () => { |
| const updatedSpec = updateSchemaWithSample( |
| ingestionSpec, |
| JSON_SAMPLE, |
| DEFAULT_FORCE_SEGMENT_SORT_BY_TIME, |
| 'fixed', |
| 'array', |
| false, |
| ); |
| expect(updatedSpec.spec).toMatchInlineSnapshot(` |
| { |
| "dataSchema": { |
| "dataSource": "wikipedia", |
| "dimensionsSpec": { |
| "dimensions": [ |
| "user", |
| { |
| "name": "followers", |
| "type": "long", |
| }, |
| { |
| "name": "spend", |
| "type": "double", |
| }, |
| "id", |
| { |
| "castToType": "ARRAY<STRING>", |
| "name": "tags", |
| "type": "auto", |
| }, |
| { |
| "castToType": "ARRAY<LONG>", |
| "name": "nums", |
| "type": "auto", |
| }, |
| ], |
| }, |
| "granularitySpec": { |
| "queryGranularity": "none", |
| "rollup": false, |
| "segmentGranularity": "day", |
| }, |
| "timestampSpec": { |
| "column": "timestamp", |
| "format": "iso", |
| }, |
| }, |
| "ioConfig": { |
| "inputFormat": { |
| "type": "json", |
| }, |
| "inputSource": { |
| "type": "http", |
| "uris": [ |
| "https://website.com/wikipedia.json.gz", |
| ], |
| }, |
| "type": "index_parallel", |
| }, |
| "tuningConfig": { |
| "partitionsSpec": { |
| "type": "dynamic", |
| }, |
| "type": "index_parallel", |
| }, |
| } |
| `); |
| }); |
| |
| it('works without rollup, MVDs', () => { |
| const updatedSpec = updateSchemaWithSample( |
| ingestionSpec, |
| JSON_SAMPLE, |
| DEFAULT_FORCE_SEGMENT_SORT_BY_TIME, |
| 'fixed', |
| 'mvd', |
| false, |
| ); |
| expect(updatedSpec.spec).toMatchInlineSnapshot(` |
| { |
| "dataSchema": { |
| "dataSource": "wikipedia", |
| "dimensionsSpec": { |
| "dimensions": [ |
| "user", |
| { |
| "name": "followers", |
| "type": "long", |
| }, |
| { |
| "name": "spend", |
| "type": "double", |
| }, |
| "id", |
| { |
| "multiValueHandling": "SORTED_ARRAY", |
| "name": "tags", |
| "type": "string", |
| }, |
| { |
| "multiValueHandling": "SORTED_ARRAY", |
| "name": "nums", |
| "type": "string", |
| }, |
| ], |
| }, |
| "granularitySpec": { |
| "queryGranularity": "none", |
| "rollup": false, |
| "segmentGranularity": "day", |
| }, |
| "timestampSpec": { |
| "column": "timestamp", |
| "format": "iso", |
| }, |
| }, |
| "ioConfig": { |
| "inputFormat": { |
| "type": "json", |
| }, |
| "inputSource": { |
| "type": "http", |
| "uris": [ |
| "https://website.com/wikipedia.json.gz", |
| ], |
| }, |
| "type": "index_parallel", |
| }, |
| "tuningConfig": { |
| "partitionsSpec": { |
| "type": "dynamic", |
| }, |
| "type": "index_parallel", |
| }, |
| } |
| `); |
| }); |
| }); |
| |
| it('adjustId', () => { |
| expect(adjustId('')).toEqual(''); |
| expect(adjustId('lol')).toEqual('lol'); |
| expect(adjustId('.l/o/l')).toEqual('lol'); |
| expect(adjustId('l\t \nl')).toEqual('l l'); |
| }); |
| }); |