/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/

package org.apache.druid.indexing.input;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.common.base.Preconditions;
import org.apache.druid.data.input.AbstractInputSource;
import org.apache.druid.data.input.InputFormat;
import org.apache.druid.data.input.InputRow;
import org.apache.druid.data.input.InputRowListPlusRawValues;
import org.apache.druid.data.input.InputRowSchema;
import org.apache.druid.data.input.InputSource;
import org.apache.druid.data.input.InputSourceReader;
import org.apache.druid.data.input.InputSplit;
import org.apache.druid.data.input.MapBasedInputRow;
import org.apache.druid.data.input.SplitHintSpec;
import org.apache.druid.data.input.impl.SplittableInputSource;
import org.apache.druid.java.util.common.CloseableIterators;
import org.apache.druid.java.util.common.DateTimes;
import org.apache.druid.java.util.common.parsers.CloseableIterator;
import org.apache.druid.segment.generator.DataGenerator;
import org.apache.druid.segment.generator.GeneratorBasicSchemas;
import org.apache.druid.segment.generator.GeneratorColumnSchema;
import javax.annotation.Nullable;
import java.io.File;
import java.util.Iterator;
import java.util.List;
import java.util.NoSuchElementException;
import java.util.Objects;
import java.util.Random;
import java.util.stream.LongStream;
import java.util.stream.Stream;

/**
* {@link InputSource} that can be used to seed a Druid cluster with test data, using either one of the built-in
* schemas defined in {@link GeneratorBasicSchemas} or a directly supplied list of {@link GeneratorColumnSchema},
* to construct a {@link DataGenerator}. To produce a stable set of data, a {@link #seed} may be supplied, which
* is used for all data generated by the columns. When {@link #numSplits} is greater than 1, the {@link #seed} is
* instead used to pick a new seed for each split, allowing each split to produce a different, but still
* deterministic, set of data.
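*
* For example, a minimal sketch of constructing this input source directly, assuming the built-in schema named
* "basic" is present in {@link GeneratorBasicSchemas#SCHEMA_MAP} (null parameters fall back to the defaults noted
* below):
* <pre>{@code
* InputSource source = new GeneratorInputSource(
*     "basic", // schemaName of a built-in generator schema (assumed to exist)
*     null,    // schema: explicit column schemas; unused when schemaName is given
*     1000,    // numRows produced by each split's reader
*     1,       // numSplits
*     1024L,   // seed
*     null,    // startTime: defaults to one day before this class was loaded
*     null,    // numConsecutiveTimestamps: defaults to 100
*     null     // timestampIncrement: defaults to 1.0
* );
* }</pre>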
*/
public class GeneratorInputSource extends AbstractInputSource implements SplittableInputSource<Long>
{
private static final int DEFAULT_NUM_ROWS = 1000;
private static final int DEFAULT_NUM_SPLITS = 1;
private static final long DEFAULT_SEED = 1024L;
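// Evaluated once at class-load time: the default start time is one day before this class was initialized.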
private static final long DEFAULT_START_TIME = DateTimes.nowUtc().minusDays(1).getMillis();
private static final int DEFAULT_CONSECUTIVE_TIMESTAMPS = 100;
private static final double DEFAULT_TIMESTAMP_INCREMENT = 1.0;

private final String schemaName;
private final List<GeneratorColumnSchema> schema;
private final int numRows;
private final Integer numSplits;
private final Long seed;
private final Long startTime;
private final Integer numConsecutiveTimestamps;
private final Double timestampIncrement;

@JsonCreator
public GeneratorInputSource(
@JsonProperty("schemaName") @Nullable String schemaName,
@JsonProperty("schema") @Nullable List<GeneratorColumnSchema> schema,
@JsonProperty("numRows") Integer numRows,
@JsonProperty("numSplits") Integer numSplits,
@JsonProperty("seed") Long seed,
@JsonProperty("startTime") Long startTime,
@JsonProperty("numConsecutiveTimestamps") Integer numConsecutiveTimestamps,
@JsonProperty("timestampIncrement") Double timestampIncrement
)
{
Preconditions.checkArgument(
schemaName != null || schema != null,
"Must specify either 'schemaName' or 'schema'"
);
this.schemaName = schemaName;
this.schema = schema != null
? schema
: GeneratorBasicSchemas.SCHEMA_MAP.get(schemaName).getColumnSchemas();
this.numRows = numRows != null ? numRows : DEFAULT_NUM_ROWS;
this.numSplits = numSplits != null ? numSplits : DEFAULT_NUM_SPLITS;
this.seed = seed != null ? seed : DEFAULT_SEED;
this.startTime = startTime != null ? startTime : DEFAULT_START_TIME;
this.numConsecutiveTimestamps = numConsecutiveTimestamps != null
? numConsecutiveTimestamps
: DEFAULT_CONSECUTIVE_TIMESTAMPS;
this.timestampIncrement = timestampIncrement != null ? timestampIncrement : DEFAULT_TIMESTAMP_INCREMENT;
}

@Override
public Stream<InputSplit<Long>> createSplits(
InputFormat inputFormat,
@Nullable SplitHintSpec splitHintSpec
)
{
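// Derive a deterministic per-split seed from the top-level seed: the same seed always yields the same
// sequence of split seeds, so repeated planning produces identical data.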
Random r = new Random(seed);
return LongStream.range(0, numSplits).mapToObj(i -> new InputSplit<>(r.nextLong()));
}

@Override
public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec)
{
return numSplits;
}

@Override
public InputSource withSplit(InputSplit<Long> split)
{
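// Each split is read by a single-split copy of this source, seeded with the split's derived seed,
// so its data is stable but different from the other splits.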
return new GeneratorInputSource(
schemaName,
schema,
numRows,
1,
split.get(),
startTime,
numConsecutiveTimestamps,
timestampIncrement
);
}

@Override
public boolean needsFormat()
{
return false;
}

@Override
protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nullable File temporaryDirectory)
{
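// Rows are synthesized by a DataGenerator rather than parsed from external data, so no InputFormat is
// involved (see needsFormat()); read() yields generated rows, while sample() also exposes each row's raw event map.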
return new InputSourceReader()
{
@Override
public CloseableIterator<InputRow> read()
{
return CloseableIterators.withEmptyBaggage(new Iterator<InputRow>()
{
int rowCount = 0;
private final DataGenerator generator = makeGenerator();

@Override
public boolean hasNext()
{
return rowCount < numRows;
}

@Override
public InputRow next()
{
if (!hasNext()) {
throw new NoSuchElementException();
}
rowCount++;
return generator.nextRow();
}
});
}

@Override
public CloseableIterator<InputRowListPlusRawValues> sample()
{
return CloseableIterators.withEmptyBaggage(new Iterator<InputRowListPlusRawValues>()
{
int rowCount = 0;
private final DataGenerator generator = makeGenerator();

@Override
public boolean hasNext()
{
return rowCount < numRows;
}

@Override
public InputRowListPlusRawValues next()
{
if (!hasNext()) {
throw new NoSuchElementException();
}
rowCount++;
// Pair each generated row with its underlying event map as the raw values.
InputRow row = generator.nextRow();
return InputRowListPlusRawValues.of(row, ((MapBasedInputRow) row).getEvent());
}
});
}
};
}

@JsonProperty
public String getSchemaName()
{
return schemaName;
}

@JsonProperty
public List<GeneratorColumnSchema> getSchema()
{
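// Serialize the explicit column schemas only when no schemaName was provided, so a named schema round-trips
// as its name rather than its expanded column list.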
return schemaName == null ? schema : null;
}

@JsonProperty
public int getNumRows()
{
return numRows;
}

@JsonProperty
public Integer getNumSplits()
{
return numSplits;
}

@JsonProperty
public Long getSeed()
{
return seed;
}

@JsonProperty
public Long getStartTime()
{
return startTime;
}

@JsonProperty
public Integer getNumConsecutiveTimestamps()
{
return numConsecutiveTimestamps;
}

@JsonProperty
public Double getTimestampIncrement()
{
return timestampIncrement;
}

@Override
public boolean equals(Object o)
{
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
GeneratorInputSource that = (GeneratorInputSource) o;
return numRows == that.numRows &&
Objects.equals(schemaName, that.schemaName) &&
Objects.equals(schema, that.schema) &&
Objects.equals(numSplits, that.numSplits) &&
Objects.equals(seed, that.seed) &&
Objects.equals(startTime, that.startTime) &&
Objects.equals(numConsecutiveTimestamps, that.numConsecutiveTimestamps) &&
Objects.equals(timestampIncrement, that.timestampIncrement);
}

@Override
public int hashCode()
{
return Objects.hash(
schemaName,
schema,
numRows,
numSplits,
seed,
startTime,
numConsecutiveTimestamps,
timestampIncrement
);
}

private DataGenerator makeGenerator()
{
return new DataGenerator(
schema,
seed,
startTime,
numConsecutiveTimestamps,
timestampIncrement
);
}
}