| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| package org.apache.druid.indexing.input; |
| |
| import com.fasterxml.jackson.annotation.JsonCreator; |
| import com.fasterxml.jackson.annotation.JsonProperty; |
| import com.google.common.base.Preconditions; |
| import org.apache.druid.data.input.AbstractInputSource; |
| import org.apache.druid.data.input.InputFormat; |
| import org.apache.druid.data.input.InputRow; |
| import org.apache.druid.data.input.InputRowListPlusRawValues; |
| import org.apache.druid.data.input.InputRowSchema; |
| import org.apache.druid.data.input.InputSource; |
| import org.apache.druid.data.input.InputSourceReader; |
| import org.apache.druid.data.input.InputSplit; |
| import org.apache.druid.data.input.MapBasedInputRow; |
| import org.apache.druid.data.input.SplitHintSpec; |
| import org.apache.druid.data.input.impl.SplittableInputSource; |
| import org.apache.druid.java.util.common.CloseableIterators; |
| import org.apache.druid.java.util.common.DateTimes; |
| import org.apache.druid.java.util.common.parsers.CloseableIterator; |
| import org.apache.druid.segment.generator.DataGenerator; |
| import org.apache.druid.segment.generator.GeneratorBasicSchemas; |
| import org.apache.druid.segment.generator.GeneratorColumnSchema; |
| |
| import javax.annotation.Nullable; |
| import java.io.File; |
| import java.util.Iterator; |
| import java.util.List; |
import java.util.NoSuchElementException;
import java.util.Objects;
| import java.util.Random; |
| import java.util.stream.LongStream; |
| import java.util.stream.Stream; |
| |
| /** |
| * {@link InputSource} that can be used to seed a Druid cluster with test data, using either the built-in schemas |
| * defined in {@link GeneratorBasicSchemas}, or by directly supplying a list of {@link GeneratorColumnSchema}, to |
| * construct a {@link DataGenerator}. To produce a stable set of data, a random {@link #seed} may be supplied which |
| * will be used for all data generated by the columns. When {@link #numSplits} is greater than 1, the {@link #seed} |
| * will be instead used to pick a new seed for each split, allowing the splits to produce a different set of data, |
| * but still in a stable manner. |
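 * <p>
 * As a minimal usage sketch (assuming "basic" is one of the schema names registered in
 * {@link GeneratorBasicSchemas#SCHEMA_MAP}; null arguments fall back to their defaults):
 * <pre>{@code
 * InputSource tenRows = new GeneratorInputSource("basic", null, 10, 1, 1024L, null, null, null);
 * }</pre>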
| */ |
| public class GeneratorInputSource extends AbstractInputSource implements SplittableInputSource<Long> |
| { |
| private static final int DEFAULT_NUM_ROWS = 1000; |
| private static final int DEFAULT_NUM_SPLITS = 1; |
| private static final long DEFAULT_SEED = 1024L; |
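  // resolved once at class load, so every instance in the JVM shares the same default start time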
| private static final long DEFAULT_START_TIME = DateTimes.nowUtc().minusDays(1).getMillis(); |
| private static final int DEFAULT_CONSECUTIVE_TIMESTAMPS = 100; |
| private static final double DEFAULT_TIMESTAMP_INCREMENT = 1.0; |
| |
| private final String schemaName; |
| private final List<GeneratorColumnSchema> schema; |
| private final int numRows; |
| private final Integer numSplits; |
| private final Long seed; |
| private final Long startTime; |
| private final Integer numConsecutiveTimestamps; |
| private final Double timestampIncrement; |
| |
| @JsonCreator |
| public GeneratorInputSource( |
| @JsonProperty("schemaName") @Nullable String schemaName, |
| @JsonProperty("schema") @Nullable List<GeneratorColumnSchema> schema, |
| @JsonProperty("numRows") Integer numRows, |
| @JsonProperty("numSplits") Integer numSplits, |
| @JsonProperty("seed") Long seed, |
| @JsonProperty("startTime") Long startTime, |
| @JsonProperty("numConsecutiveTimestamps") Integer numConsecutiveTimestamps, |
| @JsonProperty("timestampIncrement") Double timestampIncrement |
| ) |
| { |
| Preconditions.checkArgument( |
| schemaName != null || schema != null, |
| "Must specify either 'schemaName' or 'schema'" |
| ); |
| this.schemaName = schemaName; |
    this.schema = schema != null
                  ? schema
                  : Preconditions.checkNotNull(
                      GeneratorBasicSchemas.SCHEMA_MAP.get(schemaName),
                      "Unknown 'schemaName': %s",
                      schemaName
                  ).getColumnSchemas();
| this.numRows = numRows != null ? numRows : DEFAULT_NUM_ROWS; |
| this.numSplits = numSplits != null ? numSplits : DEFAULT_NUM_SPLITS; |
| this.seed = seed != null ? seed : DEFAULT_SEED; |
| this.startTime = startTime != null ? startTime : DEFAULT_START_TIME; |
| this.numConsecutiveTimestamps = numConsecutiveTimestamps != null |
| ? numConsecutiveTimestamps |
| : DEFAULT_CONSECUTIVE_TIMESTAMPS; |
| this.timestampIncrement = timestampIncrement != null ? timestampIncrement : DEFAULT_TIMESTAMP_INCREMENT; |
| } |
| |
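  /**
   * Seeds a {@link Random} with {@link #seed} and draws one sub-seed per split, so each split produces a
   * distinct but reproducible set of rows.
   */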
| @Override |
| public Stream<InputSplit<Long>> createSplits( |
| InputFormat inputFormat, |
| @Nullable SplitHintSpec splitHintSpec |
| ) |
| { |
| Random r = new Random(seed); |
| return LongStream.range(0, numSplits).mapToObj(i -> new InputSplit<>(r.nextLong())); |
| } |
| |
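  /**
   * The number of splits is fixed by {@link #numSplits}; the {@link SplitHintSpec} is ignored.
   */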
| @Override |
| public int estimateNumSplits(InputFormat inputFormat, @Nullable SplitHintSpec splitHintSpec) |
| { |
| return numSplits; |
| } |
| |
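  /**
   * Returns a single-split copy of this input source that uses the split's sub-seed as its {@link #seed}.
   */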
| @Override |
| public InputSource withSplit(InputSplit<Long> split) |
| { |
| return new GeneratorInputSource( |
| schemaName, |
| schema, |
| numRows, |
| 1, |
| split.get(), |
| startTime, |
| numConsecutiveTimestamps, |
| timestampIncrement |
| ); |
| } |
| |
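  /**
   * Rows are produced directly by a {@link DataGenerator}, so no {@link InputFormat} is needed to parse them.
   */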
| @Override |
| public boolean needsFormat() |
| { |
| return false; |
| } |
| |
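  /**
   * Returns a reader that streams up to {@link #numRows} generated rows; {@code inputRowSchema} and
   * {@code temporaryDirectory} are ignored since nothing external is read.
   */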
| @Override |
| protected InputSourceReader fixedFormatReader(InputRowSchema inputRowSchema, @Nullable File temporaryDirectory) |
| { |
| return new InputSourceReader() |
| { |
| @Override |
| public CloseableIterator<InputRow> read() |
| { |
| return CloseableIterators.withEmptyBaggage(new Iterator<InputRow>() |
| { |
          private int rowCount = 0;
| private final DataGenerator generator = makeGenerator(); |
| |
| @Override |
| public boolean hasNext() |
| { |
| return rowCount < numRows; |
| } |
| |
          @Override
          public InputRow next()
          {
            if (!hasNext()) {
              throw new NoSuchElementException();
            }
            rowCount++;
            return generator.nextRow();
          }
| }); |
| } |
| |
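      // unlike read(), sample() pairs each generated row with its raw event map for display by the sampler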
| @Override |
| public CloseableIterator<InputRowListPlusRawValues> sample() |
| { |
| return CloseableIterators.withEmptyBaggage(new Iterator<InputRowListPlusRawValues>() |
| { |
          private int rowCount = 0;
| private final DataGenerator generator = makeGenerator(); |
| |
| @Override |
| public boolean hasNext() |
| { |
| return rowCount < numRows; |
| } |
| |
          @Override
          public InputRowListPlusRawValues next()
          {
            if (!hasNext()) {
              throw new NoSuchElementException();
            }
            rowCount++;
            // DataGenerator produces MapBasedInputRow, so the cast exposes the raw event map
            InputRow row = generator.nextRow();
            return InputRowListPlusRawValues.of(row, ((MapBasedInputRow) row).getEvent());
          }
| }); |
| } |
| }; |
| } |
| |
| @JsonProperty |
| public String getSchemaName() |
| { |
| return schemaName; |
| } |
| |
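  /**
   * Returns the explicit column schemas only when no {@link #schemaName} was supplied, so a resolved built-in
   * schema is not redundantly serialized.
   */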
| @JsonProperty |
| public List<GeneratorColumnSchema> getSchema() |
| { |
| return schemaName == null ? schema : null; |
| } |
| |
| @JsonProperty |
| public int getNumRows() |
| { |
| return numRows; |
| } |
| |
| @JsonProperty |
| public Integer getNumSplits() |
| { |
| return numSplits; |
| } |
| |
| @JsonProperty |
| public Long getSeed() |
| { |
| return seed; |
| } |
| |
| @JsonProperty |
| public Long getStartTime() |
| { |
| return startTime; |
| } |
| |
| @JsonProperty |
| public Integer getNumConsecutiveTimestamps() |
| { |
| return numConsecutiveTimestamps; |
| } |
| |
| @JsonProperty |
| public Double getTimestampIncrement() |
| { |
| return timestampIncrement; |
| } |
| |
| @Override |
| public boolean equals(Object o) |
| { |
| if (this == o) { |
| return true; |
| } |
| if (o == null || getClass() != o.getClass()) { |
| return false; |
| } |
| GeneratorInputSource that = (GeneratorInputSource) o; |
| return numRows == that.numRows && |
| Objects.equals(schemaName, that.schemaName) && |
| Objects.equals(schema, that.schema) && |
| Objects.equals(numSplits, that.numSplits) && |
| Objects.equals(seed, that.seed) && |
| Objects.equals(startTime, that.startTime) && |
| Objects.equals(numConsecutiveTimestamps, that.numConsecutiveTimestamps) && |
| Objects.equals(timestampIncrement, that.timestampIncrement); |
| } |
| |
| @Override |
| public int hashCode() |
| { |
| return Objects.hash( |
| schemaName, |
| schema, |
| numRows, |
| numSplits, |
| seed, |
| startTime, |
| numConsecutiveTimestamps, |
| timestampIncrement |
| ); |
| } |
| |
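  /**
   * Builds a {@link DataGenerator} for {@link #schema}: timestamps start at {@link #startTime}, repeat for
   * {@link #numConsecutiveTimestamps} rows, then advance by {@link #timestampIncrement}.
   */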
| private DataGenerator makeGenerator() |
| { |
| return new DataGenerator( |
| schema, |
| seed, |
| startTime, |
| numConsecutiveTimestamps, |
| timestampIncrement |
| ); |
| } |
| } |