// blob: 534c606c2fa3cf9fbfd4f4d20d043a6370e62888 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* <p/>
* http://www.apache.org/licenses/LICENSE-2.0
* <p/>
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.bigtop.bigpetstore.generator;
import java.util.Date
import org.apache.bigtop.bigpetstore.generator.util.State
import org.apache.commons.lang3.StringUtils
import java.util.Arrays.asList
import java.util.Random
import scala.collection.Iterator
import com.sun.org.apache.xml.internal.serializer.ToStream
import java.util.{Iterator => JavaIterator}
import scala.collection.JavaConversions.asJavaIterator
import org.apache.bigtop.bigpetstore.generator.util.Product
import org.apache.commons.lang3.Range;
import org.apache.bigtop.bigpetstore.generator.util.ProductType
/**
 * Entry point for generating our synthetic transaction data. Over time we will use it to
 * embed bias which can then be teased out, e.g. by clustering/classifiers. For example:
 *
 * certain products <--> certain years or days
 *
 * @param records         upper bound on the number of records an iterator will produce;
 *                        must be greater than 0
 * @param customerIdRange customer ids handed to the generated iterator
 * @param state           state the records are generated for; also the source of the random seed
 */
class TransactionIteratorFactory(
    private val records: Int,
    private val customerIdRange: Range[java.lang.Long],
    private val state: State) {

  assert(records > 0, "Number of records must be greater than 0 to generate a data iterator!")

  // A single generator per factory, seeded from the state's hash code. Every iterator returned
  // by `data` draws from this same instance.
  // NOTE(review): if State is a Java enum, its hashCode may differ between JVM runs, which
  // would make the generated data non-reproducible -- confirm whether that matters.
  private val random = new Random(state.hashCode)

  /**
   * Creates a new record iterator and exposes it through the Java iterator interface.
   *
   * @return a `java.util.Iterator` over the generated key/value records
   */
  def data: JavaIterator[TransactionIteratorFactory.KeyVal[String, String]] = {
    val generated =
      new TransactionIteratorFactory.DataIterator(records, customerIdRange, state, random)
    // Same conversion the compiler would otherwise insert implicitly.
    asJavaIterator(generated)
  }
}
/**
 * Companion of the `TransactionIteratorFactory` class: holds the record type and the
 * iterator implementation that actually produces the records.
 */
object TransactionIteratorFactory {

  /**
   * Plain immutable key/value pair; one instance is emitted per generated transaction.
   *
   * @param key   record key
   * @param value record payload
   * @tparam K type of the key
   * @tparam V type of the value
   */
  class KeyVal[K, V](val key: K, val value: V)

  /**
   * Generates transaction records on demand.
   *
   * Customers are visited in increasing id order, starting at the minimum of
   * `customerIdRange` and never going past its maximum. Each customer produces one
   * transaction plus a random number of repeat purchases, all for products of a single,
   * randomly selected product type. Iteration ends as soon as `records` transactions have
   * been produced or the customer id range is used up, whichever happens first.
   *
   * @param records         maximum number of records to produce
   * @param customerIdRange inclusive range of customer ids that may be emitted
   * @param state           state whose name is embedded in every record key
   * @param r               source of randomness for names, repeat counts and products
   */
  private class DataIterator(records: Int,
      customerIdRange: Range[java.lang.Long],
      state: State,
      r: Random) extends Iterator[KeyVal[String, String]] {

    // Both names are assigned when the first customer is started, before any record is built.
    private var firstName: String = ""
    private var lastName: String = ""
    private var elementsProducedCount = 0
    // Remaining repeat purchases for the current customer.
    private var repeatCount = 0
    // False until the first customer has been started; the first customer uses the range
    // minimum itself instead of an incremented id.
    private var customerStarted = false
    private var currentCustomerId: java.lang.Long = customerIdRange.getMinimum
    // Replaced when the first customer is started; the draw is kept so the sequence of values
    // taken from `r` stays the same as before.
    private var currentProductType = selectRandomProductType

    /**
     * @return true while the record limit has not been reached and either the current
     *         customer still has repeat purchases left or another customer id is available
     */
    def hasNext: Boolean =
      elementsProducedCount < records && (repeatCount > 0 || canStartNewCustomer)

    /**
     * Builds the next transaction record.
     *
     * The key is "BigPetStore", "storeCode_" followed by the state name, and the running
     * record index, joined with commas. The value is the customer id, first name, last name,
     * product id, lower-cased product name, product price and transaction date, joined with
     * commas.
     *
     * @return the generated key/value record
     * @throws NoSuchElementException if the iterator is exhausted
     */
    def next(): TransactionIteratorFactory.KeyVal[String, String] = {
      if (!hasNext) {
        throw new NoSuchElementException("No more transactions to generate")
      }
      // NOTE(review): presumably a random date within the past 50 years; it does not draw
      // from `r` -- confirm against DataForger.
      val date = DataForger.randomDateInPastYears(50)
      setIteratorState()
      val product = randomProductOfCurrentlySelectedType
      val key = StringUtils.join(asList("BigPetStore", "storeCode_" + state.name(),
        elementsProducedCount.toString), ",")
      val value = StringUtils.join(asList(currentCustomerId, firstName, lastName, product.id,
        product.name.toLowerCase, product.price, date), ",")
      elementsProducedCount += 1
      new TransactionIteratorFactory.KeyVal(key, value)
    }

    /** True when a customer id inside `customerIdRange` is still available. */
    private def canStartNewCustomer: Boolean =
      !customerStarted || currentCustomerId.longValue < customerIdRange.getMaximum.longValue

    /**
     * Some customers come back for more :) Either consumes one pending repeat purchase of the
     * current customer or moves on to a new customer with fresh names, a fresh repeat count
     * and a freshly selected product type.
     */
    private def setIteratorState(): Unit = {
      if (repeatCount > 0) {
        repeatCount -= 1
      } else {
        firstName = DataForger.firstName(r)
        lastName = DataForger.lastName(r)
        // Truncated, scaled Gaussian: usually a handful of repeats, occasionally more than ten.
        // An exact distribution is not needed since the number of transactions per customer
        // can be truly arbitrary. Negative draws simply mean "no repeat purchases".
        repeatCount = math.max(0, (r.nextGaussian * 4f).toInt)
        if (customerStarted) {
          currentCustomerId = java.lang.Long.valueOf(currentCustomerId.longValue + 1L)
        } else {
          customerStarted = true
        }
        currentProductType = selectRandomProductType
      }
    }

    /** Picks one of the `ProductType` values uniformly at random. */
    private def selectRandomProductType = {
      ProductType.values.apply(r.nextInt(ProductType.values.length))
    }

    /** Picks a random product among those of the currently selected product type. */
    private def randomProductOfCurrentlySelectedType = {
      currentProductType.getProducts.get(r.nextInt(currentProductType.getProducts.size))
    }
  }
}