blob: d4e035c84dce97c9577bbb6a2bf3c9a44d80593a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.wayang.apps.terasort
import com.google.common.primitives.UnsignedBytes
import org.apache.hadoop.util.PureJavaCrc32
import org.apache.wayang.api.PlanBuilder
import org.apache.wayang.commons.util.profiledb.model.Experiment
import org.apache.wayang.core.api.{Configuration, WayangContext}
import org.apache.wayang.core.plugin.Plugin
/**
  * Validates a data set of (key, value) byte-array pairs read from an object file.
  *
  * The validation checks that keys are in non-decreasing unsigned lexicographical order
  * inside every partition, and that the key ranges of consecutive partitions (in the order
  * in which they are collected) do not overlap. It also prints the record count and a
  * checksum built from the CRC32 of every key.
  *
  * NOTE: the checks use `assert`, which the Scala compiler can elide (e.g. with
  * `-Xdisable-assertions`); the validation is only effective when assertions are compiled in.
  *
  * @param plugins plugins that are registered on the [[WayangContext]] used for the validation
  */
class TeraValidate(@transient plugins: Plugin*) extends Serializable {

  /**
    * Runs the validation and prints its findings to standard output.
    *
    * @param input_url     location handed to `readObjectFile`
    * @param configuration configuration used to create the [[WayangContext]]
    * @param experiment    accepted for interface compatibility; not used by the validation
    * @throws java.lang.AssertionError if an ordering check fails (and assertions are enabled)
    */
  def apply(input_url: String)
           (implicit configuration: Configuration, experiment: Experiment): Unit = {
    val wayangCtx = new WayangContext(configuration)
    plugins.foreach(wayangCtx.register)
    val planBuilder = new PlanBuilder(wayangCtx)

    val dataset = planBuilder
      .readObjectFile[Tuple2[Array[Byte], Array[Byte]]](input_url)

    // Per partition: verify the local key order and produce (checksum, first key, last key).
    val output = dataset.mapPartitions(
      iterable_element => {
        val iter = iterable_element.iterator
        val sum = new Unsigned16
        val checksum = new Unsigned16
        val crc32 = new PureJavaCrc32()
        val min = new Array[Byte](10)
        val max = new Array[Byte](10)
        val cmp = UnsignedBytes.lexicographicalComparator()

        var pos = 0L
        // All-zero start value; assumes keys are 10 bytes long so that it compares <= any key.
        var prev = new Array[Byte](10)

        while (iter.hasNext) {
          val key = iter.next()._1
          assert(cmp.compare(key, prev) >= 0, "keys within a partition are not sorted")

          // Checksum contribution of this record: CRC32 over the key bytes.
          crc32.reset()
          crc32.update(key, 0, key.length)
          checksum.set(crc32.getValue)
          sum.add(checksum)

          if (pos == 0) {
            key.copyToArray(min, 0, 10)
          }
          pos += 1
          prev = key
        }

        if (pos == 0) {
          // An empty partition has no first/last key. Emitting the zero-filled placeholders
          // would trip the cross-partition check and reset `lastMax`, so emit nothing instead.
          Stream.empty[(Unsigned16, Array[Byte], Array[Byte])]
        } else {
          prev.copyToArray(max, 0, 10)
          Iterator((sum, min, max)).toStream
        }
      }
    )

    val checksumOutput = output.collect()
    val cmp = UnsignedBytes.lexicographicalComparator()
    val sum = new Unsigned16
    val numRecords = dataset.count.collect().head

    checksumOutput.foreach { case (partSum, _, _) =>
      sum.add(partSum)
    }
    println("num records: " + numRecords)
    println("checksum: " + sum.toString)

    // Renders the bytes as unsigned values (0..255) for printing.
    def unsignedValues(bytes: Array[Byte]) = bytes.toSeq.map(x => if (x < 0) 256 + x else x)

    // Largest key of the previously inspected partition; starts as all zeros.
    var lastMax = new Array[Byte](10)
    checksumOutput.map { case (partSum, min, max) =>
      // Defensive copies of the boundary keys before they are compared and retained.
      (partSum, min.clone(), max.clone())
    }.zipWithIndex.foreach { case ((_, min, max), i) =>
      println(s"part $i")
      println(s"lastMax" + unsignedValues(lastMax))
      println(s"min " + unsignedValues(min))
      println(s"max " + unsignedValues(max))
      assert(cmp.compare(min, max) <= 0, "min > max")
      assert(cmp.compare(lastMax, min) <= 0, "current partition min < last partition max")
      lastMax = max
    }
    println("num records: " + numRecords)
    println("checksum: " + sum.toString)
    println("partitions are properly sorted")
  }
}