/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.spark.load

import java.util.Comparator

import org.apache.hadoop.conf.Configuration
import org.apache.hadoop.io.NullWritable
import org.apache.hadoop.mapred.JobConf
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat
import org.apache.spark.{SparkContext, TaskContext}
import org.apache.spark.deploy.SparkHadoopUtil
import org.apache.spark.rdd.NewHadoopRDD
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.execution.command.ExecutionErrors
import org.apache.spark.storage.StorageLevel

import org.apache.carbondata.common.logging.LogServiceFactory
import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datastore.row.CarbonRow
import org.apache.carbondata.core.statusmanager.{LoadMetadataDetails, SegmentStatus}
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.processing.loading.{DataLoadProcessBuilder, FailureCauses}
import org.apache.carbondata.processing.loading.csvinput.{CSVInputFormat, StringArrayWritable}
import org.apache.carbondata.processing.loading.model.CarbonLoadModel
import org.apache.carbondata.processing.sort.sortdata.{NewRowComparator, NewRowComparatorForNormalDims, SortParameters}
import org.apache.carbondata.processing.util.CarbonDataProcessorUtil
import org.apache.carbondata.spark.util.CommonUtil

/**
 * Use Spark's sortBy operator to sort all data globally while loading it.
 */
object DataLoadProcessBuilderOnSpark {
private val LOGGER = LogServiceFactory.getLogService(this.getClass.getCanonicalName)
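
/**
 * Load data into the target table using a global sort.
 *
 * @param sc         SparkContext used to run the load job
 * @param dataFrame  optional input data; when empty, the CSV fact files configured in the
 *                   load model are read instead
 * @param model      CarbonLoadModel describing the target table and load options
 * @param hadoopConf Hadoop configuration used for reading the CSV input
 * @return a single (loadStatusId, (LoadMetadataDetails, ExecutionErrors)) entry indicating
 *         SUCCESS, or LOAD_PARTIAL_SUCCESS if bad records were found
 */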
def loadDataUsingGlobalSort(
sc: SparkContext,
dataFrame: Option[DataFrame],
model: CarbonLoadModel,
hadoopConf: Configuration): Array[(String, (LoadMetadataDetails, ExecutionErrors))] = {
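// Build the source RDD: use the given DataFrame directly, or read the CSV fact files
// through CSVInputFormat when no DataFrame is supplied.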
val originRDD = if (dataFrame.isDefined) {
dataFrame.get.rdd
} else {
// input data from files
CommonUtil.configureCSVInputFormat(hadoopConf, model)
hadoopConf.set(FileInputFormat.INPUT_DIR, model.getFactFilePath)
val columnCount = model.getCsvHeaderColumns.length
val jobConf = new JobConf(hadoopConf)
SparkHadoopUtil.get.addCredentials(jobConf)
new NewHadoopRDD[NullWritable, StringArrayWritable](
sc,
classOf[CSVInputFormat],
classOf[NullWritable],
classOf[StringArrayWritable],
jobConf)
.map(x => DataLoadProcessorStepOnSpark.toStringArrayRow(x._2, columnCount))
}
model.setPartitionId("0")
val modelBroadcast = sc.broadcast(model)
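// Accumulators collect per-step row counts across executors; partialSuccessAccum flags
// that at least one bad record was hit during the convert step.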
val partialSuccessAccum = sc.accumulator(0, "Partial Success Accumulator")
val inputStepRowCounter = sc.accumulator(0, "Input Processor Accumulator")
val convertStepRowCounter = sc.accumulator(0, "Convert Processor Accumulator")
val sortStepRowCounter = sc.accumulator(0, "Sort Processor Accumulator")
val writeStepRowCounter = sc.accumulator(0, "Write Processor Accumulator")
// 1. Input
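// Wrap the raw rows into iterators understood by the loading framework and run the input
// processor step, counting every row that enters the pipeline.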
val inputRDD = originRDD
.mapPartitions(rows => DataLoadProcessorStepOnSpark.toRDDIterator(rows, modelBroadcast))
.mapPartitionsWithIndex { case (index, rows) =>
DataLoadProcessorStepOnSpark.inputFunc(rows, index, modelBroadcast, inputStepRowCounter)
}
// 2. Convert
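// Run the data converter step (field conversion and encoding); rows that fail conversion
// come back as null and are filtered out below.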
val convertRDD = inputRDD.mapPartitionsWithIndex { case (index, rows) =>
DataLoadProcessorStepOnSpark.convertFunc(rows, index, modelBroadcast, partialSuccessAccum,
convertStepRowCounter)
}.filter(_ != null) // Filter out bad records (rows that failed conversion are null)
// 3. Sort
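// Build the sort configuration and an Ordering that mirrors CarbonData's row comparators,
// so Spark's sortBy orders rows the same way the local sort step would.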
val configuration = DataLoadProcessBuilder.createConfiguration(model)
val sortParameters = SortParameters.createSortParameters(configuration)
object RowOrdering extends Ordering[Array[AnyRef]] {
def compare(rowA: Array[AnyRef], rowB: Array[AnyRef]): Int = {
val rowComparator: Comparator[Array[AnyRef]] =
if (sortParameters.getNoDictionaryCount > 0) {
new NewRowComparator(sortParameters.getNoDictionaryDimnesionColumn)
} else {
new NewRowComparatorForNormalDims(sortParameters.getDimColCount)
}
rowComparator.compare(rowA, rowB)
}
}
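// Use the configured number of global sort partitions, falling back to the partition count
// of the converted RDD when it is not set (or set to a non-positive value).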
var numPartitions = CarbonDataProcessorUtil.getGlobalSortPartitions(configuration)
if (numPartitions <= 0) {
numPartitions = convertRDD.partitions.length
}
// If the number of partitions is greater than 1, sortBy runs an extra action (sampling for the
// range partitioner), so cache the RDD to avoid executing the input and convert steps twice.
if (numPartitions > 1) {
convertRDD.persist(StorageLevel.fromString(
CarbonProperties.getInstance().getGlobalSortRddStorageLevel()))
}
import scala.reflect.classTag
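// Globally sort the rows by their data arrays, then convert each sorted partition into the
// row format expected by the write step (convertTo3Parts).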
val sortRDD = convertRDD
.sortBy(_.getData, numPartitions = numPartitions)(RowOrdering, classTag[Array[AnyRef]])
.mapPartitionsWithIndex { case (index, rows) =>
DataLoadProcessorStepOnSpark.convertTo3Parts(rows, index, modelBroadcast,
sortStepRowCounter)
}
// 4. Write
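// runJob triggers the whole pipeline: each task writes its sorted partition to CarbonData
// files through the writer step.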
sc.runJob(sortRDD, (context: TaskContext, rows: Iterator[CarbonRow]) =>
DataLoadProcessorStepOnSpark.writeFunc(rows, context.partitionId, modelBroadcast,
writeStepRowCounter))
// Release the cached RDD now that the write job has finished
convertRDD.unpersist()
// Log the number of rows in each step
LOGGER.info("Total rows processed in step Input Processor: " + inputStepRowCounter.value)
LOGGER.info("Total rows processed in step Data Converter: " + convertStepRowCounter.value)
LOGGER.info("Total rows processed in step Sort Processor: " + sortStepRowCounter.value)
LOGGER.info("Total rows processed in step Data Writer: " + writeStepRowCounter.value)
// Update status
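// Report LOAD_PARTIAL_SUCCESS with a BAD_RECORDS failure cause when any task hit bad
// records, otherwise report SUCCESS.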
if (partialSuccessAccum.value != 0) {
val uniqueLoadStatusId = model.getTableName + CarbonCommonConstants.UNDERSCORE +
"Partial_Success"
val loadMetadataDetails = new LoadMetadataDetails()
loadMetadataDetails.setSegmentStatus(SegmentStatus.LOAD_PARTIAL_SUCCESS)
val executionErrors = new ExecutionErrors(FailureCauses.NONE, "")
executionErrors.failureCauses = FailureCauses.BAD_RECORDS
Array((uniqueLoadStatusId, (loadMetadataDetails, executionErrors)))
} else {
val uniqueLoadStatusId = model.getTableName + CarbonCommonConstants.UNDERSCORE + "Success"
val loadMetadataDetails = new LoadMetadataDetails()
loadMetadataDetails.setSegmentStatus(SegmentStatus.SUCCESS)
val executionErrors = new ExecutionErrors(FailureCauses.NONE, "")
Array((uniqueLoadStatusId, (loadMetadataDetails, executionErrors)))
}
}
}