/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.carbondata.spark.rdd

import java.util.regex.Pattern

import scala.collection.mutable
import scala.collection.mutable.ArrayBuffer

import org.apache.commons.lang3.{ArrayUtils, StringUtils}
import org.apache.spark._
import org.apache.spark.sql.Row

import org.apache.carbondata.core.constants.CarbonLoadOptionConstants
import org.apache.carbondata.core.metadata.{AbsoluteTableIdentifier, ColumnIdentifier}
import org.apache.carbondata.core.metadata.schema.table.column.CarbonDimension
import org.apache.carbondata.processing.loading.exception.NoRetryException

/**
 * A [[org.apache.spark.Partitioner]] that partitions rows by a column-derived
 * integer key, using the key itself as the partition id.
 *
 * @constructor create a partitioner
 * @param numParts the number of partitions
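 *
 * A usage sketch (an assumed shape; the keys must already be ints in [0, numParts)):
 * {{{
 *   rdd.map(row => (columnIndex, row)).partitionBy(new ColumnPartitioner(numParts))
 * }}}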
 */
class ColumnPartitioner(numParts: Int) extends Partitioner {
  override def numPartitions: Int = numParts

  // the key is expected to already be a valid partition id (an Int in [0, numParts))
  override def getPartition(key: Any): Int = key.asInstanceOf[Int]
}

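/**
 * A parser for one dimension (column) of the input data. Implementations walk
 * a possibly nested string value and hand every primitive value they find to
 * the dictionary-collection logic of the corresponding [[PrimitiveParser]].
 */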
trait GenericParser {
  // the dimension (column) this parser handles
  val dimension: CarbonDimension

  // register a parser for a nested child dimension
  def addChild(child: GenericParser): Unit

  // parse one string value of this dimension
  def parseString(input: String): Unit
}

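/**
 * Leaf parser: collects the distinct values of a dictionary-encoded primitive
 * dimension into the supplied set. When no set is given, the dimension is not
 * dictionary-encoded and the input is ignored.
 */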
case class PrimitiveParser(dimension: CarbonDimension,
    setOpt: Option[mutable.HashSet[String]]) extends GenericParser {
  val (hasDictEncoding, set: mutable.HashSet[String]) = setOpt match {
    case None => (false, new mutable.HashSet[String])
    case Some(x) => (true, x)
  }

  // a primitive dimension has no children, so this is a no-op
  def addChild(child: GenericParser): Unit = {
  }

  def parseString(input: String): Unit = {
    if (hasDictEncoding && input != null) {
      if (set.size < CarbonLoadOptionConstants.MAX_EXTERNAL_DICTIONARY_SIZE) {
        set.add(input)
      } else {
        throw new NoRetryException("Cannot provide more than " +
          s"${CarbonLoadOptionConstants.MAX_EXTERNAL_DICTIONARY_SIZE} dictionary values")
      }
    }
  }
}

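/**
 * Parser for an array dimension: splits the input on the array delimiter and
 * feeds every element to the single child parser.
 *
 * A minimal sketch (the dimension and set names are illustrative):
 * {{{
 *   val format = DataFormat(Array("$"), 0, Array(Pattern.compile("\\$")))
 *   val parser = ArrayParser(arrayDim, format)
 *   parser.addChild(PrimitiveParser(elementDim, Some(distinctValues)))
 *   parser.parseString("1$2$3")   // distinctValues now holds "1", "2", "3"
 * }}}
 */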
case class ArrayParser(dimension: CarbonDimension, format: DataFormat) extends GenericParser {
  // an array has a single child parser, applied to every element
  var children: GenericParser = _

  def addChild(child: GenericParser): Unit = {
    children = child
  }

  def parseString(input: String): Unit = {
    if (StringUtils.isNotEmpty(input)) {
      val splits = format.getSplits(input)
      if (ArrayUtils.isNotEmpty(splits)) {
        splits.foreach { s =>
          children.parseString(s)
        }
      }
    }
  }
}

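/**
 * Parser for a struct dimension: splits the input on the struct delimiter and
 * forwards the i-th field to the i-th child parser, ignoring any field that
 * lacks a matching child (or vice versa).
 */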
case class StructParser(dimension: CarbonDimension,
    format: DataFormat) extends GenericParser {
  val children = new ArrayBuffer[GenericParser]

  def addChild(child: GenericParser): Unit = {
    children += child
  }

  def parseString(input: String): Unit = {
    if (StringUtils.isNotEmpty(input)) {
      val splits = format.getSplits(input)
      // walk only the fields that have both a split and a child parser
      val len = Math.min(children.length, splits.length)
      for (i <- 0 until len) {
        children(i).parseString(splits(i))
      }
    }
  }
}

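/**
 * The delimiters for each nesting level of a complex value, plus precompiled
 * split patterns. `delimiterIndex` selects the current level, and
 * `cloneAndIncreaseIndex` steps one level deeper, saturating at the last one.
 *
 * A minimal sketch with two levels (the delimiters are illustrative):
 * {{{
 *   val format = DataFormat(Array("$", ":"), 0,
 *     Array(Pattern.compile("\\$"), Pattern.compile(":")))
 *   format.getSplits("1:2$3:4")                    // Array("1:2", "3:4")
 *   format.cloneAndIncreaseIndex.getSplits("1:2")  // Array("1", "2")
 * }}}
 */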
case class DataFormat(delimiters: Array[String],
    var delimiterIndex: Int,
    patterns: Array[Pattern]) extends Serializable {
  self =>
  def getSplits(input: String): Array[String] = {
    // split with limit -1 so trailing empty strings are kept: a surrogate key
    // has to be generated for empty values too
    patterns(delimiterIndex).split(input, -1)
  }

  def cloneAndIncreaseIndex: DataFormat = {
    // step to the next nesting level, saturating at the last delimiter
    DataFormat(delimiters, Math.min(delimiterIndex + 1, delimiters.length - 1), patterns)
  }
}

/**
 * A case class packaging all the attributes needed to load the dictionary of a table.
 */
case class DictionaryLoadModel(
    table: AbsoluteTableIdentifier,
    dimensions: Array[CarbonDimension],
    hdfsLocation: String,
    dictfolderPath: String,
    dictFilePaths: Array[String],
    dictFileExists: Array[Boolean],
    isComplexes: Array[Boolean],
    primDimensions: Array[CarbonDimension],
    delimiters: Array[String],
    columnIdentifier: Array[ColumnIdentifier],
    isFirstLoad: Boolean,
    hdfsTempLocation: String,
    lockType: String,
    zooKeeperUrl: String,
    serializationNullFormat: String,
    defaultTimestampFormat: String,
    defaultDateFormat: String) extends Serializable

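/**
 * The distinct values gathered for one column, together with the number of
 * rows they were gathered from.
 */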
case class ColumnDistinctValues(values: Array[String], rowCount: Long) extends Serializable

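/**
 * A mutable, reusable [[Row]] backed by a string array: `setValues` refills
 * the backing array in place so a single instance can be recycled across an
 * iterator, avoiding per-row allocation; `copy()` returns an independent
 * snapshot.
 *
 * A minimal usage sketch (the names are illustrative):
 * {{{
 *   val row = new StringArrayRow(new Array[String](numColumns))
 *   iter.map(values => row.setValues(values))
 * }}}
 */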
class StringArrayRow(var values: Array[String]) extends Row {

  override def length: Int = values.length

  override def get(i: Int): Any = values(i)

  override def getString(i: Int): String = values(i)

  // null out the backing array before it is refilled
  private def reset(): Unit = {
    for (i <- values.indices) {
      values(i) = null
    }
  }

  override def copy(): Row = {
    // deep-copy the backing array so the snapshot is independent of this instance
    val tmpValues = new Array[String](values.length)
    System.arraycopy(values, 0, tmpValues, 0, values.length)
    new StringArrayRow(tmpValues)
  }

  // refill the backing array in place and return this (reused) instance
  def setValues(values: Array[String]): StringArrayRow = {
    reset()
    if (values != null) {
      val minLength = Math.min(this.values.length, values.length)
      System.arraycopy(values, 0, this.values, 0, minLength)
    }
    this
  }
}