/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.carbondata.spark.testsuite.dataload

import java.io.{BufferedWriter, File, FileWriter, FilenameFilter}

import org.apache.spark.sql.Row
import org.apache.spark.sql.test.util.QueryTest
import org.scalatest.BeforeAndAfterAll

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.datamap.Segment
import org.apache.carbondata.core.datastore.impl.FileFactory
import org.apache.carbondata.core.indexstore.blockletindex.SegmentIndexFileStore
import org.apache.carbondata.core.metadata.{CarbonMetadata, SegmentFileStore}
import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.util.CarbonProperties
import org.apache.carbondata.core.util.path.CarbonTablePath
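
/**
 * Test cases for loading data with SORT_SCOPE=BATCH_SORT, configured either through
 * load options ('batch_sort_size_inmb', 'single_pass') or through carbon properties,
 * verified by row counts and by the number of index files produced per segment.
 */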
class TestBatchSortDataLoad extends QueryTest with BeforeAndAfterAll {

  var filePath: String = _

  // Writes a CSV with a header row and 100,000 data rows; every column cycles
  // through 1,000 distinct values, and the writer is flushed every 10,000 rows.
  def buildTestData(): Unit = {
    filePath = s"$integrationPath/spark-common-test/target/big.csv"
    val file = new File(filePath)
    val writer = new BufferedWriter(new FileWriter(file))
    writer.write("c1,c2,c3, c4, c5, c6, c7, c8, c9, c10")
    writer.newLine()
    for (i <- 0 until 100000) {
      writer.write("a" + i % 1000 + "," +
          "b" + i % 1000 + "," +
          "c" + i % 1000 + "," +
          "d" + i % 1000 + "," +
          "e" + i % 1000 + "," +
          "f" + i % 1000 + "," +
          i % 1000 + "," +
          i % 1000 + "," +
          i % 1000 + "," +
          i % 1000 + "\n")
      if (i % 10000 == 0) {
        writer.flush()
      }
    }
    writer.close()
  }

  def dropTable(): Unit = {
    sql("DROP TABLE IF EXISTS carbon_load1")
    sql("DROP TABLE IF EXISTS carbon_load2")
    sql("DROP TABLE IF EXISTS carbon_load3")
    sql("DROP TABLE IF EXISTS carbon_load4")
    sql("DROP TABLE IF EXISTS carbon_load5")
    sql("DROP TABLE IF EXISTS carbon_load6")
  }

  override def beforeAll(): Unit = {
    dropTable()
    buildTestData()
  }
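
  // With 100,000 rows of roughly 45 bytes each, the generated CSV is about 5 MB, so a
  // 'batch_sort_size_inmb' of 1 should yield five sorted batches and hence the five
  // index files asserted below (an inference from the data size, not a guarantee).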
test("test batch sort load by passing option to load command") {
sql(
"""
| CREATE TABLE carbon_load1(c1 string, c2 string, c3 string, c4 string, c5 string,
| c6 string, c7 int, c8 int, c9 int, c10 int)
| STORED BY 'org.apache.carbondata.format'
| TBLPROPERTIES('dictionary_include'='c1,c2,c3,c4,c5,c6',
| 'sort_scope'='batch_sort', 'sort_columns'='c1,c2,c3,c4,c5,c6')
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load1 " +
s"OPTIONS('batch_sort_size_inmb'='1')")
checkAnswer(sql("select count(*) from carbon_load1"), Seq(Row(100000)))
assert(getIndexfileCount("carbon_load1") == 5, "Something wrong in batch sort")
}
test("test batch sort load by passing option to load command and compare with normal load") {
sql(
"""
| CREATE TABLE carbon_load2(c1 string, c2 string, c3 string, c4 string, c5 string,
| c6 string, c7 int, c8 int, c9 int, c10 int)
| STORED BY 'org.apache.carbondata.format'
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load2 ")
checkAnswer(sql("select * from carbon_load1 where c1='a1' order by c1"),
sql("select * from carbon_load2 where c1='a1' order by c1"))
}
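
  // Three further batch-sorted loads bring carbon_load1 to four segments; a major
  // compaction should merge them into segment '0.1' with a single merged index file.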
test("test batch sort load by passing option and compaction") {
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load1 " +
s"OPTIONS('batch_sort_size_inmb'='1')")
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load1 " +
s"OPTIONS('batch_sort_size_inmb'='1')")
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load1 " +
s"OPTIONS('batch_sort_size_inmb'='1')")
sql("alter table carbon_load1 compact 'major'")
Thread.sleep(4000)
checkAnswer(sql("select count(*) from carbon_load1"), Seq(Row(400000)))
assert(getIndexfileCount("carbon_load1", "0.1") == 1, "Something wrong in compaction after batch sort")
}
test("test batch sort load by passing option with single pass") {
sql(
"""
| CREATE TABLE carbon_load3(c1 string, c2 string, c3 string, c4 string, c5 string,
| c6 string, c7 int, c8 int, c9 int, c10 int)
| STORED BY 'org.apache.carbondata.format'
| TBLPROPERTIES('dictionary_include'='c1,c2,c3,c4,c5,c6',
| 'sort_scope'='batch_sort', 'sort_columns'='c1,c2,c3,c4,c5,c6')
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load3 " +
s"OPTIONS('batch_sort_size_inmb'='1', 'single_pass'='true')")
checkAnswer(sql("select count(*) from carbon_load3"), Seq(Row(100000)))
assert(getIndexfileCount("carbon_load3") == 5, "Something wrong in batch sort")
checkAnswer(sql("select * from carbon_load3 where c1='a1' order by c1"),
sql("select * from carbon_load2 where c1='a1' order by c1"))
}
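
  // The next two tests enable BATCH_SORT through carbon properties rather than
  // per-load options, so the modified properties are restored at the end of each test.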
test("test batch sort load by with out passing option but through carbon properties") {
CarbonProperties.getInstance().addProperty(CarbonCommonConstants.LOAD_SORT_SCOPE, "BATCH_SORT")
CarbonProperties.getInstance().addProperty(CarbonCommonConstants.LOAD_BATCH_SORT_SIZE_INMB, "1")
sql(
"""
| CREATE TABLE carbon_load4(c1 string, c2 string, c3 string, c4 string, c5 string,
| c6 string, c7 int, c8 int, c9 int, c10 int)
| STORED BY 'org.apache.carbondata.format'
| TBLPROPERTIES('dictionary_include'='c1,c2,c3,c4,c5,c6', 'sort_columns'='c1,c2,c3,c4,c5,c6')
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load4 " )
checkAnswer(sql("select count(*) from carbon_load4"), Seq(Row(100000)))
assert(getIndexfileCount("carbon_load4") == 5, "Something wrong in batch sort")
CarbonProperties.getInstance().
addProperty(CarbonCommonConstants.LOAD_SORT_SCOPE,
CarbonCommonConstants.LOAD_SORT_SCOPE_DEFAULT)
CarbonProperties.getInstance().addProperty(CarbonCommonConstants.LOAD_BATCH_SORT_SIZE_INMB, "0")
}
test("test batch sort load by with out passing option but through carbon properties with default size") {
CarbonProperties.getInstance().addProperty(CarbonCommonConstants.LOAD_SORT_SCOPE, "BATCH_SORT")
sql(
"""
| CREATE TABLE carbon_load6(c1 string, c2 string, c3 string, c4 string, c5 string,
| c6 string, c7 int, c8 int, c9 int, c10 int)
| STORED BY 'org.apache.carbondata.format'
| TBLPROPERTIES('dictionary_include'='c1,c2,c3,c4,c5,c6')
""".stripMargin)
sql(s"LOAD DATA LOCAL INPATH '$filePath' into table carbon_load6 " )
checkAnswer(sql("select count(*) from carbon_load6"), Seq(Row(100000)))
assert(getIndexfileCount("carbon_load6") == 1, "Something wrong in batch sort")
CarbonProperties.getInstance().
addProperty(CarbonCommonConstants.LOAD_SORT_SCOPE,
CarbonCommonConstants.LOAD_SORT_SCOPE_DEFAULT)
}
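
  /**
   * Returns the number of index files in the given segment, read either directly from
   * the segment directory or, when that directory does not exist, from the segment's
   * SegmentFileStore metadata.
   */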
  def getIndexfileCount(tableName: String, segmentNo: String = "0"): Int = {
    val carbonTable = CarbonMetadata.getInstance().getCarbonTable("default", tableName)
    val segmentDir = CarbonTablePath.getSegmentPath(carbonTable.getTablePath, segmentNo)
    if (FileFactory.isFileExist(segmentDir)) {
      new SegmentIndexFileStore().getIndexFilesFromSegment(segmentDir).size()
    } else {
      val segment = Segment.getSegment(segmentNo, carbonTable.getTablePath)
      new SegmentFileStore(carbonTable.getTablePath, segment.getSegmentFileName)
        .getIndexCarbonFiles.size()
    }
  }

  override def afterAll(): Unit = {
    dropTable()
    new File(filePath).delete()
    CarbonProperties.getInstance()
      .addProperty(CarbonCommonConstants.LOAD_SORT_SCOPE,
        CarbonCommonConstants.LOAD_SORT_SCOPE_DEFAULT)
  }
}