| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| package org.apache.carbondata.integration.spark.testsuite.dataload |
| |
| import java.math.BigDecimal |
| |
| import org.apache.spark.sql.Row |
| import org.apache.spark.sql.test.util.QueryTest |
| import org.scalatest.BeforeAndAfterEach |
| |
| import org.apache.carbondata.core.util.path.CarbonTablePath |
| import org.apache.carbondata.core.datastore.impl.FileFactory |
| import org.apache.carbondata.core.metadata.CarbonMetadata |
| import org.apache.carbondata.core.constants.{CarbonCommonConstants, CarbonLoadOptionConstants} |
| import org.apache.carbondata.core.index.Segment |
| import org.apache.carbondata.core.util.CarbonProperties |
| import org.apache.carbondata.spark.util.BadRecordUtil |
| import org.apache.commons.io.FileUtils |
| import org.apache.commons.lang3.RandomStringUtils |
| |
| class TestLoadDataGeneral extends QueryTest with BeforeAndAfterEach { |
| |
| val badRecordAction = CarbonProperties.getInstance() |
| .getProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION); |
| val testdata =s"$resourcesPath/MoreThan32KChar.csv" |
| val longChar: String = RandomStringUtils.randomAlphabetic(33000) |
| |
| override def beforeEach { |
| sql("DROP TABLE IF EXISTS loadtest") |
| sql( |
| """ |
| | CREATE TABLE loadtest(id int, name string, city string, age int) |
| | STORED AS carbondata |
| """.stripMargin) |
| sql("drop table if exists longerThan32kChar") |
| } |
| |
| private def checkSegmentExists( |
| segmentId: String, |
| databaseName: String, |
| tableName: String): Boolean = { |
| val carbonTable = CarbonMetadata.getInstance().getCarbonTable(databaseName, tableName) |
| val partitionPath = |
| CarbonTablePath.getPartitionDir(carbonTable.getAbsoluteTableIdentifier.getTablePath) |
| val segment = Segment.getSegment(segmentId, carbonTable.getAbsoluteTableIdentifier.getTablePath) |
| segment != null |
| } |
| |
| test("test data loading CSV file") { |
| val testData = s"$resourcesPath/sample.csv" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(6)) |
| ) |
| } |
| |
| test("test data loading CSV file without extension name") { |
| val testData = s"$resourcesPath/sample" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(4)) |
| ) |
| } |
| |
| test("test data loading GZIP compressed CSV file") { |
| val testData = s"$resourcesPath/sample.csv.gz" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(4)) |
| ) |
| } |
| |
| test("test data loading BZIP2 compressed CSV file") { |
| val testData = s"$resourcesPath/sample.csv.bz2" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(4)) |
| ) |
| } |
| |
| test("test data loading CSV file with delimiter char \\017") { |
| val testData = s"$resourcesPath/sample_withDelimiter017.csv" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest options ('delimiter'='\\017')") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(4)) |
| ) |
| } |
| |
| test("test data loading with invalid values for mesasures") { |
| val testData = s"$resourcesPath/invalidMeasures.csv" |
| sql("drop table if exists invalidMeasures") |
| sql("CREATE TABLE invalidMeasures (country String, salary double, age decimal(10,2)) STORED AS carbondata") |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table invalidMeasures options('Fileheader'='country,salary,age')") |
| checkAnswer( |
| sql("SELECT * FROM invalidMeasures"), |
| Seq(Row("India",null,new BigDecimal("22.44")), Row("Russia",null,null), Row("USA",234.43,null)) |
| ) |
| } |
| |
| test("test data loading into table whose name has '_'") { |
| sql("DROP TABLE IF EXISTS load_test") |
| sql(""" CREATE TABLE load_test(id int, name string, city string, age int) |
| STORED AS carbondata """) |
| val testData = s"$resourcesPath/sample.csv" |
| try { |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table load_test") |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table load_test") |
| } catch { |
| case ex: Exception => |
| assert(false) |
| } |
| assert(checkSegmentExists("0", "default", "load_test")) |
| assert(checkSegmentExists("1", "default", "load_test")) |
| sql("DROP TABLE load_test") |
| } |
| |
| test("test load data with decimal type and sort intermediate files as 1") { |
| sql("drop table if exists carbon_table") |
| sql("drop table if exists carbonBigDecimalLoad") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.SORT_INTERMEDIATE_FILES_LIMIT, "1") |
| .addProperty(CarbonCommonConstants.SORT_SIZE, "1") |
| .addProperty(CarbonCommonConstants.DATA_LOAD_BATCH_SIZE, "1") |
| sql("create table if not exists carbonBigDecimalLoad (ID Int, date Timestamp, country String, name String, phonetype String, serialname String, salary decimal(27, 10)) STORED AS carbondata") |
| sql(s"LOAD DATA LOCAL INPATH '$resourcesPath/decimalBoundaryDataCarbon.csv' into table carbonBigDecimalLoad") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.SORT_INTERMEDIATE_FILES_LIMIT, |
| CarbonCommonConstants.SORT_INTERMEDIATE_FILES_LIMIT_DEFAULT_VALUE) |
| .addProperty(CarbonCommonConstants.SORT_SIZE, CarbonCommonConstants.SORT_SIZE_DEFAULT_VAL) |
| .addProperty(CarbonCommonConstants.DATA_LOAD_BATCH_SIZE, |
| CarbonCommonConstants.DATA_LOAD_BATCH_SIZE_DEFAULT) |
| sql("drop table if exists carbon_table") |
| } |
| |
| private def createTableAndLoadData (badRecordAction: String): Unit = { |
| BadRecordUtil.cleanBadRecordPath("default", "longerthan32kchar") |
| sql("CREATE TABLE longerthan32kchar(dim1 String, dim2 String, mes1 int) STORED AS carbondata") |
| sql(s"LOAD DATA LOCAL INPATH '$testdata' into table longerThan32kChar OPTIONS('FILEHEADER'='dim1,dim2,mes1', " + |
| s"'BAD_RECORDS_ACTION'='${badRecordAction}','BAD_RECORDS_LOGGER_ENABLE'='TRUE')") |
| } |
| |
| test("test load / insert / update with data more than 32000 characters and bad record action as Redirect") { |
| createTableAndLoadData("REDIRECT") |
| var redirectCsvPath = BadRecordUtil |
| .getRedirectCsvPath("default", "longerthan32kchar", "0", "0") |
| assert(BadRecordUtil.checkRedirectedCsvContentAvailableInSource(testdata, redirectCsvPath)) |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "REDIRECT"); |
| sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") |
| checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2))) |
| redirectCsvPath = BadRecordUtil.getRedirectCsvPath("default", "longerthan32kchar", "1", "0") |
| var redirectedFileLineList = FileUtils.readLines(redirectCsvPath) |
| var iterator = redirectedFileLineList.iterator() |
| while (iterator.hasNext) { |
| assert(iterator.next().equals("33000,"+longChar+",4")) |
| } |
| |
| // Update strings of length greater than 32000 |
| sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + |
| "where longerthan32kchar.mes1=1").show() |
| checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("itsok", "hello", 2))) |
| redirectCsvPath = BadRecordUtil.getRedirectCsvPath("default", "longerthan32kchar", "0", "1") |
| redirectedFileLineList = FileUtils.readLines(redirectCsvPath) |
| iterator = redirectedFileLineList.iterator() |
| while (iterator.hasNext) { |
| assert(iterator.next().equals("ok,"+longChar+",1")) |
| } |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") |
| |
| // Insert longer string without converter step will throw exception |
| intercept[Exception] { |
| sql(s"insert into longerthan32kchar values('32000', '$longChar', 3)") |
| } |
| BadRecordUtil.cleanBadRecordPath("default", "longerthan32kchar") |
| } |
| |
| test("test load / insert / update with data more than 32000 characters and bad record action as Force") { |
| createTableAndLoadData("FORCE") |
| checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2), Row("32123", null, 3))) |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FORCE"); |
| sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") |
| checkAnswer(sql("select * from longerthan32kchar"), |
| Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2), Row("32123", null, 3), Row("33000", null, 4))) |
| |
| // Update strings of length greater than 32000 |
| sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + |
| "where longerthan32kchar.mes1=1").show() |
| checkAnswer(sql("select * from longerthan32kchar"), |
| Seq(Row("ok", null, 1), Row("itsok", "hello", 2), Row("32123", null, 3), Row("33000", null, 4))) |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") |
| |
| // Insert longer string without converter step will throw exception |
| intercept[Exception] { |
| sql(s"insert into longerthan32kchar values('32000', '$longChar', 3)") |
| } |
| } |
| |
| test("test load / insert / update with data more than 32000 characters and bad record action as Fail") { |
| sql("CREATE TABLE longerthan32kchar(dim1 String, dim2 String, mes1 int) STORED AS carbondata") |
| var exception = intercept[Exception] { |
| sql(s"LOAD DATA LOCAL INPATH '$testdata' into table longerThan32kChar OPTIONS('FILEHEADER'='dim1,dim2,mes1', " + |
| s"'BAD_RECORDS_ACTION'='FAIL','BAD_RECORDS_LOGGER_ENABLE'='TRUE')") |
| } |
| |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "FAIL"); |
| exception = intercept[Exception] { |
| sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") |
| } |
| assert(exception.getMessage.contains(s"Record [33000, $longChar, 4] of column dim2 exceeded " + |
| s"${CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT} characters. Please consider long string data type.")) |
| // Update strings of length greater than 32000 |
| sql(s"insert into longerthan32kchar values('ok', 'hi', 1)") |
| exception = intercept[Exception] { |
| sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + |
| "where longerthan32kchar.mes1=1").show() |
| } |
| assert(exception.getMessage.contains(s"Record [ok, $longChar, 1] of column dim2 exceeded " + |
| s"${CarbonCommonConstants.MAX_CHARS_PER_COLUMN_DEFAULT} characters. Please consider long string data type.")) |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") |
| |
| // Insert longer string without converter step will throw exception |
| intercept[Exception] { |
| sql(s"insert into longerthan32kchar values('32000', '$longChar', 3)") |
| } |
| } |
| |
| test("test load / insert / update with data more than 32000 characters and bad record action as Ignore") { |
| createTableAndLoadData("IGNORE") |
| checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2))) |
| |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "true") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, "IGNORE"); |
| sql(s"insert into longerthan32kchar values('33000', '$longChar', 4)") |
| checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("ok", "hi", 1), Row("itsok", "hello", 2))) |
| |
| // Update strings of length greater than 32000 |
| sql(s"update longerthan32kchar set(longerthan32kchar.dim2)=('$longChar') " + |
| "where longerthan32kchar.mes1=1").show() |
| checkAnswer(sql("select * from longerthan32kchar"), Seq(Row("itsok", "hello", 2))) |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_ENABLE_BAD_RECORD_HANDLING_FOR_INSERT, "false") |
| |
| // Insert longer string without converter step will throw exception |
| intercept[Exception] { |
| sql(s"insert into longerthan32kchar values('32000', '$longChar', 3)") |
| } |
| } |
| |
| test("test load / insert with data more than 32000 bytes - dictionary_exclude") { |
| val testdata = s"$resourcesPath/unicodechar.csv" |
| sql("drop table if exists load32000bytes") |
| sql("create table load32000bytes(name string) STORED AS carbondata") |
| sql("insert into table load32000bytes select 'aaa'") |
| checkAnswer(sql("select count(*) from load32000bytes"), Seq(Row(1))) |
| |
| // Below load will be inserted as null because Strings greater than 32000 is bad record. |
| sql(s"load data local inpath '$testdata' into table load32000bytes OPTIONS ('FILEHEADER'='name')") |
| checkAnswer(sql("select count(*) from load32000bytes"), Seq(Row(2))) |
| checkAnswer(sql("select * from load32000bytes"), Seq(Row("aaa"), Row(null))) |
| |
| val source = scala.io.Source.fromFile(testdata, CarbonCommonConstants.DEFAULT_CHARSET) |
| val data = source.mkString |
| |
| // Insert will throw exception as it is without converter step. |
| intercept[Exception] { |
| sql(s"insert into load32000bytes values('$data')") |
| } |
| |
| sql("drop table if exists load32000bytes") |
| } |
| |
| test("test if stale folders are deleting on data load") { |
| sql("drop table if exists stale") |
| sql("create table stale(a string) STORED AS carbondata") |
| sql("insert into stale values('k')") |
| val carbonTable = CarbonMetadata.getInstance().getCarbonTable("default", "stale") |
| val tableStatusFile = CarbonTablePath.getTableStatusFilePath(carbonTable.getTablePath) |
| FileFactory.getCarbonFile(tableStatusFile).delete() |
| sql("insert into stale values('k')") |
| checkAnswer(sql("select * from stale"), Row("k")) |
| } |
| |
| test("test data loading with directly writing fact data to hdfs") { |
| val originStatus = CarbonProperties.getInstance().getProperty( |
| CarbonLoadOptionConstants.ENABLE_CARBON_LOAD_DIRECT_WRITE_TO_STORE_PATH, |
| CarbonLoadOptionConstants.ENABLE_CARBON_LOAD_DIRECT_WRITE_TO_STORE_PATH_DEFAULT) |
| CarbonProperties.getInstance().addProperty( |
| CarbonLoadOptionConstants.ENABLE_CARBON_LOAD_DIRECT_WRITE_TO_STORE_PATH, "true") |
| |
| val testData = s"$resourcesPath/sample.csv" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(6)) |
| ) |
| |
| CarbonProperties.getInstance().addProperty( |
| CarbonLoadOptionConstants.ENABLE_CARBON_LOAD_DIRECT_WRITE_TO_STORE_PATH, |
| originStatus) |
| } |
| |
| test("test data loading with page size less than 32000") { |
| CarbonProperties.getInstance().addProperty( |
| CarbonCommonConstants.BLOCKLET_SIZE, "16000") |
| |
| val testData = s"$resourcesPath/sample.csv" |
| sql(s"LOAD DATA LOCAL INPATH '$testData' into table loadtest") |
| checkAnswer( |
| sql("SELECT COUNT(*) FROM loadtest"), |
| Seq(Row(6)) |
| ) |
| |
| CarbonProperties.getInstance().addProperty(CarbonCommonConstants.BLOCKLET_SIZE, |
| CarbonCommonConstants.BLOCKLET_SIZE_DEFAULT_VAL) |
| } |
| |
| test("test table creation with special char and other commands") { |
| sql("drop table if exists special_char") |
| sql("create table special_char(`i#d` string, `nam(e` string,`ci)&#@!ty` string,`a\be` int, `ag!e` float, `na^me1` Decimal(8,4)) stored as carbondata") |
| sql("insert into special_char values('1','joey','hud', 2, 2.2, 2.3456)") |
| checkAnswer(sql("select * from special_char"), Seq(Row("1","joey","hud", 2, 2.2, 2.3456))) |
| val df = sql("describe formatted special_char").collect() |
| assert(df.exists(_.get(0).toString.contains("i#d"))) |
| assert(df.exists(_.get(0).toString.contains("nam(e"))) |
| assert(df.exists(_.get(0).toString.contains("ci)&#@!ty"))) |
| assert(df.exists(_.get(0).toString.contains("a\be"))) |
| assert(df.exists(_.get(0).toString.contains("ag!e"))) |
| assert(df.exists(_.get(0).toString.contains("na^me1"))) |
| } |
| |
| override def afterEach { |
| sql("DROP TABLE if exists loadtest") |
| sql("drop table if exists invalidMeasures") |
| sql("drop table if exists longerThan32kChar") |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.SORT_INTERMEDIATE_FILES_LIMIT, |
| CarbonCommonConstants.SORT_INTERMEDIATE_FILES_LIMIT_DEFAULT_VALUE) |
| .addProperty(CarbonCommonConstants.SORT_SIZE, CarbonCommonConstants.SORT_SIZE_DEFAULT_VAL) |
| .addProperty(CarbonCommonConstants.DATA_LOAD_BATCH_SIZE, |
| CarbonCommonConstants.DATA_LOAD_BATCH_SIZE_DEFAULT) |
| CarbonProperties.getInstance() |
| .addProperty(CarbonCommonConstants.CARBON_BAD_RECORDS_ACTION, badRecordAction); |
| } |
| } |