/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.tools.convert;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.StringReader;
import java.util.Locale;

import static org.apache.orc.tools.convert.ConvertTool.DEFAULT_TIMESTAMP_FORMAT;
import static org.junit.Assert.assertEquals;

public class TestCsvReader {

  Locale defaultLocale;

  @Before
  public void storeDefaultLocale() {
    defaultLocale = Locale.getDefault();
    Locale.setDefault(Locale.US);
  }

  @After
  public void restoreDefaultLocale() {
    Locale.setDefault(defaultLocale);
  }
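
  // Exercises the common scalar types (int, double, decimal, string, boolean,
  // timestamp) with the default timestamp format, and reads the 8 input rows
  // in two batches of 5 and 3.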
  @Test
  public void testSimple() throws Exception {
    // yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]
    StringReader input = new StringReader(
        "1,1.25,1.01,'a',f,'2000-01-01T00:00:00+00:00'\n" +
        "2,2.5,2.02,'14',t,'2000/01/01T00:00:00+00'\n" +
        "3,3.75,3.03,'1e',false,'2000-01-01T00:00:00Z'\n" +
        "4,5,4.04,'28',true,'2000-01-01 00:00:00+00'\n" +
        "5,6.25,5.05,'32',0,'2000-01-01 00:00:00-00'\n" +
        "6,7.5,6.06,'3c',1,'2000-01-01T04:00:00+04'\n" +
        "7,8.75,7.07,'46',2,'1999-12-31T20:00:00-04:00'\n" +
        "8,10,8.08,'50',t,'2000-01-01T00:00:00+00'\n"
    );
    TypeDescription schema = TypeDescription.fromString(
"struct<a:int,b:double,c:decimal(10,2),d:string,e:boolean,e:timestamp>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch(5);
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(5, batch.size);
    long bool = 0;
    for(int r = 0; r < batch.size; ++r) {
      assertEquals(r+1, ((LongColumnVector) batch.cols[0]).vector[r]);
      assertEquals(1.25 * (r + 1), ((DoubleColumnVector) batch.cols[1]).vector[r], 0.001);
      assertEquals((r + 1) + ".0" + (r + 1), ((DecimalColumnVector) batch.cols[2]).vector[r].toFormatString(2));
      assertEquals(Integer.toHexString((r + 1) * 10), ((BytesColumnVector) batch.cols[3]).toString(r));
      assertEquals(bool, ((LongColumnVector) batch.cols[4]).vector[r]);
      bool = 1 - bool;
      assertEquals(946684800000L, ((TimestampColumnVector) batch.cols[5]).getTime(r));
    }
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(3, batch.size);
    for(int r = 0; r < batch.size; ++r) {
      assertEquals(r + 6, ((LongColumnVector) batch.cols[0]).vector[r]);
      assertEquals(1.25 * (r + 6), ((DoubleColumnVector) batch.cols[1]).vector[r], 0.001);
      assertEquals((r + 6) + ".0" + (r + 6), ((DecimalColumnVector) batch.cols[2]).vector[r].toFormatString(2));
      assertEquals(Integer.toHexString((r + 6) * 10), ((BytesColumnVector) batch.cols[3]).toString(r));
      assertEquals(bool, ((LongColumnVector) batch.cols[4]).vector[r]);
      bool = 1 - bool;
      assertEquals(946684800000L, ((TimestampColumnVector) batch.cols[5]).getTime(r));
    }
    assertEquals(false, reader.nextBatch(batch));
  }
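
  // Fields matching the configured null string ("null") should come back with
  // isNull set for that row and noNulls cleared on every column.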
  @Test
  public void testNulls() throws Exception {
    StringReader input = new StringReader(
        "1,1,1,'a'\n" +
        "'null','null','null','null'\n" +
        "3,3,3,'row 3'\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:int,b:double,c:decimal(10,2),d:string>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch();
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(3, batch.size);
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, false, batch.cols[c].noNulls);
    }
    // check row 0
    assertEquals(1, ((LongColumnVector) batch.cols[0]).vector[0]);
    assertEquals(1, ((DoubleColumnVector) batch.cols[1]).vector[0], 0.001);
    assertEquals("1", ((DecimalColumnVector) batch.cols[2]).vector[0].toString());
    assertEquals("a", ((BytesColumnVector) batch.cols[3]).toString(0));
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, false, batch.cols[c].isNull[0]);
    }
    // check row 1
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, true, batch.cols[c].isNull[1]);
    }
    // check row 2
    assertEquals(3, ((LongColumnVector) batch.cols[0]).vector[2]);
    assertEquals(3, ((DoubleColumnVector) batch.cols[1]).vector[2], 0.001);
    assertEquals("3", ((DecimalColumnVector) batch.cols[2]).vector[2].toString());
    assertEquals("row 3", ((BytesColumnVector) batch.cols[3]).toString(2));
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, false, batch.cols[c].isNull[2]);
    }
  }
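
  // Flat CSV fields are mapped onto a nested struct schema in declaration
  // order: the second and third fields of each row fill b.c and b.d.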
  @Test
  public void testStructs() throws Exception {
    StringReader input = new StringReader(
        "1,2,3,4\n" +
        "5,6,7,8\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:int,b:struct<c:int,d:int>,e:int>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch();
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(2, batch.size);
    int nextVal = 1;
    for(int r=0; r < 2; ++r) {
      assertEquals("row " + r, nextVal++, ((LongColumnVector) batch.cols[0]).vector[r]);
      StructColumnVector b = (StructColumnVector) batch.cols[1];
      assertEquals("row " + r, nextVal++, ((LongColumnVector) b.fields[0]).vector[r]);
      assertEquals("row " + r, nextVal++, ((LongColumnVector) b.fields[1]).vector[r]);
      assertEquals("row " + r, nextVal++, ((LongColumnVector) batch.cols[2]).vector[r]);
    }
    assertEquals(false, reader.nextBatch(batch));
  }
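
  // Values near Integer.MAX_VALUE/MIN_VALUE and Long.MAX_VALUE/MIN_VALUE must
  // survive conversion without overflow or truncation.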
  @Test
  public void testLargeNumbers() throws Exception {
    StringReader input = new StringReader(
        "2147483646,-2147483647,9223372036854775806,-9223372036854775807\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:int,b:int,d:bigint,e:bigint>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch();
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(1, batch.size);
    assertEquals(2147483646, ((LongColumnVector) batch.cols[0]).vector[0]);
    assertEquals(-2147483647, ((LongColumnVector) batch.cols[1]).vector[0]);
    assertEquals(9223372036854775806L, ((LongColumnVector) batch.cols[2]).vector[0]);
    assertEquals(-9223372036854775807L, ((LongColumnVector) batch.cols[3]).vector[0]);
    assertEquals(false, reader.nextBatch(batch));
  }
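
  // A user-supplied timestamp pattern (here with an optional second day
  // digit, d[d]) is passed to CsvReader in place of DEFAULT_TIMESTAMP_FORMAT.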
  @Test
  public void testCustomTimestampFormat() throws Exception {
    String tsFormat = "d[d] MMM yyyy HH:mm:ss.SSSSSS";
    StringReader input = new StringReader(
        "'21 Mar 2018 12:23:34.123456'\n" +
        "'3 Feb 2018 18:04:51.456789'\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:timestamp>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "", tsFormat);
    VectorizedRowBatch batch = schema.createRowBatch(2);
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(2, batch.size);
    TimestampColumnVector cv = (TimestampColumnVector) batch.cols[0];
    assertEquals("2018-03-21 12:23:34.123456", cv.asScratchTimestamp(0).toString());
    assertEquals("2018-02-03 18:04:51.456789", cv.asScratchTimestamp(1).toString());
  }
}