/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements. See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership. The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

package org.apache.orc.tools.convert;

import org.apache.hadoop.hive.ql.exec.vector.BytesColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DecimalColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.DoubleColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.LongColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.StructColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.orc.RecordReader;
import org.apache.orc.TypeDescription;
import org.junit.After;
import org.junit.Before;
import org.junit.Test;

import java.io.StringReader;
import java.util.Locale;

import static org.apache.orc.tools.convert.ConvertTool.DEFAULT_TIMESTAMP_FORMAT;
import static org.junit.Assert.assertEquals;

public class TestCsvReader {

  Locale defaultLocale;

  @Before
  public void storeDefaultLocale() {
    defaultLocale = Locale.getDefault();
    Locale.setDefault(Locale.US);
  }

  @After
  public void restoreDefaultLocale() {
    Locale.setDefault(defaultLocale);
  }
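
  // Exercises the common scalar types (int, double, decimal, string, boolean,
  // timestamp) with the default timestamp format, and reads the 8 input rows
  // in two batches of 5 and 3.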
  @Test
  public void testSimple() throws Exception {
    // yyyy[[-][/]]MM[[-][/]]dd[['T'][ ]]HH:mm:ss[ ][XXX][X]
    StringReader input = new StringReader(
        "1,1.25,1.01,'a',f,'2000-01-01T00:00:00+00:00'\n" +
        "2,2.5,2.02,'14',t,'2000/01/01T00:00:00+00'\n" +
        "3,3.75,3.03,'1e',false,'2000-01-01T00:00:00Z'\n" +
        "4,5,4.04,'28',true,'2000-01-01 00:00:00+00'\n" +
        "5,6.25,5.05,'32',0,'2000-01-01 00:00:00-00'\n" +
        "6,7.5,6.06,'3c',1,'2000-01-01T04:00:00+04'\n" +
        "7,8.75,7.07,'46',2,'1999-12-31T20:00:00-04:00'\n" +
        "8,10,8.08,'50',t,'2000-01-01T00:00:00+00'\n"
    );
    TypeDescription schema = TypeDescription.fromString(
"struct<a:int,b:double,c:decimal(10,2),d:string,e:boolean,e:timestamp>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch(5);
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(5, batch.size);
    long bool = 0;
    for(int r = 0; r < batch.size; ++r) {
      assertEquals(r+1, ((LongColumnVector) batch.cols[0]).vector[r]);
      assertEquals(1.25 * (r + 1), ((DoubleColumnVector) batch.cols[1]).vector[r], 0.001);
      assertEquals((r + 1) + ".0" + (r + 1), ((DecimalColumnVector) batch.cols[2]).vector[r].toFormatString(2));
      assertEquals(Integer.toHexString((r + 1) * 10), ((BytesColumnVector) batch.cols[3]).toString(r));
      assertEquals(bool, ((LongColumnVector) batch.cols[4]).vector[r]);
      bool = 1 - bool;
      assertEquals(946684800000L, ((TimestampColumnVector) batch.cols[5]).getTime(r));
    }
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(3, batch.size);
    for(int r = 0; r < batch.size; ++r) {
      assertEquals(r + 6, ((LongColumnVector) batch.cols[0]).vector[r]);
      assertEquals(1.25 * (r + 6), ((DoubleColumnVector) batch.cols[1]).vector[r], 0.001);
      assertEquals((r + 6) + ".0" + (r + 6), ((DecimalColumnVector) batch.cols[2]).vector[r].toFormatString(2));
      assertEquals(Integer.toHexString((r + 6) * 10), ((BytesColumnVector) batch.cols[3]).toString(r));
      assertEquals(bool, ((LongColumnVector) batch.cols[4]).vector[r]);
      bool = 1 - bool;
      assertEquals(946684800000L, ((TimestampColumnVector) batch.cols[5]).getTime(r));
    }
    assertEquals(false, reader.nextBatch(batch));
  }
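
  // Fields matching the configured null string ("null") should come back with
  // isNull set for that row and noNulls cleared on every column.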
  @Test
  public void testNulls() throws Exception {
    StringReader input = new StringReader(
        "1,1,1,'a'\n" +
        "'null','null','null','null'\n" +
        "3,3,3,'row 3'\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:int,b:double,c:decimal(10,2),d:string>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch();
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(3, batch.size);
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, false, batch.cols[c].noNulls);
    }
    // check row 0
    assertEquals(1, ((LongColumnVector) batch.cols[0]).vector[0]);
    assertEquals(1, ((DoubleColumnVector) batch.cols[1]).vector[0], 0.001);
    assertEquals("1", ((DecimalColumnVector) batch.cols[2]).vector[0].toString());
    assertEquals("a", ((BytesColumnVector) batch.cols[3]).toString(0));
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, false, batch.cols[c].isNull[0]);
    }
    // check row 1
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, true, batch.cols[c].isNull[1]);
    }
    // check row 2
    assertEquals(3, ((LongColumnVector) batch.cols[0]).vector[2]);
    assertEquals(3, ((DoubleColumnVector) batch.cols[1]).vector[2], 0.001);
    assertEquals("3", ((DecimalColumnVector) batch.cols[2]).vector[2].toString());
    assertEquals("row 3", ((BytesColumnVector) batch.cols[3]).toString(2));
    for(int c=0; c < 4; ++c) {
      assertEquals("column " + c, false, batch.cols[c].isNull[2]);
    }
  }
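
  // Flat CSV fields are mapped onto a nested struct schema in declaration
  // order: the second and third fields of each row fill b.c and b.d.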
  @Test
  public void testStructs() throws Exception {
    StringReader input = new StringReader(
        "1,2,3,4\n" +
        "5,6,7,8\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:int,b:struct<c:int,d:int>,e:int>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch();
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(2, batch.size);
    int nextVal = 1;
    for(int r=0; r < 2; ++r) {
      assertEquals("row " + r, nextVal++, ((LongColumnVector) batch.cols[0]).vector[r]);
      StructColumnVector b = (StructColumnVector) batch.cols[1];
      assertEquals("row " + r, nextVal++, ((LongColumnVector) b.fields[0]).vector[r]);
      assertEquals("row " + r, nextVal++, ((LongColumnVector) b.fields[1]).vector[r]);
      assertEquals("row " + r, nextVal++, ((LongColumnVector) batch.cols[2]).vector[r]);
    }
    assertEquals(false, reader.nextBatch(batch));
  }
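
  // Values near Integer.MAX_VALUE/MIN_VALUE and Long.MAX_VALUE/MIN_VALUE must
  // survive conversion without overflow or truncation.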
  @Test
  public void testLargeNumbers() throws Exception {
    StringReader input = new StringReader(
        "2147483646,-2147483647,9223372036854775806,-9223372036854775807\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:int,b:int,d:bigint,e:bigint>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "null", DEFAULT_TIMESTAMP_FORMAT);
    VectorizedRowBatch batch = schema.createRowBatch();
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(1, batch.size);
    assertEquals(2147483646, ((LongColumnVector) batch.cols[0]).vector[0]);
    assertEquals(-2147483647, ((LongColumnVector) batch.cols[1]).vector[0]);
    assertEquals(9223372036854775806L, ((LongColumnVector) batch.cols[2]).vector[0]);
    assertEquals(-9223372036854775807L, ((LongColumnVector) batch.cols[3]).vector[0]);
    assertEquals(false, reader.nextBatch(batch));
  }
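
  // A user-supplied timestamp pattern (here with an optional second day
  // digit, d[d]) is passed to CsvReader in place of DEFAULT_TIMESTAMP_FORMAT.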
  @Test
  public void testCustomTimestampFormat() throws Exception {
    String tsFormat = "d[d] MMM yyyy HH:mm:ss.SSSSSS";
    StringReader input = new StringReader(
        "'21 Mar 2018 12:23:34.123456'\n" +
        "'3 Feb 2018 18:04:51.456789'\n"
    );
    TypeDescription schema = TypeDescription.fromString(
        "struct<a:timestamp>");
    RecordReader reader = new CsvReader(input, null, 1, schema, ',', '\'',
        '\\', 0, "", tsFormat);
    VectorizedRowBatch batch = schema.createRowBatch(2);
    assertEquals(true, reader.nextBatch(batch));
    assertEquals(2, batch.size);
    TimestampColumnVector cv = (TimestampColumnVector) batch.cols[0];
    assertEquals("2018-03-21 12:23:34.123456", cv.asScratchTimestamp(0).toString());
    assertEquals("2018-02-03 18:04:51.456789", cv.asScratchTimestamp(1).toString());
  }
}