blob: c0b7d4487ced8d0b4f3fd0eef1aceecd057e11fb [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* http://www.apache.org/licenses/LICENSE-2.0
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.orc;
import static junit.framework.Assert.assertEquals;
import java.io.File;
import java.sql.Timestamp;
import java.text.DateFormat;
import java.text.SimpleDateFormat;
import java.util.Arrays;
import java.util.Collection;
import java.util.List;
import java.util.TimeZone;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.hive.ql.exec.vector.TimestampColumnVector;
import org.apache.hadoop.hive.ql.exec.vector.VectorizedRowBatch;
import org.apache.hadoop.hive.ql.io.sarg.PredicateLeaf;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgument;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentFactory;
import org.apache.hadoop.hive.ql.io.sarg.SearchArgumentImpl;
import org.apache.orc.impl.OrcIndex;
import org.apache.orc.impl.RecordReaderImpl;
import org.apache.orc.impl.SerializationUtils;
import org.apache.orc.util.BloomFilter;
import org.apache.orc.util.BloomFilterIO;
import org.junit.After;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Rule;
import org.junit.Test;
import org.junit.rules.TestName;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
import com.google.common.collect.Lists;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
/**
*
*/
@RunWith(Parameterized.class)
public class TestOrcTimezonePPD {
private static final Logger LOG = LoggerFactory.getLogger(TestOrcTimezonePPD.class);
Path workDir = new Path(System.getProperty("test.tmp.dir",
"target" + File.separator + "test" + File.separator + "tmp"));
Configuration conf;
FileSystem fs;
Path testFilePath;
String writerTimeZone;
String readerTimeZone;
static TimeZone defaultTimeZone = TimeZone.getDefault();
TimeZone utcTz = TimeZone.getTimeZone("UTC");
DateFormat dateFormat = new SimpleDateFormat("yyyy-MM-dd HH:mm:ss");
public TestOrcTimezonePPD(String writerTZ, String readerTZ) {
this.writerTimeZone = writerTZ;
this.readerTimeZone = readerTZ;
}
@Parameterized.Parameters
public static Collection<Object[]> data() {
List<Object[]> result = Arrays.asList(new Object[][]{
{"US/Eastern", "America/Los_Angeles"},
{"US/Eastern", "UTC"},
/* Extreme timezones */
{"GMT-12:00", "GMT+14:00"},
/* No difference in DST */
{"America/Los_Angeles", "America/Los_Angeles"}, /* same timezone both with DST */
{"Europe/Berlin", "Europe/Berlin"}, /* same as above but europe */
{"America/Phoenix", "Asia/Kolkata"} /* Writer no DST, Reader no DST */,
{"Europe/Berlin", "America/Los_Angeles"} /* Writer DST, Reader DST */,
{"Europe/Berlin", "America/Chicago"} /* Writer DST, Reader DST */,
/* With DST difference */
{"Europe/Berlin", "UTC"},
{"UTC", "Europe/Berlin"} /* Writer no DST, Reader DST */,
{"America/Los_Angeles", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
{"Europe/Berlin", "Asia/Kolkata"} /* Writer DST, Reader no DST */,
/* Timezone offsets for the reader has changed historically */
{"Asia/Saigon", "Pacific/Enderbury"},
{"UTC", "Asia/Jerusalem"},
});
return result;
}
@Rule
public TestName testCaseName = new TestName();
@Before
public void openFileSystem() throws Exception {
conf = new Configuration();
fs = FileSystem.getLocal(conf);
testFilePath = new Path(workDir, "TestOrcFile." +
testCaseName.getMethodName() + ".orc");
fs.delete(testFilePath, false);
}
@After
public void restoreTimeZone() {
TimeZone.setDefault(defaultTimeZone);
}
public static PredicateLeaf createPredicateLeaf(PredicateLeaf.Operator operator,
PredicateLeaf.Type type,
String columnName,
Object literal,
List<Object> literalList) {
return new SearchArgumentImpl.PredicateLeafImpl(operator, type, columnName,
literal, literalList);
}
@Test
public void testTimestampPPDMinMax() throws Exception {
TypeDescription schema = TypeDescription.createTimestamp();
TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
.bufferSize(10000));
assertEquals(writerTimeZone, TimeZone.getDefault().getID());
List<String> ts = Lists.newArrayList();
ts.add("2007-08-01 00:00:00.0");
ts.add("2007-08-01 04:00:00.0");
VectorizedRowBatch batch = schema.createRowBatch();
TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
for (String t : ts) {
times.set(batch.size++, Timestamp.valueOf(t));
}
writer.addRowBatch(batch);
writer.close();
TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(readerTimeZone, TimeZone.getDefault().getID());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
times = (TimestampColumnVector) batch.cols[0];
int idx = 0;
while (rows.nextBatch(batch)) {
for (int r = 0; r < batch.size; ++r) {
assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
}
}
rows.close();
ColumnStatistics[] colStats = reader.getStatistics();
Timestamp gotMin = ((TimestampColumnStatistics) colStats[0]).getMinimum();
assertEquals("2007-08-01 00:00:00.0", gotMin.toString());
Timestamp gotMax = ((TimestampColumnStatistics) colStats[0]).getMaximum();
assertEquals("2007-08-01 04:00:00.0", gotMax.toString());
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[0],
SearchArgumentFactory.newBuilder().equals
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0")).build().getLeaves().get(0),
null));
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[0],
SearchArgumentFactory.newBuilder().equals
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0")).build().getLeaves().get(0),
null));
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[0],
SearchArgumentFactory.newBuilder().between
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 05:00:00.0"),
Timestamp.valueOf("2007-08-01 06:00:00.0")).build().getLeaves().get(0),
null));
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[0],
SearchArgumentFactory.newBuilder().between
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0"),
Timestamp.valueOf("2007-08-01 03:00:00.0")).build().getLeaves().get(0),
null));
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[0],
SearchArgumentFactory.newBuilder().in
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0"),
Timestamp.valueOf("2007-08-01 03:00:00.0")).build().getLeaves().get(0),
null));
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[0],
SearchArgumentFactory.newBuilder().in
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0"),
Timestamp.valueOf("2007-08-02 03:00:00.0")).build().getLeaves().get(0),
null));
}
static OrcProto.ColumnEncoding buildEncoding() {
OrcProto.ColumnEncoding.Builder result =
OrcProto.ColumnEncoding.newBuilder();
result.setKind(OrcProto.ColumnEncoding.Kind.DIRECT)
.setBloomEncoding(BloomFilterIO.Encoding.UTF8_UTC.getId());
return result.build();
}
@Test
public void testTimestampPPDBloomFilter() throws Exception {
LOG.info("Writer = " + writerTimeZone + " reader = " + readerTimeZone);
TypeDescription schema = TypeDescription.createStruct().addField("ts", TypeDescription.createTimestamp());
TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
.bufferSize(10000).bloomFilterColumns("ts").writerVersion(OrcFile.WriterVersion.ORC_101));
assertEquals(writerTimeZone, TimeZone.getDefault().getID());
List<String> ts = Lists.newArrayList();
ts.add("2007-08-01 00:00:00.0");
ts.add("2007-08-01 04:00:00.0");
VectorizedRowBatch batch = schema.createRowBatch();
TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
for (String t : ts) {
times.set(batch.size++, Timestamp.valueOf(t));
}
writer.addRowBatch(batch);
writer.close();
TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(readerTimeZone, TimeZone.getDefault().getID());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
times = (TimestampColumnVector) batch.cols[0];
int idx = 0;
while (rows.nextBatch(batch)) {
for (int r = 0; r < batch.size; ++r) {
assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
}
}
boolean[] sargColumns = new boolean[2];
Arrays.fill(sargColumns, true);
OrcIndex indices = ((RecordReaderImpl) rows).readRowIndex(0, null, sargColumns);
rows.close();
ColumnStatistics[] colStats = reader.getStatistics();
Timestamp gotMin = ((TimestampColumnStatistics) colStats[1]).getMinimum();
assertEquals("2007-08-01 00:00:00.0", gotMin.toString());
Timestamp gotMax = ((TimestampColumnStatistics) colStats[1]).getMaximum();
assertEquals("2007-08-01 04:00:00.0", gotMax.toString());
OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
OrcProto.BloomFilter bloomFilter = bloomFilterIndices[1].getBloomFilter(0);
BloomFilter bf = BloomFilterIO.deserialize(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8,
buildEncoding(), reader.getWriterVersion(),
TypeDescription.Category.TIMESTAMP, bloomFilter);
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1],
SearchArgumentFactory.newBuilder().equals
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0")).build().getLeaves().get(0),
bf));
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1],
SearchArgumentFactory.newBuilder().equals
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0")).build().getLeaves().get(0),
bf));
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1],
SearchArgumentFactory.newBuilder().in
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-01 00:00:00.0"),
Timestamp.valueOf("2007-08-01 03:00:00.0")).build().getLeaves().get(0),
bf));
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1],
SearchArgumentFactory.newBuilder().in
("c", PredicateLeaf.Type.TIMESTAMP, Timestamp.valueOf("2007-08-02 00:00:00.0"),
Timestamp.valueOf("2007-08-02 03:00:00.0")).build().getLeaves().get(0),
bf));
}
@Test
public void testTimestampMinMaxAndBloomFilter() throws Exception {
TypeDescription schema = TypeDescription.createStruct().addField("ts", TypeDescription.createTimestamp());
TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
.bufferSize(10000).bloomFilterColumns("ts"));
assertEquals(writerTimeZone, TimeZone.getDefault().getID());
List<String> ts = Lists.newArrayList();
ts.add("2007-08-01 00:00:00.0");
ts.add("2007-08-01 04:00:00.0");
VectorizedRowBatch batch = schema.createRowBatch();
TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
for (String t : ts) {
times.set(batch.size++, Timestamp.valueOf(t));
}
writer.addRowBatch(batch);
writer.close();
TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(readerTimeZone, TimeZone.getDefault().getID());
RecordReader rows = reader.rows();
batch = reader.getSchema().createRowBatch();
times = (TimestampColumnVector) batch.cols[0];
int idx = 0;
while (rows.nextBatch(batch)) {
for (int r = 0; r < batch.size; ++r) {
assertEquals(ts.get(idx++), times.asScratchTimestamp(r).toString());
}
}
boolean[] sargColumns = new boolean[2];
Arrays.fill(sargColumns, true);
OrcIndex indices = ((RecordReaderImpl) rows).readRowIndex(0, null, sargColumns);
rows.close();
ColumnStatistics[] colStats = reader.getStatistics();
Timestamp gotMin = ((TimestampColumnStatistics) colStats[1]).getMinimum();
assertEquals("2007-08-01 00:00:00.0", gotMin.toString());
Timestamp gotMax = ((TimestampColumnStatistics) colStats[1]).getMaximum();
assertEquals("2007-08-01 04:00:00.0", gotMax.toString());
OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
OrcProto.BloomFilter bloomFilter = bloomFilterIndices[1].getBloomFilter(0);
BloomFilter bf = BloomFilterIO.deserialize(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8,
buildEncoding(), reader.getWriterVersion(),
TypeDescription.Category.TIMESTAMP, bloomFilter);
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
pred = createPredicateLeaf(PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
Timestamp.valueOf("2007-08-01 02:00:00.0"), null);
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
bf.addLong(SerializationUtils.convertToUtc(TimeZone.getDefault(),
Timestamp.valueOf("2007-08-01 02:00:00.0").getTime()));
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN, PredicateLeaf.Type.TIMESTAMP, "x",
Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
pred = createPredicateLeaf(PredicateLeaf.Operator.LESS_THAN_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
Assert.assertEquals(SearchArgument.TruthValue.YES_NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
pred = createPredicateLeaf(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.TIMESTAMP, "x", null, null);
Assert.assertEquals(SearchArgument.TruthValue.NO, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
}
@Test
public void testTimestampAllNulls() throws Exception {
TypeDescription schema = TypeDescription.createStruct().addField("ts", TypeDescription.createTimestamp());
TimeZone.setDefault(TimeZone.getTimeZone(writerTimeZone));
Writer writer = OrcFile.createWriter(testFilePath,
OrcFile.writerOptions(conf).setSchema(schema).stripeSize(100000)
.bufferSize(10000).bloomFilterColumns("ts"));
assertEquals(writerTimeZone, TimeZone.getDefault().getID());
VectorizedRowBatch batch = schema.createRowBatch();
TimestampColumnVector times = (TimestampColumnVector) batch.cols[0];
for (int i = 0; i < 3; i++) {
times.set(batch.size++, null);
}
writer.addRowBatch(batch);
writer.close();
TimeZone.setDefault(TimeZone.getTimeZone(readerTimeZone));
Reader reader = OrcFile.createReader(testFilePath,
OrcFile.readerOptions(conf).filesystem(fs));
assertEquals(readerTimeZone, TimeZone.getDefault().getID());
RecordReader rows = reader.rows();
boolean[] sargColumns = new boolean[2];
Arrays.fill(sargColumns, true);
OrcIndex indices = ((RecordReaderImpl) rows).readRowIndex(0, null, sargColumns);
rows.close();
ColumnStatistics[] colStats = reader.getStatistics();
Timestamp gotMin = ((TimestampColumnStatistics) colStats[1]).getMinimum();
Assert.assertNull(gotMin);
Timestamp gotMax = ((TimestampColumnStatistics) colStats[1]).getMaximum();
Assert.assertNull(gotMax);
OrcProto.BloomFilterIndex[] bloomFilterIndices = indices.getBloomFilterIndex();
OrcProto.BloomFilter bloomFilter = bloomFilterIndices[1].getBloomFilter(0);
BloomFilter bf = BloomFilterIO.deserialize(OrcProto.Stream.Kind.BLOOM_FILTER_UTF8,
buildEncoding(), reader.getWriterVersion(),
TypeDescription.Category.TIMESTAMP, bloomFilter);
PredicateLeaf pred = createPredicateLeaf(
PredicateLeaf.Operator.NULL_SAFE_EQUALS, PredicateLeaf.Type.TIMESTAMP, "x",
Timestamp.valueOf("2007-08-01 00:00:00.0"), null);
Assert.assertEquals(SearchArgument.TruthValue.NULL, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
pred = createPredicateLeaf(PredicateLeaf.Operator.IS_NULL, PredicateLeaf.Type.TIMESTAMP, "x", null, null);
Assert.assertEquals(SearchArgument.TruthValue.YES, RecordReaderImpl.evaluatePredicate(colStats[1], pred, bf));
}
}