blob: 809d6f3b64264240d761cada1f64bd50ebf9ae6d [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#include "orc/OrcFile.hh"
#include "Adaptor.hh"
#include "ToolTest.hh"
#include "wrap/gmock.h"
#include "wrap/gtest-wrapper.h"
// Verifies the raw file-tail dump that orc-metadata prints for
// orc_split_elim.orc. The short (-r) and long (--raw) spellings of the option
// are both exercised; each run must exit with status 0, print exactly the
// expected text on stdout and leave stderr empty.
TEST(TestFileMetadata, testRaw) {
  const std::string toolPath = findProgram("tools/src/orc-metadata");
  const std::string orcFile = findExample("orc_split_elim.orc");
  // The dump starts with the example file's path, followed by the postscript,
  // the footer (stripes, types, column statistics) and the trailing lengths.
  const std::string expectedDump = "Raw file tail: " + orcFile +
      "\n"
      "postscript {\n"
      " footerLength: 288\n"
      " compression: NONE\n"
      " version: 0\n"
      " version: 12\n"
      " metadataLength: 526\n"
      " magic: \"ORC\"\n"
      "}\n"
      "footer {\n"
      " headerLength: 3\n"
      " contentLength: 245568\n"
      " stripes {\n"
      " offset: 3\n"
      " indexLength: 137\n"
      " dataLength: 45282\n"
      " footerLength: 149\n"
      " numberOfRows: 5000\n"
      " }\n"
      " stripes {\n"
      " offset: 45571\n"
      " indexLength: 137\n"
      " dataLength: 45282\n"
      " footerLength: 149\n"
      " numberOfRows: 5000\n"
      " }\n"
      " stripes {\n"
      " offset: 91139\n"
      " indexLength: 137\n"
      " dataLength: 45282\n"
      " footerLength: 149\n"
      " numberOfRows: 5000\n"
      " }\n"
      " stripes {\n"
      " offset: 136707\n"
      " indexLength: 138\n"
      " dataLength: 45283\n"
      " footerLength: 149\n"
      " numberOfRows: 5000\n"
      " }\n"
      " stripes {\n"
      " offset: 200000\n"
      " indexLength: 137\n"
      " dataLength: 45282\n"
      " footerLength: 149\n"
      " numberOfRows: 5000\n"
      " }\n"
      " types {\n"
      " kind: STRUCT\n"
      " subtypes: 1\n"
      " subtypes: 2\n"
      " subtypes: 3\n"
      " subtypes: 4\n"
      " subtypes: 5\n"
      " fieldNames: \"userid\"\n"
      " fieldNames: \"string1\"\n"
      " fieldNames: \"subtype\"\n"
      " fieldNames: \"decimal1\"\n"
      " fieldNames: \"ts\"\n"
      " }\n"
      " types {\n"
      " kind: LONG\n"
      " }\n"
      " types {\n"
      " kind: STRING\n"
      " }\n"
      " types {\n"
      " kind: DOUBLE\n"
      " }\n"
      " types {\n"
      " kind: DECIMAL\n"
      " }\n"
      " types {\n"
      " kind: TIMESTAMP\n"
      " }\n"
      " numberOfRows: 25000\n"
      " statistics {\n"
      " numberOfValues: 25000\n"
      " }\n"
      " statistics {\n"
      " numberOfValues: 25000\n"
      " intStatistics {\n"
      " minimum: 2\n"
      " maximum: 100\n"
      " sum: 2499619\n"
      " }\n"
      " }\n"
      " statistics {\n"
      " numberOfValues: 25000\n"
      " stringStatistics {\n"
      " minimum: \"bar\"\n"
      " maximum: \"zebra\"\n"
      " sum: 124990\n"
      " }\n"
      " }\n"
      " statistics {\n"
      " numberOfValues: 25000\n"
      " doubleStatistics {\n"
      " minimum: 0.8\n"
      " maximum: 80\n"
      " sum: 200051.40000000002\n"
      " }\n"
      " }\n"
      " statistics {\n"
      " numberOfValues: 25000\n"
      " decimalStatistics {\n"
      " minimum: \"0\"\n"
      " maximum: \"5.5\"\n"
      " sum: \"16.6\"\n"
      " }\n"
      " }\n"
      " statistics {\n"
      " numberOfValues: 25000\n"
      " }\n"
      " rowIndexStride: 10000\n"
      "}\n"
      "fileLength: 246402\n"
      "postscriptLength: 19\n";

  // The capture buffers live outside the loop so that both invocations hand
  // the very same string objects to runProgram.
  std::string capturedOut;
  std::string capturedErr;
  for (const std::string& rawFlag : {std::string("-r"), std::string("--raw")}) {
    EXPECT_EQ(0, runProgram({toolPath, rawFlag, orcFile}, capturedOut, capturedErr));
    EXPECT_EQ(expectedDump, capturedOut);
    EXPECT_EQ("", capturedErr);
  }
}
// Verifies the default (JSON) report that orc-metadata prints for
// orc_split_elim.orc when it is invoked with the file path as its only
// argument: exit status 0, exact stdout match, empty stderr.
TEST(TestFileMetadata, testJson) {
  const std::string toolPath = findProgram("tools/src/orc-metadata");
  const std::string orcFile = findExample("orc_split_elim.orc");
  // The report echoes the file path in its "name" field, so the expected text
  // is assembled around the path returned by findExample.
  const std::string expectedJson =
      "{ \"name\": \"" + orcFile +
      "\",\n"
      " \"type\": "
      "\"struct<userid:bigint,string1:string,subtype:double,decimal1:decimal(0,0),ts:timestamp>\","
      "\n"
      " \"attributes\": {},\n"
      " \"rows\": 25000,\n"
      " \"stripe count\": 5,\n"
      " \"format\": \"0.12\", \"writer version\": \"original\", \"software version\": \"ORC "
      "Java\",\n"
      " \"compression\": \"none\",\n"
      " \"file length\": 246402,\n"
      " \"content\": 245568, \"stripe stats\": 526, \"footer\": 288, \"postscript\": 19,\n"
      " \"row index stride\": 10000,\n"
      " \"user metadata\": {\n"
      " },\n"
      " \"stripes\": [\n"
      " { \"stripe\": 0, \"rows\": 5000,\n"
      " \"offset\": 3, \"length\": 45568,\n"
      " \"index\": 137, \"data\": 45282, \"footer\": 149\n"
      " },\n"
      " { \"stripe\": 1, \"rows\": 5000,\n"
      " \"offset\": 45571, \"length\": 45568,\n"
      " \"index\": 137, \"data\": 45282, \"footer\": 149\n"
      " },\n"
      " { \"stripe\": 2, \"rows\": 5000,\n"
      " \"offset\": 91139, \"length\": 45568,\n"
      " \"index\": 137, \"data\": 45282, \"footer\": 149\n"
      " },\n"
      " { \"stripe\": 3, \"rows\": 5000,\n"
      " \"offset\": 136707, \"length\": 45570,\n"
      " \"index\": 138, \"data\": 45283, \"footer\": 149\n"
      " },\n"
      " { \"stripe\": 4, \"rows\": 5000,\n"
      " \"offset\": 200000, \"length\": 45568,\n"
      " \"index\": 137, \"data\": 45282, \"footer\": 149\n"
      " }\n"
      " ]\n"
      "}\n";

  std::string capturedOut;
  std::string capturedErr;
  const auto exitStatus = runProgram({toolPath, orcFile}, capturedOut, capturedErr);
  EXPECT_EQ(0, exitStatus);
  EXPECT_EQ(expectedJson, capturedOut);
  EXPECT_EQ("", capturedErr);
}
// Verifies the JSON report that orc-metadata prints for orc_no_format.orc.
// The expected report lists the format as "0.11" with zlib compression; the
// run must exit with status 0, match stdout exactly and leave stderr empty.
TEST(TestFileMetadata, testNoFormat) {
  const std::string pgm = findProgram("tools/src/orc-metadata");
  const std::string file = findExample("orc_no_format.orc");
  // The report echoes the file path in its "name" field, so the expected text
  // is assembled around the path returned by findExample.
  const std::string expected =
      "{ \"name\": \"" + file +
      "\",\n"
      " \"type\": "
      "\"struct<_col0:array<string>,_col1:map<int,string>,_col2:struct<name:string,score:int>>\",\n"
      " \"attributes\": {},\n"
      " \"rows\": 5,\n"
      " \"stripe count\": 1,\n"
      " \"format\": \"0.11\", \"writer version\": \"original\", \"software version\": \"ORC "
      "Java\",\n"
      " \"compression\": \"zlib\", \"compression block\": 262144,\n"
      " \"file length\": 745,\n"
      " \"content\": 525, \"stripe stats\": 0, \"footer\": 210, \"postscript\": 9,\n"
      " \"row index stride\": 10000,\n"
      " \"user metadata\": {\n"
      " },\n"
      " \"stripes\": [\n"
      " { \"stripe\": 0, \"rows\": 5,\n"
      " \"offset\": 3, \"length\": 522,\n"
      " \"index\": 224, \"data\": 187, \"footer\": 111\n"
      " }\n"
      " ]\n"
      "}\n";
  std::string output;
  std::string error;
  // EXPECT_EQ prints both operands on a mismatch, so the expected text is not
  // echoed to stdout separately.
  EXPECT_EQ(0, runProgram({pgm, file}, output, error));
  EXPECT_EQ(expected, output);
  EXPECT_EQ("", error);
}
// Verifies orc-metadata against decimal64_v2.orc, whose expected report gives
// the format as "UNSTABLE-PRE-2.0". The tool is expected to exit with status
// 0, print the JSON report on stdout and emit exactly one warning line about
// the unknown format version on stderr.
TEST(TestFileMetadata, testV2Format) {
  const std::string toolPath = findProgram("tools/src/orc-metadata");
  const std::string orcFile = findExample("decimal64_v2.orc");
  // Both the report and the warning embed the file path returned by
  // findExample, so each expected text is assembled around it.
  const std::string wantStdout =
      "{ \"name\": \"" + orcFile +
      "\",\n"
      " \"type\": "
      "\"struct<a:bigint,b:decimal(12,0),c:decimal(20,2),d:decimal(12,2),e:decimal(2,2)>\",\n"
      " \"attributes\": {},\n"
      " \"rows\": 10,\n"
      " \"stripe count\": 1,\n"
      " \"format\": \"UNSTABLE-PRE-2.0\", \"writer version\": \"ORC-135\", \"software version\": "
      "\"ORC Java\",\n"
      " \"compression\": \"zlib\", \"compression block\": 262144,\n"
      " \"file length\": 738,\n"
      " \"content\": 377, \"stripe stats\": 130, \"footer\": 204, \"postscript\": 26,\n"
      " \"row index stride\": 10000,\n"
      " \"user metadata\": {\n"
      " },\n"
      " \"stripes\": [\n"
      " { \"stripe\": 0, \"rows\": 10,\n"
      " \"offset\": 3, \"length\": 374,\n"
      " \"index\": 192, \"data\": 112, \"footer\": 70\n"
      " }\n"
      " ]\n"
      "}\n";
  const std::string wantStderr =
      "Warning: ORC file " + orcFile + " was written in an unknown format version UNSTABLE-PRE-2.0\n";

  std::string capturedOut;
  std::string capturedErr;
  const auto exitStatus = runProgram({toolPath, orcFile}, capturedOut, capturedErr);
  // Attach the captured stderr so a non-zero exit status is easier to diagnose.
  EXPECT_EQ(0, exitStatus) << capturedErr;
  EXPECT_EQ(wantStdout, capturedOut);
  EXPECT_EQ(wantStderr, capturedErr);
}
// Verifies the JSON report that orc-metadata prints for
// complextypes_iceberg.orc, including the per-column "attributes" section
// (iceberg.id / iceberg.required / iceberg.long-type entries keyed by column
// path). The run must exit with status 0, match stdout exactly and leave
// stderr empty.
TEST(TestFileMetadata, testAttributes) {
  const std::string pgm = findProgram("tools/src/orc-metadata");
  const std::string file = findExample("complextypes_iceberg.orc");
  // The report echoes the file path in its "name" field, so the expected text
  // is assembled around the path returned by findExample.
  const std::string expected =
      "{ \"name\": \"" + file +
      "\",\n"
      " \"type\": "
      "\"struct<id:bigint,int_array:array<int>,int_array_array:array<array<int>>,int_map:map<"
      "string,int>,int_map_array:array<map<string,int>>,nested_struct:struct<a:int,b:array<int>,c:"
      "struct<d:array<array<struct<e:int,f:string>>>>,g:map<string,struct<h:struct<i:array<double>>"
      ">>>>\",\n"
      " \"attributes\": {\n"
      " \"id\": {\"iceberg.id\": \"1\", \"iceberg.long-type\": \"LONG\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"int_array\": {\"iceberg.id\": \"2\", \"iceberg.required\": \"false\"},\n"
      " \"int_array._elem\": {\"iceberg.id\": \"7\", \"iceberg.required\": \"false\"},\n"
      " \"int_array_array\": {\"iceberg.id\": \"3\", \"iceberg.required\": \"false\"},\n"
      " \"int_array_array._elem\": {\"iceberg.id\": \"8\", \"iceberg.required\": \"false\"},\n"
      " \"int_array_array._elem._elem\": {\"iceberg.id\": \"9\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"int_map\": {\"iceberg.id\": \"4\", \"iceberg.required\": \"false\"},\n"
      " \"int_map._key\": {\"iceberg.id\": \"10\", \"iceberg.required\": \"true\"},\n"
      " \"int_map._value\": {\"iceberg.id\": \"11\", \"iceberg.required\": \"false\"},\n"
      " \"int_map_array\": {\"iceberg.id\": \"5\", \"iceberg.required\": \"false\"},\n"
      " \"int_map_array._elem\": {\"iceberg.id\": \"12\", \"iceberg.required\": \"false\"},\n"
      " \"int_map_array._elem._key\": {\"iceberg.id\": \"13\", \"iceberg.required\": "
      "\"true\"},\n"
      " \"int_map_array._elem._value\": {\"iceberg.id\": \"14\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct\": {\"iceberg.id\": \"6\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.a\": {\"iceberg.id\": \"15\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.b\": {\"iceberg.id\": \"16\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.b._elem\": {\"iceberg.id\": \"19\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.c\": {\"iceberg.id\": \"17\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.c.d\": {\"iceberg.id\": \"20\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.c.d._elem\": {\"iceberg.id\": \"21\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct.c.d._elem._elem\": {\"iceberg.id\": \"22\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct.c.d._elem._elem.e\": {\"iceberg.id\": \"23\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct.c.d._elem._elem.f\": {\"iceberg.id\": \"24\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct.g\": {\"iceberg.id\": \"18\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.g._key\": {\"iceberg.id\": \"25\", \"iceberg.required\": \"true\"},\n"
      " \"nested_struct.g._value\": {\"iceberg.id\": \"26\", \"iceberg.required\": \"false\"},\n"
      " \"nested_struct.g._value.h\": {\"iceberg.id\": \"27\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct.g._value.h.i\": {\"iceberg.id\": \"28\", \"iceberg.required\": "
      "\"false\"},\n"
      " \"nested_struct.g._value.h.i._elem\": {\"iceberg.id\": \"29\", \"iceberg.required\": "
      "\"false\"}},\n"
      " \"rows\": 1,\n"
      " \"stripe count\": 1,\n"
      " \"format\": \"0.12\", \"writer version\": \"ORC-14\", \"software version\": \"ORC "
      "Java\",\n"
      " \"compression\": \"zlib\", \"compression block\": 131072,\n"
      " \"file length\": 1734,\n"
      " \"content\": 1006, \"stripe stats\": 167, \"footer\": 535, \"postscript\": 25,\n"
      " \"row index stride\": 10000,\n"
      " \"user metadata\": {\n"
      " },\n"
      " \"stripes\": [\n"
      " { \"stripe\": 0, \"rows\": 1,\n"
      " \"offset\": 3, \"length\": 1003,\n"
      " \"index\": 679, \"data\": 150, \"footer\": 174\n"
      " }\n"
      " ]\n"
      "}\n";
  std::string output;
  std::string error;
  // EXPECT_EQ prints both operands on a mismatch, so the expected text is not
  // echoed to stdout separately.
  EXPECT_EQ(0, runProgram({pgm, file}, output, error));
  EXPECT_EQ(expected, output);
  EXPECT_EQ("", error);
}