HAWQ-1692. Add the ORC format implementation
diff --git a/depends/build-all.sh b/depends/build-all.sh
index 627c777..29d7782 100755
--- a/depends/build-all.sh
+++ b/depends/build-all.sh
@@ -118,12 +118,15 @@
echo "Delete headers in ${PREFIX}/include ..."
rm -rf $PREFIX/include/dbcommon
rm -rf $PREFIX/include/univplan
+rm -rf $PREFIX/include/storage
echo "Done."
echo "Delete libs in ${PREFIX}/lib ..."
rm -rf $PREFIX/lib/libdbcommon*
rm -rf $PREFIX/lib/libunivplan*
+rm -rf $PREFIX/lib/libstorage*
echo "Done."
build dbcommon
build univplan
+build storage
diff --git a/depends/storage/.gitignore b/depends/storage/.gitignore
new file mode 100644
index 0000000..a9b913a
--- /dev/null
+++ b/depends/storage/.gitignore
@@ -0,0 +1,8 @@
+.DS_Store
+.cproject
+.project
+.settings
+.pydevproject
+*.pyc
+build/
+CodeCoverageReport/
diff --git a/depends/storage/CMake/CMakeTestCompileInt64tType.cc b/depends/storage/CMake/CMakeTestCompileInt64tType.cc
new file mode 100644
index 0000000..ad2fc8e
--- /dev/null
+++ b/depends/storage/CMake/CMakeTestCompileInt64tType.cc
@@ -0,0 +1,30 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cstdint>
+
+// test if int64_t is typedef to be long long
+
+void test(long long *i) {} // NOLINT
+
+int main() {
+ int64_t i = 0;
+ test(&i);
+ return 0;
+}
diff --git a/depends/storage/CMake/FindCogapp.cmake b/depends/storage/CMake/FindCogapp.cmake
new file mode 100644
index 0000000..198c23b
--- /dev/null
+++ b/depends/storage/CMake/FindCogapp.cmake
@@ -0,0 +1,50 @@
+# locate cogapp and generate source code from template
+#
+# find_package(Cogapp REQUIRED)
+#
+# COGAPP_GENERATE (public function)
+# GENERATED_CODE = Variable to define with generated source files.
+# TEMPLATE = Template used to generate source files.
+#
+# NOTE: The COGAPP_GENERATE function and the add_executable() or add_library()
+# call that consumes its output only work properly within the same directory.
+#
+
+find_package(PythonInterp REQUIRED)
+
+function(COGAPP_GENERATE GENERATED_CODE)
+  # At least one template must follow the output variable name.
+  if(NOT ARGN)
+    message(SEND_ERROR "Error: COGAPP_GENERATE() called without any template files")
+    return()
+  endif()
+  set(${GENERATED_CODE})
+  foreach(_cog_tmpl ${ARGN})
+    get_filename_component(_cog_abs ${_cog_tmpl} ABSOLUTE)
+    file(RELATIVE_PATH _cog_rel ${CMAKE_SOURCE_DIR} ${_cog_abs})
+    # Mirror the template's source-tree location under <build>/codegen.
+    set(_cog_mirror "${CMAKE_BINARY_DIR}/codegen/${_cog_rel}")
+    get_filename_component(_cog_dir ${_cog_mirror} DIRECTORY)
+    file(MAKE_DIRECTORY ${_cog_dir})
+    get_filename_component(_cog_stem "${_cog_mirror}" NAME_WE)
+    get_filename_component(_cog_ext "${_cog_mirror}" EXT)
+    # The output keeps the template's name with ".cg" inserted before the extension.
+    set(_cog_out "${_cog_dir}/${_cog_stem}.cg${_cog_ext}")
+    list(APPEND ${GENERATED_CODE} ${_cog_out})
+
+    if(NOT EXISTS ${_cog_abs})
+      message(FATAL_ERROR "file ${_cog_abs} does not exist")
+    endif()
+    # The rule depends on the template, so the output is regenerated when it changes.
+    add_custom_command(
+      OUTPUT ${_cog_out}
+      COMMAND ${PYTHON_EXECUTABLE}
+      ARGS -m cogapp -d -o ${_cog_out} ${_cog_abs}
+      DEPENDS ${_cog_abs}
+      COMMENT "Running cog on ${_cog_tmpl}"
+      VERBATIM)
+  endforeach()
+  # Mark the outputs as generated and return the list to the caller's scope.
+  set_source_files_properties(${${GENERATED_CODE}} PROPERTIES GENERATED TRUE)
+  set(${GENERATED_CODE} ${${GENERATED_CODE}} PARENT_SCOPE)
+endfunction()
\ No newline at end of file
diff --git a/depends/storage/CMake/FindGFlags.cmake b/depends/storage/CMake/FindGFlags.cmake
new file mode 100644
index 0000000..f93c571
--- /dev/null
+++ b/depends/storage/CMake/FindGFlags.cmake
@@ -0,0 +1,48 @@
+# - Try to find GFLAGS
+#
+# The following variables are optionally searched for defaults
+# GFLAGS_ROOT_DIR: Base directory where all GFLAGS components are found
+#
+# The following are set after configuration is done:
+# GFLAGS_FOUND
+# GFLAGS_INCLUDE_DIRS
+# GFLAGS_LIBRARIES
+# GFLAGS_LIBRARY_DIRS (listed for reference; this module does not set it)
+
+include(FindPackageHandleStandardArgs)
+
+set(GFLAGS_ROOT_DIR "" CACHE PATH "Folder contains Gflags")
+
+# We are testing only a couple of files in the include directories
+if(WIN32)
+ find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h
+ PATHS ${GFLAGS_ROOT_DIR}/src/windows)
+else()
+ find_path(GFLAGS_INCLUDE_DIR gflags/gflags.h
+ PATHS ${GFLAGS_ROOT_DIR})
+endif()
+
+if(MSVC)
+ find_library(GFLAGS_LIBRARY_RELEASE
+ NAMES libgflags
+ PATHS ${GFLAGS_ROOT_DIR}
+ PATH_SUFFIXES Release)
+
+ find_library(GFLAGS_LIBRARY_DEBUG
+ NAMES libgflags-debug
+ PATHS ${GFLAGS_ROOT_DIR}
+ PATH_SUFFIXES Debug)
+
+ set(GFLAGS_LIBRARY optimized ${GFLAGS_LIBRARY_RELEASE} debug ${GFLAGS_LIBRARY_DEBUG})
+else()
+ find_library(GFLAGS_LIBRARY gflags)
+endif()
+
+find_package_handle_standard_args(GFLAGS DEFAULT_MSG
+ GFLAGS_INCLUDE_DIR GFLAGS_LIBRARY)
+
+
+if(GFLAGS_FOUND)
+ set(GFLAGS_INCLUDE_DIRS ${GFLAGS_INCLUDE_DIR})
+ set(GFLAGS_LIBRARIES ${GFLAGS_LIBRARY})
+endif()
diff --git a/depends/storage/CMake/FindGlog.cmake b/depends/storage/CMake/FindGlog.cmake
new file mode 100644
index 0000000..d9f0ee0
--- /dev/null
+++ b/depends/storage/CMake/FindGlog.cmake
@@ -0,0 +1,49 @@
+
+# - Try to find Glog
+#
+# The following variables are optionally searched for defaults
+# GLOG_ROOT_DIR: Base directory where all GLOG components are found
+#
+# The following are set after configuration is done:
+# GLOG_FOUND
+# GLOG_INCLUDE_DIRS
+# GLOG_LIBRARIES
+# GLOG_LIBRARY_DIRS (listed for reference; this module does not set it)
+
+include(FindPackageHandleStandardArgs)
+
+set(GLOG_ROOT_DIR "" CACHE PATH "Folder contains Google glog")
+
+if(WIN32)
+ find_path(GLOG_INCLUDE_DIR glog/logging.h
+ PATHS ${GLOG_ROOT_DIR}/src/windows)
+else()
+ find_path(GLOG_INCLUDE_DIR glog/logging.h
+ PATHS ${GLOG_ROOT_DIR})
+endif()
+
+if(MSVC)
+ find_library(GLOG_LIBRARY_RELEASE libglog_static
+ PATHS ${GLOG_ROOT_DIR}
+ PATH_SUFFIXES Release)
+
+ find_library(GLOG_LIBRARY_DEBUG libglog_static
+ PATHS ${GLOG_ROOT_DIR}
+ PATH_SUFFIXES Debug)
+
+ set(GLOG_LIBRARY optimized ${GLOG_LIBRARY_RELEASE} debug ${GLOG_LIBRARY_DEBUG})
+else()
+ find_library(GLOG_LIBRARY glog
+ PATHS ${GLOG_ROOT_DIR}
+ PATH_SUFFIXES
+ lib
+ lib64)
+endif()
+
+find_package_handle_standard_args(GLOG DEFAULT_MSG
+ GLOG_INCLUDE_DIR GLOG_LIBRARY)
+
+if(GLOG_FOUND)
+ set(GLOG_INCLUDE_DIRS ${GLOG_INCLUDE_DIR})
+ set(GLOG_LIBRARIES ${GLOG_LIBRARY})
+endif()
\ No newline at end of file
diff --git a/depends/storage/CMake/FindJSON.cmake b/depends/storage/CMake/FindJSON.cmake
new file mode 100644
index 0000000..a334948
--- /dev/null
+++ b/depends/storage/CMake/FindJSON.cmake
@@ -0,0 +1,38 @@
+# - Find json
+# Find the native JSON headers and libraries.
+#
+# JSON_INCLUDE_DIRS - where to find json/json.h, etc.
+# JSON_LIBRARIES - List of libraries when using json.
+# JSON_FOUND - True if json found.
+
+#=============================================================================
+# Copyright 2006-2009 Kitware, Inc.
+# Copyright 2012 Rolf Eike Beer <eike@sf-mail.de>
+#
+# Distributed under the OSI-approved BSD License (the "License");
+# see accompanying file Copyright.txt for details.
+#
+# This software is distributed WITHOUT ANY WARRANTY; without even the
+# implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+# See the License for more information.
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+# Look for the header file.
+find_path(JSON_INCLUDE_DIR NAMES json/json.h)
+mark_as_advanced(JSON_INCLUDE_DIR)
+
+# Look for the library (sorted from most current/relevant entry to least).
+find_library(JSON_LIBRARY NAMES jsoncpp
+)
+mark_as_advanced(JSON_LIBRARY)
+
+# handle the QUIETLY and REQUIRED arguments and set JSON_FOUND to TRUE if
+# all listed variables are TRUE
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(JSON DEFAULT_MSG JSON_INCLUDE_DIR JSON_LIBRARY)
+
+if(JSON_FOUND)
+ set(JSON_LIBRARIES ${JSON_LIBRARY})
+ set(JSON_INCLUDE_DIRS ${JSON_INCLUDE_DIR})
+endif()
diff --git a/depends/storage/CMake/FindSnappy.cmake b/depends/storage/CMake/FindSnappy.cmake
new file mode 100644
index 0000000..623d2d7
--- /dev/null
+++ b/depends/storage/CMake/FindSnappy.cmake
@@ -0,0 +1,30 @@
+# Variables used by this module, they can change the default behaviour and need
+# to be set before calling find_package:
+#
+# SNAPPY_ROOT_DIR Set this variable to the root installation of
+# Snappy if the module has problems finding
+# the proper installation path.
+#
+# Variables defined by this module:
+#
+# SNAPPY_FOUND System has Snappy libs/headers
+# SNAPPY_LIBRARIES The Snappy libraries
+# SNAPPY_INCLUDE_DIR The location of Snappy headers
+
+find_path(SNAPPY_INCLUDE_DIR
+ NAMES snappy.h
+ HINTS ${SNAPPY_ROOT_DIR}/include)
+
+find_library(SNAPPY_LIBRARIES
+ NAMES snappy
+ HINTS ${SNAPPY_ROOT_DIR}/lib)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(Snappy DEFAULT_MSG
+ SNAPPY_LIBRARIES
+ SNAPPY_INCLUDE_DIR)
+
+mark_as_advanced(
+ SNAPPY_ROOT_DIR
+ SNAPPY_LIBRARIES
+ SNAPPY_INCLUDE_DIR)
diff --git a/depends/storage/CMake/FindZLIB.cmake b/depends/storage/CMake/FindZLIB.cmake
new file mode 100644
index 0000000..ec27957
--- /dev/null
+++ b/depends/storage/CMake/FindZLIB.cmake
@@ -0,0 +1,48 @@
+# - Find zlib
+# Find the native ZLIB headers and libraries.
+#
+# ZLIB_INCLUDE_DIRS - where to find zlib.h, etc.
+# ZLIB_LIBRARIES - List of libraries when using zlib.
+# ZLIB_FOUND - True if zlib found.
+
+#=============================================================================
+# (C) 1995-2017 Jean-loup Gailly and Mark Adler
+#
+# This software is provided 'as-is', without any express or implied
+# warranty. In no event will the authors be held liable for any damages
+# arising from the use of this software.
+#
+# Permission is granted to anyone to use this software for any purpose,
+# including commercial applications, and to alter it and redistribute it
+# freely, subject to the following restrictions:
+#
+# 1. The origin of this software must not be misrepresented; you must not
+# claim that you wrote the original software. If you use this software
+# in a product, an acknowledgment in the product documentation would be
+# appreciated but is not required.
+# 2. Altered source versions must be plainly marked as such, and must not be
+# misrepresented as being the original software.
+# 3. This notice may not be removed or altered from any source distribution.
+#
+# Jean-loup Gailly Mark Adler
+# jloup@gzip.org madler@alumni.caltech.edu
+#=============================================================================
+# (To distribute this file outside of CMake, substitute the full
+# License text for the above reference.)
+
+# Look for the header file.
+find_path(ZLIB_INCLUDE_DIR NAMES zlib.h)
+mark_as_advanced(ZLIB_INCLUDE_DIR)
+
+# Look for the library (sorted from most current/relevant entry to least).
+find_library(ZLIB_LIBRARY NAMES z)
+mark_as_advanced(ZLIB_LIBRARY)
+
+# handle the QUIETLY and REQUIRED arguments and set ZLIB_FOUND to TRUE if
+# all listed variables are TRUE
+FIND_PACKAGE_HANDLE_STANDARD_ARGS(ZLIB DEFAULT_MSG ZLIB_INCLUDE_DIR ZLIB_LIBRARY)
+
+if(ZLIB_FOUND)
+ set(ZLIB_LIBRARIES ${ZLIB_LIBRARY})
+ set(ZLIB_INCLUDE_DIRS ${ZLIB_INCLUDE_DIR})
+endif()
diff --git a/depends/storage/CMake/Functions.cmake b/depends/storage/CMake/Functions.cmake
new file mode 100644
index 0000000..a771b60
--- /dev/null
+++ b/depends/storage/CMake/Functions.cmake
@@ -0,0 +1,46 @@
+# AUTO_SOURCES(<out-var> <pattern> "RECURSE" [<root>]) globs <root> (default ".")
+# plus every subdirectory of it except CMakeFiles; any other third argument is
+# a list of directories globbed non-recursively together with the current one.
+# Entries of the caller's FILTER_OUT variable, if set, are removed from the result.
+FUNCTION(AUTO_SOURCES RETURN_VALUE PATTERN SOURCE_SUBDIRS)
+  IF ("${SOURCE_SUBDIRS}" STREQUAL "RECURSE")
+    SET(PATH ".")
+    IF (${ARGC} EQUAL 4)
+      LIST(GET ARGV 3 PATH)
+    ENDIF ()
+    UNSET(${RETURN_VALUE})
+    FILE(GLOB SUBDIR_FILES "${PATH}/${PATTERN}")
+    LIST(APPEND ${RETURN_VALUE} ${SUBDIR_FILES})
+
+    FILE(GLOB SUBDIRS RELATIVE ${PATH} ${PATH}/*)
+
+    FOREACH(DIR ${SUBDIRS})
+      IF (IS_DIRECTORY ${PATH}/${DIR})
+        IF (NOT "${DIR}" STREQUAL "CMakeFiles")
+          FILE(GLOB_RECURSE SUBDIR_FILES "${PATH}/${DIR}/${PATTERN}")
+          LIST(APPEND ${RETURN_VALUE} ${SUBDIR_FILES})
+        ENDIF()
+      ENDIF()
+    ENDFOREACH()
+  ELSE ()
+    FILE(GLOB ${RETURN_VALUE} "${PATTERN}")
+
+    FOREACH (PATH ${SOURCE_SUBDIRS})
+      FILE(GLOB SUBDIR_FILES "${PATH}/${PATTERN}")
+      LIST(APPEND ${RETURN_VALUE} ${SUBDIR_FILES})
+    ENDFOREACH(PATH ${SOURCE_SUBDIRS})
+  ENDIF ()
+
+  IF (FILTER_OUT)  # test the variable itself; expanding it made the path list the condition
+    LIST(REMOVE_ITEM ${RETURN_VALUE} ${FILTER_OUT})
+  ENDIF()
+
+  SET(${RETURN_VALUE} ${${RETURN_VALUE}} PARENT_SCOPE)
+ENDFUNCTION(AUTO_SOURCES)
+
+FUNCTION(CONTAINS_STRING FILE SEARCH RETURN_VALUE) # sets RETURN_VALUE to TRUE in the caller when a line of FILE matches regex SEARCH
+ FILE(STRINGS ${FILE} FILE_CONTENTS REGEX ".*${SEARCH}.*")
+ IF (FILE_CONTENTS)
+ SET(${RETURN_VALUE} TRUE PARENT_SCOPE)
+ ENDIF() # on no match RETURN_VALUE is not written at all
+ENDFUNCTION(CONTAINS_STRING)
diff --git a/depends/storage/CMake/Options.cmake b/depends/storage/CMake/Options.cmake
new file mode 100644
index 0000000..36841ea
--- /dev/null
+++ b/depends/storage/CMake/Options.cmake
@@ -0,0 +1,71 @@
+##############################################################################
+# In this file we handle all env and customer's settings
+##############################################################################
+
+##############################################################################
+# Setup build and dependencies information
+##############################################################################
+SET(DEPENDENCY_INSTALL_PREFIX "/opt/dependency")  # default; a non-empty environment variable of the same name overrides it
+IF(NOT "$ENV{DEPENDENCY_INSTALL_PREFIX}" STREQUAL "")
+    SET(DEPENDENCY_INSTALL_PREFIX "$ENV{DEPENDENCY_INSTALL_PREFIX}")
+ENDIF()
+# Same rule: compare against "" because IF(<path-or-name>) treats the text as a variable name and is false.
+SET(DEPENDENCY_DIST_PACKAGE_NAME "dependency-dist-package.tar.gz")
+IF(NOT "$ENV{DEPENDENCY_DIST_PACKAGE_NAME}" STREQUAL "")
+    SET(DEPENDENCY_DIST_PACKAGE_NAME "$ENV{DEPENDENCY_DIST_PACKAGE_NAME}")
+ENDIF()
+
+SET(CMAKE_PREFIX_PATH "${DEPENDENCY_INSTALL_PREFIX}" ${CMAKE_PREFIX_PATH})
+SET(CMAKE_PREFIX_PATH "${DEPENDENCY_INSTALL_PREFIX}/package" ${CMAKE_PREFIX_PATH})
+SET(CMAKE_PREFIX_PATH "${DEPENDENCY_INSTALL_PREFIX}/tools" ${CMAKE_PREFIX_PATH})
+
+SET(DEPENDENCY_LIBRARY_PATH "${DEPENDENCY_INSTALL_PREFIX}/package/lib:${DEPENDENCY_LIBRARY_PATH}")
+SET(DEPENDENCY_LIBRARY_PATH "${DEPENDENCY_INSTALL_PREFIX}/package/lib64:${DEPENDENCY_LIBRARY_PATH}")
+
+##############################################################################
+# Setup build flags
+##############################################################################
+OPTION(ENABLE_COVERAGE "enable code coverage." OFF)
+
+IF(NOT CMAKE_BUILD_TYPE)
+ SET(CMAKE_BUILD_TYPE Debug CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE)
+ENDIF(NOT CMAKE_BUILD_TYPE)
+
+IF(ENABLE_COVERAGE)  # any CMake true value (ON, 1, TRUE, YES, ...) enables coverage, matching OPTION() semantics
+    INCLUDE(CodeCoverage)
+ENDIF()
+
+IF(CMAKE_BUILD_TYPE MATCHES Debug)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0")
+ENDIF()
+
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -g -fno-omit-frame-pointer -fno-strict-aliasing")
+
+IF(ENABLE_AVX)  # accepts any CMake true value, not only the literal string ON
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mavx -mno-avx2")
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -DAVX_OPT")
+ELSE()
+    SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -mno-avx -mno-avx2")
+ENDIF()
+
+# C++11 is needed to provide a thread-safe singleton implementation.
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -std=c++11 -Wno-deprecated-register")
+#-Rpass-missed=loop-vectorize -Wall -Wconversion
+
+IF(CMAKE_COMPILER_IS_CLANG)
+ SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-limit-debug-info -stdlib=libc++ -DUSE_CLANG")
+ IF(OS_LINUX)
+ SET(CLANG_LDFLAGS "-lc++abi -lc++" ${CLANG_LDFLAGS})
+ ENDIF(OS_LINUX)
+ENDIF(CMAKE_COMPILER_IS_CLANG)
+
+TRY_COMPILE(INT64T_EQUAL_LONGLONG
+ ${CMAKE_BINARY_DIR}
+ ${CMAKE_CURRENT_SOURCE_DIR}/CMake/CMakeTestCompileInt64tType.cc
+ OUTPUT_VARIABLE OUTPUT)
+
+IF(INT64T_EQUAL_LONGLONG)
+ MESSAGE(STATUS "Checking whether int64_t is typedef to long long -- yes")
+ELSE(INT64T_EQUAL_LONGLONG)
+ MESSAGE(STATUS "Checking whether int64_t is typedef to long long -- no")
+ENDIF(INT64T_EQUAL_LONGLONG)
diff --git a/depends/storage/CMake/Platform.cmake b/depends/storage/CMake/Platform.cmake
new file mode 100644
index 0000000..1ee0238
--- /dev/null
+++ b/depends/storage/CMake/Platform.cmake
@@ -0,0 +1,47 @@
+IF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ SET(OS_LINUX true CACHE INTERNAL "Linux operating system")
+ELSEIF(CMAKE_SYSTEM_NAME STREQUAL "Darwin")
+ SET(OS_MACOSX true CACHE INTERNAL "Mac Darwin operating system")
+ELSE(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+ MESSAGE(FATAL_ERROR "Unsupported OS: \"${CMAKE_SYSTEM_NAME}\"")
+ENDIF(CMAKE_SYSTEM_NAME STREQUAL "Linux")
+
+IF(CMAKE_COMPILER_IS_GNUCXX)
+    EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE COMPILER_OUTPUT)
+    # Take the first dotted triple; each component may have several digits (e.g. 10.2.1).
+    STRING(REGEX MATCH "[0-9]+\\.[0-9]+\\.[0-9]+" GCC_COMPILER_VERSION "${COMPILER_OUTPUT}")
+    STRING(REGEX MATCHALL "[0-9]+" GCC_COMPILER_VERSION "${GCC_COMPILER_VERSION}")
+
+    LIST(LENGTH GCC_COMPILER_VERSION GCC_COMPILER_VERSION_LEN)
+    IF (NOT 3 EQUAL ${GCC_COMPILER_VERSION_LEN})
+        MESSAGE(FATAL_ERROR "Cannot get gcc version from \"${COMPILER_OUTPUT}\"")
+    ENDIF(NOT 3 EQUAL ${GCC_COMPILER_VERSION_LEN})
+
+    LIST(GET GCC_COMPILER_VERSION 0 GCC_COMPILER_VERSION_MAJOR)
+    LIST(GET GCC_COMPILER_VERSION 1 GCC_COMPILER_VERSION_MINOR)
+    LIST(GET GCC_COMPILER_VERSION 2 GCC_COMPILER_VERSION_PATCH)
+
+    SET(GCC_COMPILER_VERSION_MAJOR ${GCC_COMPILER_VERSION_MAJOR} CACHE INTERNAL "gcc major version")
+    SET(GCC_COMPILER_VERSION_MINOR ${GCC_COMPILER_VERSION_MINOR} CACHE INTERNAL "gcc minor version")
+    SET(GCC_COMPILER_VERSION_PATCH ${GCC_COMPILER_VERSION_PATCH} CACHE INTERNAL "gcc patch version")
+
+    MESSAGE(STATUS "checking compiler: GCC (${GCC_COMPILER_VERSION_MAJOR}.${GCC_COMPILER_VERSION_MINOR}.${GCC_COMPILER_VERSION_PATCH})")
+ELSE(CMAKE_COMPILER_IS_GNUCXX)
+    EXECUTE_PROCESS(COMMAND ${CMAKE_CXX_COMPILER} --version OUTPUT_VARIABLE COMPILER_OUTPUT)
+    IF(COMPILER_OUTPUT MATCHES "clang")
+        SET(CMAKE_COMPILER_IS_CLANG true CACHE INTERNAL "using clang as compiler")
+        MESSAGE(STATUS "checking compiler: CLANG")
+    ELSE(COMPILER_OUTPUT MATCHES "clang")
+        MESSAGE(FATAL_ERROR "Unsupported compiler: \"${CMAKE_CXX_COMPILER}\"")
+    ENDIF(COMPILER_OUTPUT MATCHES "clang")
+ENDIF(CMAKE_COMPILER_IS_GNUCXX)
+
+INCLUDE (TestBigEndian)
+TEST_BIG_ENDIAN(IS_BIG_ENDIAN)
+if(IS_BIG_ENDIAN)
+ message(STATUS "BIG_ENDIAN")
+ ADD_DEFINITIONS(-DIS_BIG_ENDIAN)
+else()
+ message(STATUS "LITTLE_ENDIAN")
+ ADD_DEFINITIONS(-DIS_LITTLE_ENDIAN)
+endif()
diff --git a/depends/storage/CMakeLists.txt b/depends/storage/CMakeLists.txt
new file mode 100644
index 0000000..d2a64c6
--- /dev/null
+++ b/depends/storage/CMakeLists.txt
@@ -0,0 +1,28 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+PROJECT(storage)
+
+##############################################################################
+# General CMake initialization
+##############################################################################
+SET(CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/CMake" ${CMAKE_MODULE_PATH})
+SET(CMAKE_VERBOSE_MAKEFILE OFF CACHE STRING "Verbose build." FORCE)
+
+IF(${CMAKE_CURRENT_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR})
+ MESSAGE(FATAL_ERROR "cannot build the project in the source directory! Out-of-source build is enforced!")
+ENDIF()
+
+##############################################################################
+# Import env, customer settings and utilities
+##############################################################################
+INCLUDE(Functions)
+INCLUDE(Platform)
+INCLUDE(Options)
+
+ADD_SUBDIRECTORY(src)
+ADD_SUBDIRECTORY(test)
+
+ADD_CUSTOM_TARGET(coverage
+ COMMAND make resetcoverage
+ COMMAND make -j8 unittest
+ COMMAND make ucoverage
+ COMMENT "Run all unit tests and get coverage...")
diff --git a/depends/storage/README b/depends/storage/README
new file mode 100644
index 0000000..f67eafe
--- /dev/null
+++ b/depends/storage/README
@@ -0,0 +1,15 @@
+# How to build
+## Get source code and Bootstrapping
+
+ cd storage
+ mkdir build
+ cd build
+ ../bootstrap --prefix=/opt/dependency/package # the prefix shown is the default
+
+## Build
+
+ make
+
+## Install
+
+ make install
diff --git a/depends/storage/bootstrap b/depends/storage/bootstrap
new file mode 100755
index 0000000..36fd66c
--- /dev/null
+++ b/depends/storage/bootstrap
@@ -0,0 +1,109 @@
+#!/bin/sh
+# Licensed to the Apache Software Foundation (ASF) under one
+# or more contributor license agreements. See the NOTICE file
+# distributed with this work for additional information
+# regarding copyright ownership. The ASF licenses this file
+# to you under the Apache License, Version 2.0 (the
+# "License"); you may not use this file except in compliance
+# with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing,
+# software distributed under the License is distributed on an
+# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+# KIND, either express or implied. See the License for the
+# specific language governing permissions and limitations
+# under the License.
+#
+
+die() {
+ echo "$@" 1>&2 ; exit 1
+}
+
+arg ()
+{
+ echo "$1" | sed "s/^${2-[^=]*=}//"
+}
+
+# Detect directory information.
+source_dir=`cd "\`dirname \"$0\"\`";pwd`
+binary_dir=`pwd`
+
+# Choose the default install prefix.
+default_prefix="/opt/dependency/package"
+
+# Display bootstrap usage
+usage() { # print the option summary, then exit with status 10
+echo '
+Usage: '"$0"' [<options>]
+Options: [defaults in brackets after descriptions]
+Configuration:
+ --help print this message
+ --prefix=PREFIX install files in tree rooted at PREFIX
+ ['"${default_prefix}"']
+ --enable-coverage enable code coverage (implies --enable-debug)
+ --enable-debug enable debug build
+ --enable-avx=on|off enable avx for vector instruction optimization [on]
+'
+ exit 10
+}
+
+# Parse arguments
+prefix_dir="${default_prefix}"
+build_type="Release"
+enable_coverage="OFF"
+enable_avx="ON"
+while test $# != 0; do
+ case "$1" in
+ --prefix=*) dir=`arg "$1"`
+ prefix_dir="$dir";;
+ --enable-coverage) enable_coverage="ON"
+ build_type="Debug";;
+ --enable-debug) build_type="Debug";;
+ --enable-avx=*) avx=`arg "$1"`
+ enable_avx="$avx";;
+ --help) usage ;;
+ *) die "Unknown option: $1" ;;
+ esac
+ shift
+done
+
+if [ "${source_dir}" = "${binary_dir}" ]; then
+  die "cannot build the project in the source directory! Out-of-source build is enforced!"
+fi
+
+enable_avx_upper=`echo "${enable_avx}" | tr 'a-z' 'A-Z'`
+if [ "${enable_avx_upper}" != "ON" ] && [ "${enable_avx_upper}" != "OFF" ]; then
+  die "unknown value for option enable-avx: ${enable_avx}, valid options are: on and off"
+fi
+
+# Default to gcc/g++ when CC/CXX are unset; POSIX [ ] is used because the shebang is /bin/sh
+if [ x"${CC}" = x"" ]; then
+  CC=gcc
+fi
+
+if [ x"${CXX}" = x"" ]; then
+  CXX=g++
+fi
+
+c_compiler=`which ${CC}`
+cxx_compiler=`which ${CXX}`
+cmake=`which cmake`
+# Quoted below so an empty lookup result fails the -x test instead of collapsing to [ ! -x ]
+if [ ! -x "${c_compiler}" ]; then
+  die "cannot found c compiler"
+fi
+
+if [ ! -x "${cxx_compiler}" ]; then
+  die "cannot found c++ compiler"
+fi
+
+if [ ! -x "${cmake}" ]; then
+  die "cannot found cmake"
+fi
+
+# Configure
+"${cmake}" -DCMAKE_BUILD_TYPE="${build_type}" -DCMAKE_INSTALL_PREFIX="${prefix_dir}" -DCMAKE_C_COMPILER="${c_compiler}" -DCMAKE_CXX_COMPILER="${cxx_compiler}" -DENABLE_COVERAGE="${enable_coverage}" -DENABLE_AVX="${enable_avx_upper}" "${source_dir}" || die "failed to configure the project"
+
+echo 'bootstrap success. Run "make" to build.'
diff --git a/depends/storage/src/CMakeLists.txt b/depends/storage/src/CMakeLists.txt
new file mode 100644
index 0000000..dd7ed3b
--- /dev/null
+++ b/depends/storage/src/CMakeLists.txt
@@ -0,0 +1,84 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+PROJECT(storage)
+
+FIND_PACKAGE(Protobuf REQUIRED)
+FIND_PACKAGE(JSON REQUIRED)
+FIND_PACKAGE(Snappy REQUIRED)
+FIND_PACKAGE(ZLIB REQUIRED)
+
+SET(storage_VERSION_MAJOR 0)
+SET(storage_VERSION_MINOR 1)
+SET(storage_VERSION_PATCH 0)
+SET(storage_VERSION_API 1)
+set(CMAKE_MACOSX_RPATH 1)
+
+SET(storage_ROOT_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+SET(storage_SRC_DIR ${CMAKE_CURRENT_SOURCE_DIR}/storage)
+SET(orcformat_proto_DIR ${storage_SRC_DIR}/format/orc)
+
+file(GLOB proto_files "${storage_SRC_DIR}/format/orc/*.proto")
+set(proto_SRC_DIR ${CMAKE_BINARY_DIR}/src/storage/format/orc)
+set(storage_PROTO_HDRS
+ ${proto_SRC_DIR}/orc_proto.pb.h
+)
+set(storage_PROTO_SRCS
+ ${proto_SRC_DIR}/orc_proto.pb.cc
+)
+file(MAKE_DIRECTORY ${proto_SRC_DIR})
+add_custom_command(
+ OUTPUT ${storage_PROTO_SRCS} ${storage_PROTO_HDRS}
+ COMMAND ${Protobuf_PROTOC_EXECUTABLE}
+ ARGS --cpp_out ${CMAKE_BINARY_DIR}/src -I ${CMAKE_CURRENT_SOURCE_DIR} ${proto_files}
+ DEPENDS "${proto_files}"
+ )
+
+AUTO_SOURCES(storage_files "*.cc" "RECURSE" "${storage_SRC_DIR}")
+LIST(APPEND storage_SOURCES ${storage_files})
+
+AUTO_SOURCES(common_HEADER "*.h" "${storage_SRC_DIR}/common")
+AUTO_SOURCES(cwrapper_HEADER "*.h" "${storage_SRC_DIR}/cwrapper")
+AUTO_SOURCES(format_HEADER "*.h" "${storage_SRC_DIR}/format")
+AUTO_SOURCES(orc_format_HEADER "*.h" "${storage_SRC_DIR}/format/orc")
+AUTO_SOURCES(testutil_HEADER "*.h" "${storage_SRC_DIR}/testutil")
+
+INCLUDE_DIRECTORIES(${storage_ROOT_DIR})
+INCLUDE_DIRECTORIES(${DEPENDENCY_INSTALL_PREFIX}/package/include)
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/src)
+INCLUDE_DIRECTORIES(/usr/local/include)
+LINK_DIRECTORIES(/usr/local/lib)
+LINK_DIRECTORIES(${DEPENDENCY_INSTALL_PREFIX}/package/lib)
+
+
+ADD_LIBRARY(storage-shared SHARED ${storage_SOURCES} ${storage_PROTO_SRCS} ${storage_PROTO_HDRS})
+# ADD_LIBRARY(storage-static STATIC ${storage_SOURCES} ${storage_PROTO_SRCS} ${storage_PROTO_HDRS})
+
+SET_TARGET_PROPERTIES(storage-shared PROPERTIES OUTPUT_NAME "storage")
+# SET_TARGET_PROPERTIES(storage-static PROPERTIES OUTPUT_NAME "storage")
+
+target_link_libraries(storage-shared ${CLANG_LDFLAGS}
+ dbcommon
+ univplan
+ hdfs3
+ snappy
+ lz4
+ ${ZLIB_LIBRARIES}
+ ${JSON_LIBRARIES}
+ glog
+ protobuf
+ pthread
+ iconv)
+# target_link_libraries(storage-static ${CLANG_LDFLAGS} dbcommon univplan hdfs3 snappy lz4 ${ZLIB_LIBRARIES} ${JSON_LIBRARIES} glog protobuf pthread iconv)
+
+INSTALL(TARGETS storage-shared
+ RUNTIME DESTINATION bin
+ LIBRARY DESTINATION lib
+ ARCHIVE DESTINATION lib)
+
+INSTALL(FILES ${common_HEADER} DESTINATION include/storage/common)
+INSTALL(FILES ${cwrapper_HEADER} DESTINATION include/storage/cwrapper)
+INSTALL(FILES ${format_HEADER} DESTINATION include/storage/format)
+INSTALL(FILES ${orc_format_HEADER} DESTINATION include/storage/format/orc)
+INSTALL(FILES ${storage_PROTO_HDRS} DESTINATION include/storage/format/orc)
+INSTALL(FILES ${testutil_HEADER} DESTINATION include/storage/testutil)
+
+SET(storage_ROOT_DIR ${storage_ROOT_DIR} PARENT_SCOPE)
diff --git a/depends/storage/src/storage/README b/depends/storage/src/storage/README
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/depends/storage/src/storage/README
diff --git a/depends/storage/src/storage/common/bloom-filter.h b/depends/storage/src/storage/common/bloom-filter.h
new file mode 100644
index 0000000..726cbea
--- /dev/null
+++ b/depends/storage/src/storage/common/bloom-filter.h
@@ -0,0 +1,189 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_COMMON_BLOOM_FILTER_H_
+#define STORAGE_SRC_STORAGE_COMMON_BLOOM_FILTER_H_
+
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+
+#include "dbcommon/hash/fast-hash.h"
+
+namespace storage {
+
+// Fixed-length bit set backed by an array of 64-bit words.
+class MyBitSet {
+ public:
+  // Allocates and zeroes ceil(bits / 64) words; the array is freed on destruction.
+  explicit MyBitSet(int64_t bits) : ownData(true) {
+    length = static_cast<int64_t>(
+        ceil(static_cast<double>(bits) / static_cast<double>(64)));
+    data = new uint64_t[length];
+    clear();
+  }
+
+  // Wraps `size` caller-provided words; ownData stays false so they are never freed here.
+  MyBitSet(uint64_t *data, int64_t size) : data(data), length(size) {
+    assert(data != nullptr && length > 0);
+  }
+
+  ~MyBitSet() {
+    if (ownData) delete[] data;  // allocated with new[], so delete[] is required
+  }
+
+  typedef std::unique_ptr<MyBitSet> uptr;
+
+  void set(int64_t index) {
+    assert(index >= 0);
+    // Mask the shift count: shifting a 64-bit value by >= 64 is undefined.
+    data[index >> 6] |= (uint64_t{1} << (index & 63));
+  }
+
+  bool get(int64_t index) {
+    assert(index >= 0);
+    return (data[index >> 6] & (uint64_t{1} << (index & 63))) != 0;
+  }
+
+  uint64_t *getData() { return data; }
+  int64_t size() { return length; }  // number of 64-bit words, not bits
+  void clear() { memset(data, 0, size() * sizeof(uint64_t)); }
+
+ private:
+  uint64_t *data = nullptr;
+  int64_t length = 0;
+  bool ownData = false;
+};
+
+class BloomFilter { // Bloom filter over a MyBitSet with int, double and string keys
+ public:
+ explicit BloomFilter(int64_t expectedEntry) : kExpectedEntry(expectedEntry) { // sizes the filter for expectedEntry keys at kDefaultFpp
+ assert(kExpectedEntry > 0 && "expectedEntries should be > 0");
+ assert(kDefaultFpp > 0.0 && kDefaultFpp < 1.0 &&
+ "False positive probability should be > 0.0 & < 1.0");
+ int64_t nb = optimalNumOfBits(kExpectedEntry, kDefaultFpp);
+ numBits = nb + 64 - (nb % 64); // next multiple of 64 strictly above nb
+ numHashFunctions = optimalNumOfHashFunctions(kExpectedEntry, numBits);
+ bitSet.reset(new MyBitSet(numBits));
+ }
+
+ BloomFilter(uint64_t *bits, int64_t size, uint32_t numFuncs) { // wraps an existing array of `size` 64-bit words
+ bitSet.reset(new MyBitSet(bits, size));
+ numBits = size * 64;
+ numHashFunctions = numFuncs;
+ }
+
+ virtual ~BloomFilter() {}
+
+ typedef std::unique_ptr<BloomFilter> uptr;
+
+ void addInt(int64_t val) { addHash(getIntegerHash(val)); }
+ bool testInt(int64_t val) { return testHash(getIntegerHash(val)); }
+
+ void addDouble(double val) { addInt(doubleToRawBits(val)); } // doubles are hashed through their raw 64-bit pattern
+ bool testDouble(double val) { return testInt(doubleToRawBits(val)); }
+
+ void addString(const char *buffer, uint64_t len) {
+ int64_t hash64 = static_cast<int64_t>(murmur3.hash64(buffer, len));
+ addHash(hash64);
+ }
+ bool testString(const char *buffer, uint64_t len) {
+ int64_t hash64 = static_cast<int64_t>(murmur3.hash64(buffer, len));
+ return testHash(hash64);
+ }
+
+ uint64_t *getBitSet() { return bitSet->getData(); }
+ int64_t size() { return bitSet->size(); } // length in 64-bit words, as reported by MyBitSet::size()
+
+ void reset() { bitSet->clear(); }
+
+ uint32_t getNumHashFunctions() { return numHashFunctions; }
+
+ private:
+ int64_t optimalNumOfBits(int64_t n, double p) { // ceil(-n * ln(p) / ln(2)^2)
+ auto ln2 = std::log(2);
+ return static_cast<int64_t>(std::ceil(-(n * std::log(p) / ln2 / ln2)));
+ }
+
+ uint32_t optimalNumOfHashFunctions(int64_t n, int64_t m) { // ceil((m / n) * ln(2))
+ auto frac = static_cast<double>(m) / static_cast<double>(n);
+ return static_cast<uint32_t>(std::ceil(frac * std::log(2)));
+ }
+
+ int64_t getIntegerHash(int64_t key) { // NOTE(review): signed shifts/adds here can overflow; presumably wraparound is relied on - confirm
+ key = (~key) + (key << 21); // key = (key << 21) - key - 1;
+ key = key ^ (key >> 24);
+ key = (key + (key << 3)) + (key << 8); // key * 265
+ key = key ^ (key >> 14);
+ key = (key + (key << 2)) + (key << 4); // key * 21
+ key = key ^ (key >> 28);
+ key = key + (key << 31);
+ return key;
+ }
+
+ void addHash(int64_t hash64) { // sets one bit per hash function, derived from hash1 + i * hash2
+ int64_t hash1 = hash64;
+ int64_t hash2 = static_cast<int64_t>(static_cast<uint64_t>(hash64) >> 32); // upper 32 bits of hash64, zero-extended
+
+ for (uint32_t i = 1; i <= numHashFunctions; ++i) {
+ int64_t combinedHash = hash1 + (i * hash2); // NOTE(review): signed addition, may overflow for hash1 near INT64_MAX - confirm
+ // hashcode should be positive, flip all the bits if it's negative
+ if (combinedHash < 0) {
+ combinedHash = ~combinedHash;
+ }
+ int64_t pos = combinedHash % numBits;
+ bitSet->set(pos);
+ }
+ }
+
+ bool testHash(int64_t hash64) { // probes the same positions addHash() sets; false as soon as one bit is clear
+ int64_t hash1 = hash64;
+ int64_t hash2 = static_cast<int64_t>(static_cast<uint64_t>(hash64) >> 32);
+
+ for (uint32_t i = 1; i <= numHashFunctions; ++i) {
+ int64_t combinedHash = hash1 + (i * hash2);
+ // hashcode should be positive, flip all the bits if it's negative
+ if (combinedHash < 0) {
+ combinedHash = ~combinedHash;
+ }
+ int64_t pos = combinedHash % numBits;
+ if (!bitSet->get(pos)) {
+ return false;
+ }
+ }
+ return true;
+ }
+
+ int64_t doubleToRawBits(double val) { // copies the double's bytes into an int64_t without numeric conversion
+ int64_t bits;
+ memcpy(&bits, &val, sizeof(bits));
+ return bits;
+ }
+
+ private:
+ int64_t numBits = 0;
+ uint32_t numHashFunctions = 0;
+ int64_t kExpectedEntry = 0;
+ MyBitSet::uptr bitSet = nullptr;
+ const double kDefaultFpp = 0.05; // false-positive probability used to size the filter
+ dbcommon::Murmur3 murmur3;
+};
+
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_COMMON_BLOOM_FILTER_H_
diff --git a/depends/storage/src/storage/common/string.h b/depends/storage/src/storage/common/string.h
new file mode 100644
index 0000000..f7518e0
--- /dev/null
+++ b/depends/storage/src/storage/common/string.h
@@ -0,0 +1,95 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_COMMON_STRING_H_
+#define STORAGE_SRC_STORAGE_COMMON_STRING_H_
+
+#include <cassert>
+#include <cstdint>
+#include <cstring>
+#include <stdexcept>
+#include <string>
+
+#include "dbcommon/utils/cutils.h"
+
+namespace storage {
+
+#define STRING_INIT_RESERVE_BYTES 1024
+
+// Growable byte buffer backed by dbcommon::cnmalloc/cnrealloc storage.
+//
+// The buffer starts with STRING_INIT_RESERVE_BYTES of capacity and at least
+// doubles whenever an append does not fit. Nothing here writes a terminating
+// NUL, so size() is the only indication of how many bytes are valid.
+class String {
+ public:
+  String() {
+    assert(reserved_ > 0);
+    data_ = dbcommon::cnmalloc(reserved_);
+  }
+
+  virtual ~String() { dbcommon::cnfree(data_); }
+
+  // The storage is owned through a raw pointer, so a member-wise copy would
+  // be released twice by the two destructors. Copying is disabled.
+  String(const String &) = delete;
+  String &operator=(const String &) = delete;
+
+  // Appends the sz bytes starting at value.
+  void append(const char *value, uint32_t sz) {
+    if (sz == 0) return;  // nothing to copy; keeps memcpy away from null input
+    enlarge(sz);
+    assert(size_ + sz <= reserved_);
+    memcpy(&data_[size_], value, sz);
+    size_ += sz;
+  }
+
+  // Appends the sz bytes starting at value + pos.
+  void append(const char *value, uint32_t pos, uint32_t sz) {
+    if (sz == 0) return;
+    enlarge(sz);
+    assert(size_ + sz <= reserved_);
+    memcpy(&data_[size_], value + pos, sz);
+    size_ += sz;
+  }
+
+  // Appends a single byte.
+  void appendChar(char value) {
+    enlarge(1);
+    assert(size_ + 1 <= reserved_);
+    data_[size_] = value;
+    size_ += 1;
+  }
+
+  // Pointer to the first stored byte; invalidated by any growing append.
+  char *data() const { return data_; }
+
+  // Number of valid bytes currently stored.
+  uint32_t size() const { return size_; }
+
+  // Forgets the contents but keeps the allocated capacity.
+  void reset() { size_ = 0; }
+
+  // Copies len bytes starting at offset pos into a std::string. The caller is
+  // responsible for keeping [pos, pos + len) inside the buffer.
+  std::string substr(uint32_t pos, uint32_t len) const {
+    return std::string(data_ + pos, len);
+  }
+
+ private:
+  // Ensures capacity for `needed` more bytes beyond size_. The arithmetic is
+  // done in 64 bits so neither the required size nor the doubling can wrap
+  // around uint32_t (which previously could loop forever on newLen == 0).
+  void enlarge(uint32_t needed) {
+    const uint64_t required = static_cast<uint64_t>(size_) + needed;
+    if (required <= reserved_) return;
+
+    const uint64_t kMaxBytes = 0xFFFFFFFFull;  // size_/reserved_ are uint32_t
+    if (required > kMaxBytes) {
+      throw std::length_error("storage::String cannot exceed 4GB");
+    }
+
+    uint64_t newLen = 2 * static_cast<uint64_t>(reserved_);
+    while (required > newLen) newLen *= 2;
+    if (newLen > kMaxBytes) newLen = kMaxBytes;
+
+    data_ = dbcommon::cnrealloc(data_, sizeof(char) * newLen);
+    reserved_ = static_cast<uint32_t>(newLen);
+  }
+
+ private:
+  char *data_ = nullptr;                           // owned storage
+  uint32_t size_ = 0;                              // bytes in use
+  uint32_t reserved_ = STRING_INIT_RESERVE_BYTES;  // bytes allocated
+};
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_COMMON_STRING_H_
diff --git a/depends/storage/src/storage/cwrapper/hdfs-file-system-c.cc b/depends/storage/src/storage/cwrapper/hdfs-file-system-c.cc
new file mode 100644
index 0000000..c468222
--- /dev/null
+++ b/depends/storage/src/storage/cwrapper/hdfs-file-system-c.cc
@@ -0,0 +1,486 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/cwrapper/hdfs-file-system-c.h"
+
+#include <string>
+#include <vector>
+
+#include "dbcommon/filesystem/hdfs/hdfs-file-system.h"
+#include "dbcommon/log/logger.h"
+#include "dbcommon/utils/file-info.h"
+#include "dbcommon/utils/global.h"
+#include "dbcommon/utils/macro.h"
+
+extern "C" {
+
+// Opaque C handle for a file; `f` is cast to dbcommon::File * by
+// FETCH_FILE_HANDLE.
+struct FscHdfsFileC {
+ void *f;
+};
+
+// Opaque C handle for file metadata; `fi` is cast to dbcommon::FileInfo * by
+// FETCH_FILE_INFO_HANDLE.
+struct FscHdfsFileInfoC {
+ void *fi;
+};
+
+// Array of `size` entries; each fiVec element is used as a
+// FscHdfsFileInfoC * by the accessor and free functions in this file.
+struct FscHdfsFileInfoArrayC {
+ int size;
+ void **fiVec;
+};
+
+// Array of `size` entries; each fblVec element is used as a
+// FscHdfsFileBlockLocationC * by the accessor and free functions in this file.
+struct FscHdfsFileBlockLocationArrayC {
+ int size;
+ void **fblVec;
+};
+
+// Opaque C handle for one block location; `bl` is cast to
+// dbcommon::FileBlockLocation * by FETCH_FILE_BLOCK_LOCATION_HANDLE.
+struct FscHdfsFileBlockLocationC {
+ void *bl;
+};
+
+// Opaque C handle for a file system plus the last error recorded against it.
+// `fs` is cast to dbcommon::FileSystem * (or dbcommon::HdfsFileSystem *) by
+// the FETCH_*FILE_SYSTEM_HANDLE macros.
+struct FscHdfsFileSystemC {
+ void *fs;
+ CatchedError error;
+};
+
+// Each macro below declares a local variable (second argument) holding the
+// typed C++ pointer stored in the opaque C handle (first argument). The
+// expansion already ends with ';', so call sites omit the semicolon.
+
+// Declares `ifs` as the dbcommon::FileSystem * stored in ofs->fs.
+#define FETCH_FILE_SYSTEM_HANDLE(ofs, ifs) \
+ dbcommon::FileSystem *ifs = static_cast<dbcommon::FileSystem *>((ofs)->fs);
+
+// Declares `ifs` as a dbcommon::HdfsFileSystem *; the static_cast is unchecked.
+#define FETCH_HDFS_FILE_SYSTEM_HANDLE(ofs, ifs) \
+ dbcommon::HdfsFileSystem *ifs = \
+ static_cast<dbcommon::HdfsFileSystem *>((ofs)->fs);
+
+// Declares `ifile` as the dbcommon::File * stored in ofile->f.
+#define FETCH_FILE_HANDLE(ofile, ifile) \
+ dbcommon::File *ifile = static_cast<dbcommon::File *>((ofile)->f);
+
+// Declares `ifi` as the dbcommon::FileInfo * stored in ofi->fi.
+#define FETCH_FILE_INFO_HANDLE(ofi, ifi) \
+ dbcommon::FileInfo *ifi = static_cast<dbcommon::FileInfo *>((ofi)->fi);
+
+// Declares `ibl` as the dbcommon::FileBlockLocation * stored in obl->bl.
+#define FETCH_FILE_BLOCK_LOCATION_HANDLE(obl, ibl) \
+ dbcommon::FileBlockLocation *ibl = \
+ static_cast<dbcommon::FileBlockLocation *>((obl)->bl);
+
+// Records errCode and a heap-allocated copy of reason in *ce, releasing any
+// message stored previously. A null reason is stored as an empty message
+// instead of being passed to strlen.
+void FscHdfsSetError(CatchedError *ce, int errCode, const char *reason) {
+  assert(ce != nullptr);
+  if (reason == nullptr) reason = "";
+  FscHdfsFreeErrorContent(ce); /* free the old one if it was filled already */
+  ce->errMessage = nullptr; /* never leave a stale pointer if new[] throws */
+  ce->errCode = errCode;
+  ce->errMessage = new char[strlen(reason) + 1]();
+  strcpy(ce->errMessage, reason); /* NOLINT */
+}
+
+// Closes the file behind f. A TransactionAbortException is recorded in
+// fs->error rather than propagated to the C caller.
+void FscHdfsCloseFileC(FscHdfsFileSystemC *fs, FscHdfsFileC *f) {
+  FETCH_FILE_HANDLE(f, ifile)
+  try {
+    ifile->close();
+  } catch (dbcommon::TransactionAbortException &e) {
+    FscHdfsSetError(&(fs->error), e.errCode(), e.what());
+  }
+}
+
+// Looks up the file system for "hdfs://<namenode>:<port>" in FSManager and
+// wraps it in a new handle with a cleared error slot. Returns nullptr when a
+// TransactionAbortException is thrown (no error details can be reported since
+// there is no handle yet).
+FscHdfsFileSystemC *FscHdfsNewFileSystem(const char *namenode, uint16_t port) {
+  try {
+    std::string address("hdfs://");
+    address += namenode;
+    address += ":" + std::to_string(port);
+
+    FscHdfsFileSystemC *handle = nullptr;
+    dbcommon::FileSystem *backend = FSManager.get(address);
+    handle = new FscHdfsFileSystemC();
+    handle->fs = backend;
+    handle->error.errCode = 0;
+    handle->error.errMessage = nullptr;
+    return handle;
+  } catch (dbcommon::TransactionAbortException &ex) {
+    return nullptr;
+  }
+}
+
+// Opens path with the given flags and returns a new file handle that owns the
+// opened dbcommon::File. On TransactionAbortException the error is stored in
+// fs->error and nullptr is returned.
+FscHdfsFileC *FscHdfsOpenFile(FscHdfsFileSystemC *fs, const char *path,
+                              int flags) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    std::unique_ptr<dbcommon::File> opened = fileSystem->open(path, flags);
+    FscHdfsFileC *handle = new FscHdfsFileC();
+    handle->f = opened.release();
+    return handle;
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+    return nullptr;
+  }
+}
+
+void FscHdfsSeekFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f, uint64_t offset) {
+ FETCH_FILE_SYSTEM_HANDLE(fs, ifs)
+ FETCH_FILE_HANDLE(f, ifile)
+ try {
+ ifs->seek(ifile, offset);
+ } catch (dbcommon::TransactionAbortException &e) {
+ FscHdfsSetError(&(fs->error), e.errCode(), e.what());
+ }
+}
+
+// Removes path through the file system's remove(). Failures are stored in
+// fs->error.
+void FscHdfsRemovePath(FscHdfsFileSystemC *fs, const char *path) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    fileSystem->remove(path);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+}
+
+// Removes path through the file system's removeIfExists(). Failures are
+// stored in fs->error.
+void FscHdfsRemovePathIfExists(FscHdfsFileSystemC *fs, const char *path) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    fileSystem->removeIfExists(path);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+}
+
+// Fetches the metadata of fileName and returns it in a new handle that owns
+// the dbcommon::FileInfo. On TransactionAbortException the error is stored in
+// fs->error and nullptr is returned.
+FscHdfsFileInfoC *FscHdfsGetFileInfo(FscHdfsFileSystemC *fs,
+                                     const char *fileName) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    std::unique_ptr<dbcommon::FileInfo> info =
+        fileSystem->getFileInfo(fileName);
+    FscHdfsFileInfoC *handle = new FscHdfsFileInfoC();
+    handle->fi = info.release();
+    return handle;
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+    return nullptr;
+  }
+}
+
+// Returns 1 when path exists, 0 otherwise.
+//
+// Like the other wrappers in this file, a TransactionAbortException is
+// recorded in fs->error instead of escaping into the C caller; 0 is returned
+// in that case, so callers that need to tell "absent" from "failed" should
+// check FscHdfsHasErrorRaised().
+int FscHdfsExistPath(FscHdfsFileSystemC *fs, const char *path) {
+  FETCH_FILE_SYSTEM_HANDLE(fs, ifs)
+  try {
+    return ifs->exists(path) ? 1 : 0;
+  } catch (dbcommon::TransactionAbortException &e) {
+    FscHdfsSetError(&(fs->error), e.errCode(), e.what());
+    return 0;
+  }
+}
+
+// Returns the length reported by getFileLength(path), or -1 after storing the
+// error in fs->error when a TransactionAbortException is thrown.
+int64_t FscHdfsGetFileLength(FscHdfsFileSystemC *fs, const char *path) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    return fileSystem->getFileLength(path);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+  return -1;
+}
+
+// Returns the kind character reported by getFileKind(path), or 'U' after
+// storing the error in fs->error when a TransactionAbortException is thrown.
+char FscHdfsGetFileKind(FscHdfsFileSystemC *fs, const char *path) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    return fileSystem->getFileKind(path);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+  return 'U';
+}
+
+// Reads up to size bytes from file f into buf and returns what read()
+// returns; returns -1 after storing the error in fs->error when a
+// TransactionAbortException is thrown.
+int FscHdfsReadFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f, void *buf,
+                    int size) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  auto *file = static_cast<dbcommon::File *>(f->f);
+  try {
+    return fileSystem->read(file, buf, size);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+  return -1;
+}
+
+void FscHdfsWriteFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f, void *buf,
+ int size) {
+ FETCH_FILE_SYSTEM_HANDLE(fs, ifs)
+ FETCH_FILE_HANDLE(f, ifile)
+ try {
+ ifs->write(ifile, buf, size);
+ } catch (dbcommon::TransactionAbortException &e) {
+ FscHdfsSetError(&(fs->error), e.errCode(), e.what());
+ }
+}
+
+void FscHdfsCreateDir(FscHdfsFileSystemC *fs, const char *path) {
+ FETCH_FILE_SYSTEM_HANDLE(fs, ifs)
+ try {
+ ifs->createDir(path);
+ } catch (dbcommon::TransactionAbortException &e) {
+ FscHdfsSetError(&(fs->error), e.errCode(), e.what());
+ }
+}
+
+// Returns 1 when path + INSERT_HIDDEN_DIR exists, 0 otherwise.
+//
+// A TransactionAbortException is recorded in fs->error instead of escaping
+// into the C caller (matching FscHdfsCreateInsertDir); 0 is returned in that
+// case, so check FscHdfsHasErrorRaised() to tell "absent" from "failed".
+int FscHdfsExistInsertPath(FscHdfsFileSystemC *fs, const char *path) {
+  FETCH_FILE_SYSTEM_HANDLE(fs, ifs)
+  try {
+    std::string fullPath(path);
+    fullPath += INSERT_HIDDEN_DIR;
+    return ifs->exists(fullPath.c_str()) ? 1 : 0;
+  } catch (dbcommon::TransactionAbortException &e) {
+    FscHdfsSetError(&(fs->error), e.errCode(), e.what());
+    return 0;
+  }
+}
+
+// Creates the directory path + INSERT_HIDDEN_DIR. Failures are stored in
+// fs->error.
+void FscHdfsCreateInsertDir(FscHdfsFileSystemC *fs, const char *path) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    std::string insertDir(path);
+    insertDir += INSERT_HIDDEN_DIR;
+    fileSystem->createDir(insertDir.c_str());
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+}
+
+// Lists path and returns a new array holding one FscHdfsFileInfoC per entry;
+// every entry owns a copy of the corresponding dbcommon::FileInfo. On
+// TransactionAbortException the error is stored in fs->error and nullptr is
+// returned.
+FscHdfsFileInfoArrayC *FscHdfsDirPath(FscHdfsFileSystemC *fs,
+                                      const char *path) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    std::vector<std::unique_ptr<dbcommon::FileInfo> > entries =
+        fileSystem->dir(path);
+    FscHdfsFileInfoArrayC *listing = new FscHdfsFileInfoArrayC();
+    listing->size = entries.size();
+    listing->fiVec = new void *[listing->size];
+    for (int idx = 0; idx < listing->size; ++idx) {
+      dbcommon::FileInfo *infoCopy = new dbcommon::FileInfo();
+      FscHdfsFileInfoC *wrapper = new FscHdfsFileInfoC();
+      wrapper->fi = infoCopy;
+      listing->fiVec[idx] = wrapper;
+
+      *infoCopy = *(entries[idx]);
+    }
+    return listing;
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+    return nullptr;
+  }
+}
+
+// Applies mode to path through the file system's chmod(). Failures are stored
+// in fs->error.
+void FscHdfsChmodPath(FscHdfsFileSystemC *fs, const char *path, int mode) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    fileSystem->chmod(path, mode);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+}
+
+// Returns the position reported by tell() for file f, or -1 after storing the
+// error in fs->error when a TransactionAbortException is thrown.
+int64_t FscHdfsTellFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  auto *file = static_cast<dbcommon::File *>(f->f);
+  try {
+    return fileSystem->tell(file);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+  }
+  return -1;
+}
+
+// Queries the block locations of path for the byte range [start, start +
+// length) and returns a new array with one FscHdfsFileBlockLocationC per
+// block, each owning a field-by-field copy of the location. On
+// TransactionAbortException the error is stored in fs->error and nullptr is
+// returned.
+FscHdfsFileBlockLocationArrayC *FscHdfsGetPathFileBlockLocation(
+    FscHdfsFileSystemC *fs, const char *path, int64_t start, int64_t length) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  try {
+    std::vector<std::unique_ptr<dbcommon::FileBlockLocation> >  // NOLINT
+        locations = fileSystem->getFileBlockLocation(path, start, length);
+    FscHdfsFileBlockLocationArrayC *out = new FscHdfsFileBlockLocationArrayC();
+    out->size = locations.size();
+    out->fblVec = new void *[out->size];
+    for (int idx = 0; idx < out->size; ++idx) {
+      FscHdfsFileBlockLocationC *wrapper = new FscHdfsFileBlockLocationC();
+      dbcommon::FileBlockLocation *copy = new dbcommon::FileBlockLocation();
+      wrapper->bl = copy;
+      out->fblVec[idx] = wrapper;
+
+      const dbcommon::FileBlockLocation &src = *locations[idx];
+      copy->corrupt = src.corrupt;
+      copy->length = src.length;
+      copy->offset = src.offset;
+      copy->hosts = src.hosts;
+      copy->names = src.names;
+      copy->ports = src.ports;
+      copy->topoPaths = src.topoPaths;
+    }
+    return out;
+  } catch (dbcommon::TransactionAbortException &ex) {
+    FscHdfsSetError(&fs->error, ex.errCode(), ex.what());
+    return nullptr;
+  }
+}
+
+// Forwards size to the file system's setBlockSize(). No exception handling is
+// done here.
+void FscHdfsSetFileSystemBlockSize(FscHdfsFileSystemC *fs, int size) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  fileSystem->setBlockSize(size);
+}
+
+// Returns the value of the file system's getBlockSize(). No exception
+// handling is done here.
+int FscHdfsGetFileSystemBlockSize(FscHdfsFileSystemC *fs) {
+  auto *fileSystem = static_cast<dbcommon::FileSystem *>(fs->fs);
+  return fileSystem->getBlockSize();
+}
+
+// Returns the name node address of the HDFS file system behind fs as a C
+// string. The handle is cast to dbcommon::HdfsFileSystem without a type check.
+// NOTE(review): the returned pointer comes from c_str() on the result of
+// getFileSystemNameNodeAddr(); if that accessor returns std::string by value
+// rather than by reference, the pointer dangles as soon as this function
+// returns — confirm against the HdfsFileSystem declaration.
+const char *FscHdfsGetFileSystemAddress(FscHdfsFileSystemC *fs) {
+ FETCH_HDFS_FILE_SYSTEM_HANDLE(fs, ifs)
+ return ifs->getFileSystemNameNodeAddr().c_str();
+}
+
+// Returns getFileSystemPort() of the HDFS file system behind fs. The handle
+// is cast to dbcommon::HdfsFileSystem without a type check.
+uint16_t FscHdfsGetFileSystemPort(FscHdfsFileSystemC *fs) {
+  auto *hdfs = static_cast<dbcommon::HdfsFileSystem *>(fs->fs);
+  return hdfs->getFileSystemPort();
+}
+
+int FscHdfsHasErrorRaised(FscHdfsFileSystemC *fs) {
+ FETCH_FILE_SYSTEM_HANDLE(fs, ifs)
+ return fs->error.errCode != 0 ? 1 : 0;
+}
+
+void FscHdfsFreeFileSystemC(FscHdfsFileSystemC **fs) {
+ if (*fs == nullptr) return;
+ FscHdfsFreeErrorContent(&((*fs)->error));
+ delete *fs;
+ *fs = nullptr;
+}
+
+// Deletes the dbcommon::File owned by *f, then the handle itself, and sets
+// *f to nullptr. A handle that is already nullptr is ignored.
+void FscHdfsFreeFileC(FscHdfsFileC **f) {
+  FscHdfsFileC *handle = *f;
+  if (handle != nullptr) {
+    delete static_cast<dbcommon::File *>(handle->f);
+    delete handle;
+    *f = nullptr;
+  }
+}
+
+// Releases a file-info array: every wrapped dbcommon::FileInfo, every
+// FscHdfsFileInfoC wrapper, the fiVec pointer array allocated with new[] in
+// FscHdfsDirPath (previously leaked), and finally the array struct. Sets
+// *fiArray to nullptr; an array that is already nullptr is ignored.
+void FscHdfsFreeFileInfoArrayC(FscHdfsFileInfoArrayC **fiArray) {
+  if (*fiArray == nullptr) return;
+  for (int i = 0; i < (*fiArray)->size; ++i) {
+    FscHdfsFileInfoC *pfic =
+        static_cast<FscHdfsFileInfoC *>((*fiArray)->fiVec[i]);
+    dbcommon::FileInfo *pfi = static_cast<dbcommon::FileInfo *>(pfic->fi);
+    delete pfi;
+    delete pfic;
+  }
+  delete[] (*fiArray)->fiVec;
+  delete *fiArray;
+  *fiArray = nullptr;
+}
+
+// Releases a block-location array: every wrapped dbcommon::FileBlockLocation,
+// every FscHdfsFileBlockLocationC wrapper, the fblVec pointer array allocated
+// with new[] in FscHdfsGetPathFileBlockLocation (previously leaked), and
+// finally the array struct. Sets *fblArray to nullptr; an array that is
+// already nullptr is ignored.
+void FscHdfsFreeFileBlockLocationArrayC(
+    FscHdfsFileBlockLocationArrayC **fblArray) {
+  if (*fblArray == nullptr) return;
+  for (int i = 0; i < (*fblArray)->size; ++i) {
+    FscHdfsFileBlockLocationC *fblc =
+        static_cast<FscHdfsFileBlockLocationC *>((*fblArray)->fblVec[i]);
+    dbcommon::FileBlockLocation *fbl =
+        static_cast<dbcommon::FileBlockLocation *>(fblc->bl);
+    delete fbl;
+    delete fblc;
+  }
+  delete[] (*fblArray)->fblVec;
+  delete *fblArray;
+  *fblArray = nullptr;
+}
+
+// Releases the message stored in *ce and clears the pointer so that a second
+// call (for example from FscHdfsFreeFileSystemC after the caller already
+// freed the content) cannot delete the same buffer twice. errCode is left
+// untouched.
+void FscHdfsFreeErrorContent(CatchedError *ce) {
+  assert(ce != nullptr);
+  delete[] ce->errMessage; /* delete[] on nullptr is a no-op */
+  ce->errMessage = nullptr;
+}
+
+// Returns the entry at index of fia, or nullptr when index lies outside
+// [0, fia->size). The array keeps ownership of the returned handle.
+FscHdfsFileInfoC *FscHdfsGetFileInfoFromArray(FscHdfsFileInfoArrayC *fia,
+                                              int index) {
+  const bool inRange = index >= 0 && index < fia->size;
+  return inRange ? static_cast<FscHdfsFileInfoC *>(fia->fiVec[index])
+                 : nullptr;
+}
+
+// Returns the name field of the wrapped FileInfo as a C string; the pointer
+// refers to storage inside that FileInfo.
+const char *FscHdfsGetFileInfoName(FscHdfsFileInfoC *fi) {
+  auto *info = static_cast<dbcommon::FileInfo *>(fi->fi);
+  return info->name.c_str();
+}
+
+// Returns the size field of the wrapped FileInfo.
+int64_t FscHdfsGetFileInfoLength(FscHdfsFileInfoC *fi) {
+  auto *info = static_cast<dbcommon::FileInfo *>(fi->fi);
+  return info->size;
+}
+
+// Returns the entry at index of bla, or nullptr when index lies outside
+// [0, bla->size). The array keeps ownership of the returned handle.
+FscHdfsFileBlockLocationC *FscHdfsGetFileBlockLocationFromArray(
+    FscHdfsFileBlockLocationArrayC *bla, int index) {
+  const bool inRange = index >= 0 && index < bla->size;
+  return inRange ? static_cast<FscHdfsFileBlockLocationC *>(bla->fblVec[index])
+                 : nullptr;
+}
+
+// Returns the number of entries stored in bla.
+int FscHdfsGetFileBlockLocationArraySize(FscHdfsFileBlockLocationArrayC *bla) {
+  const int count = bla->size;
+  return count;
+}
+
+// Returns the number of hosts listed for the block location.
+int FscHdfsGetFileBlockLocationNNodes(FscHdfsFileBlockLocationC *bl) {
+  auto *location = static_cast<dbcommon::FileBlockLocation *>(bl->bl);
+  return location->hosts.size();
+}
+
+// Returns the offset field of the block location.
+int64_t FscHdfsGetFileBlockLocationOffset(FscHdfsFileBlockLocationC *bl) {
+  auto *location = static_cast<dbcommon::FileBlockLocation *>(bl->bl);
+  return location->offset;
+}
+// Returns the length field of the block location.
+int64_t FscHdfsGetFileBlockLocationLength(FscHdfsFileBlockLocationC *bl) {
+  auto *location = static_cast<dbcommon::FileBlockLocation *>(bl->bl);
+  return location->length;
+}
+
+// Returns the corrupt field of the block location converted to int.
+int FscHdfsGetFileBlockLocationCorrupt(FscHdfsFileBlockLocationC *bl) {
+  auto *location = static_cast<dbcommon::FileBlockLocation *>(bl->bl);
+  return location->corrupt;
+}
+
+// Returns the host at index for the block location, or nullptr when index is
+// outside the hosts list (previously an unchecked, undefined access). The
+// pointer refers to storage owned by the block location.
+const char *FscHdfsGetFileBlockLocationNodeHost(FscHdfsFileBlockLocationC *bl,
+                                                int index) {
+  FETCH_FILE_BLOCK_LOCATION_HANDLE(bl, ibl)
+  if (index < 0 || static_cast<size_t>(index) >= ibl->hosts.size()) {
+    return nullptr;
+  }
+  return ibl->hosts[index].c_str();
+}
+// Returns the node name at index for the block location, or nullptr when
+// index is outside the names list (previously an unchecked, undefined
+// access). The pointer refers to storage owned by the block location.
+const char *FscHdfsGetFileBlockLocationNodeName(FscHdfsFileBlockLocationC *bl,
+                                                int index) {
+  FETCH_FILE_BLOCK_LOCATION_HANDLE(bl, ibl)
+  if (index < 0 || static_cast<size_t>(index) >= ibl->names.size()) {
+    return nullptr;
+  }
+  return ibl->names[index].c_str();
+}
+// Returns the topology path at index for the block location, or nullptr when
+// index is outside the topoPaths list (previously an unchecked, undefined
+// access). The pointer refers to storage owned by the block location.
+const char *FscHdfsGetFileBlockLocationNodeTopoPath(
+    FscHdfsFileBlockLocationC *bl, int index) {
+  FETCH_FILE_BLOCK_LOCATION_HANDLE(bl, ibl)
+  if (index < 0 || static_cast<size_t>(index) >= ibl->topoPaths.size()) {
+    return nullptr;
+  }
+  return ibl->topoPaths[index].c_str();
+}
+
+// Releases a string that was allocated with new[] and sets *pstr to nullptr.
+void FscHdfsFreeString(char **pstr) {
+  char *owned = *pstr;
+  delete[] owned;
+  *pstr = nullptr;
+}
+
+CatchedError *FscHdfsGetFileSystemError(FscHdfsFileSystemC *fs) {
+ return &(fs->error);
+}
+
+// Registers token under tokenkey in FSManager's token map. The call is a
+// no-op when either pointer is null; previously only token was checked, so a
+// null tokenkey was passed to the std::string constructor (undefined
+// behaviour).
+void SetToken(const char *tokenkey, const char *token) {
+  if (token && tokenkey) {
+    std::string Token(token);
+    std::string TokenKey(tokenkey);
+    FSManager.setTokenMap(TokenKey, Token);
+  }
+}
+
+// Passes ccname to FSManager.setCcname(); a null ccname is ignored.
+void SetCcname(const char *ccname) {
+  if (ccname == nullptr) return;
+  std::string name(ccname);
+  FSManager.setCcname(name);
+}
+// Clears FSManager's file-system map first and its token map second.
+void cleanup_FSManager() {
+  auto &manager = FSManager;
+  manager.clearFsMap();
+  manager.clearFsTokenMap();
+}
+}
diff --git a/depends/storage/src/storage/cwrapper/hdfs-file-system-c.h b/depends/storage/src/storage/cwrapper/hdfs-file-system-c.h
new file mode 100644
index 0000000..13707c4
--- /dev/null
+++ b/depends/storage/src/storage/cwrapper/hdfs-file-system-c.h
@@ -0,0 +1,126 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+///////////////////////////////////////////////////////////////////////////////
+
+#ifndef STORAGE_SRC_STORAGE_CWRAPPER_HDFS_FILE_SYSTEM_C_H_
+#define STORAGE_SRC_STORAGE_CWRAPPER_HDFS_FILE_SYSTEM_C_H_
+
+#include <fcntl.h>
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct FscHdfsFileC;
+struct FscHdfsFileInfoC;
+struct FscHdfsFileInfoArrayC;
+struct FscHdfsFileBlockLocationC;
+struct FscHdfsFileBlockLocationArrayC;
+struct FscHdfsFileSystemC;
+
+typedef struct FscHdfsFileC FscHdfsFileC;
+typedef struct FscHdfsFileInfoC FscHdfsFileInfoC;
+typedef struct FscHdfsFileInfoArrayC FscHdfsFileInfoArrayC;
+typedef struct FscHdfsFileBlockLocationC FscHdfsFileBlockLocationC;
+typedef struct FscHdfsFileBlockLocationArrayC FscHdfsFileBlockLocationArrayC;
+typedef struct FscHdfsFileSystemC FscHdfsFileSystemC;
+
+// Last error recorded for a file system handle.
+// errCode is 0 when no error is recorded (see FscHdfsHasErrorRaised).
+// errMessage is a heap buffer managed by FscHdfsSetError and
+// FscHdfsFreeErrorContent; it may be NULL.
+typedef struct CatchedError {
+ int errCode;
+ char *errMessage;
+} CatchedError;
+
+// Set error
+void FscHdfsSetError(CatchedError *ce, int errCode, const char *reason);
+CatchedError *FscHdfsGetFileSystemError(FscHdfsFileSystemC *fs);
+
+// File APIs
+void FscHdfsCloseFileC(FscHdfsFileSystemC *fs, FscHdfsFileC *f);
+
+// File system APIs
+FscHdfsFileSystemC *FscHdfsNewFileSystem(const char *namenode, uint16_t port);
+
+FscHdfsFileC *FscHdfsOpenFile(FscHdfsFileSystemC *fs, const char *path,
+ int flags);
+void FscHdfsSeekFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f, uint64_t offset);
+void FscHdfsRemovePath(FscHdfsFileSystemC *fs, const char *path);
+void FscHdfsRemovePathIfExists(FscHdfsFileSystemC *fs, const char *path);
+FscHdfsFileInfoC *FscHdfsGetFileInfo(FscHdfsFileSystemC *fs,
+ const char *fileName);
+int FscHdfsExistPath(FscHdfsFileSystemC *fs, const char *path);
+int64_t FscHdfsGetFileLength(FscHdfsFileSystemC *fs, const char *path);
+// Defined in hdfs-file-system-c.cc but previously missing from this header;
+// returns 'U' when the lookup fails.
+char FscHdfsGetFileKind(FscHdfsFileSystemC *fs, const char *path);
+int FscHdfsReadFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f, void *buf,
+ int size);
+void FscHdfsWriteFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f, void *buf,
+ int size);
+void FscHdfsCreateDir(FscHdfsFileSystemC *fs, const char *path);
+// this is a special interface to create dir with a hidden ".tmp" subdir
+int FscHdfsExistInsertPath(FscHdfsFileSystemC *fs, const char *path);
+void FscHdfsCreateInsertDir(FscHdfsFileSystemC *fs, const char *path);
+FscHdfsFileInfoArrayC *FscHdfsDirPath(FscHdfsFileSystemC *fs, const char *path);
+void FscHdfsChmodPath(FscHdfsFileSystemC *fs, const char *path, int mode);
+int64_t FscHdfsTellFile(FscHdfsFileSystemC *fs, FscHdfsFileC *f);
+FscHdfsFileBlockLocationArrayC *FscHdfsGetPathFileBlockLocation(
+ FscHdfsFileSystemC *fs, const char *path, int64_t start, int64_t length);
+void FscHdfsSetFileSystemBlockSize(FscHdfsFileSystemC *fs, int size);
+int FscHdfsGetFileSystemBlockSize(FscHdfsFileSystemC *fs);
+
+const char *FscHdfsGetFileSystemAddress(FscHdfsFileSystemC *fs);
+uint16_t FscHdfsGetFileSystemPort(FscHdfsFileSystemC *fs);
+
+FscHdfsFileInfoC *FscHdfsGetFileInfoFromArray(FscHdfsFileInfoArrayC *fia,
+ int index);
+const char *FscHdfsGetFileInfoName(FscHdfsFileInfoC *fi);
+int64_t FscHdfsGetFileInfoLength(FscHdfsFileInfoC *fi);
+
+FscHdfsFileBlockLocationC *FscHdfsGetFileBlockLocationFromArray(
+ FscHdfsFileBlockLocationArrayC *bla, int index);
+
+int FscHdfsGetFileBlockLocationArraySize(FscHdfsFileBlockLocationArrayC *bla);
+
+int FscHdfsGetFileBlockLocationNNodes(FscHdfsFileBlockLocationC *bl);
+int64_t FscHdfsGetFileBlockLocationOffset(FscHdfsFileBlockLocationC *bl);
+int64_t FscHdfsGetFileBlockLocationLength(FscHdfsFileBlockLocationC *bl);
+int FscHdfsGetFileBlockLocationCorrupt(FscHdfsFileBlockLocationC *bl);
+const char *FscHdfsGetFileBlockLocationNodeHost(FscHdfsFileBlockLocationC *bl,
+ int index);
+const char *FscHdfsGetFileBlockLocationNodeName(FscHdfsFileBlockLocationC *bl,
+ int index);
+const char *FscHdfsGetFileBlockLocationNodeTopoPath(
+ FscHdfsFileBlockLocationC *bl, int index);
+
+int FscHdfsHasErrorRaised(FscHdfsFileSystemC *fs);
+
+// Additional free/delete APIs that release memory handed out by this
+// interface. Each function taking a pointer-to-pointer sets the caller's
+// pointer to NULL after releasing it.
+void FscHdfsFreeFileSystemC(FscHdfsFileSystemC **fs);
+void FscHdfsFreeFileC(FscHdfsFileC **f);
+void FscHdfsFreeFileInfoArrayC(FscHdfsFileInfoArrayC **fiArray);
+void FscHdfsFreeFileBlockLocationArrayC(
+    FscHdfsFileBlockLocationArrayC **fblArray);
+void FscHdfsFreeErrorContent(CatchedError *ce);
+// Defined in hdfs-file-system-c.cc but previously missing from this header.
+void FscHdfsFreeString(char **pstr);
+void SetToken(const char *tokenkey, const char *token);
+void SetCcname(const char *ccname);
+// (void) makes this a real prototype when the header is compiled as C.
+void cleanup_FSManager(void);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STORAGE_SRC_STORAGE_CWRAPPER_HDFS_FILE_SYSTEM_C_H_
diff --git a/depends/storage/src/storage/cwrapper/orc-format-c.cc b/depends/storage/src/storage/cwrapper/orc-format-c.cc
new file mode 100644
index 0000000..c215f96
--- /dev/null
+++ b/depends/storage/src/storage/cwrapper/orc-format-c.cc
@@ -0,0 +1,638 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/cwrapper/orc-format-c.h"
+
+#include <uuid/uuid.h>
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "dbcommon/common/vector-transformer.h"
+#include "dbcommon/common/vector/decimal-vector.h"
+#include "dbcommon/common/vector/timestamp-vector.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/function/decimal-function.h"
+#include "dbcommon/function/typecast-func.cg.h"
+#include "dbcommon/type/date.h"
+#include "dbcommon/type/decimal.h"
+#include "dbcommon/type/type-kind.h"
+#include "dbcommon/utils/global.h"
+#include "dbcommon/utils/url.h"
+
+#include "storage/format/format.h"
+#include "storage/format/orc/orc-format.h"
+
+#include "univplan/univplanbuilder/univplanbuilder-scan-task.h"
+
+// Constants describing the numeric layout used with NumericTransData below.
+// NOTE(review): names and values look like they mirror PostgreSQL's numeric
+// representation (sign flags, 4 decimal digits per stored digit, display
+// scale mask) — confirm against the consumer of these values.
+#define NUMERIC_POS 0x0000
+#define NUMERIC_NEG 0x4000
+#define DEC_DIGITS 4
+#define NUMERIC_DSCALE_MASK 0x3FF
+// Combined size of the three fixed-width header fields of NumericTransData.
+#define NUMERIC_HDRSZ (sizeof(int32_t) + sizeof(uint16_t) + sizeof(int16_t))
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+static void ORCFormatSetErrorORCFormatC(ORCFormatCatchedError *ce, int errCode,
+ const char *errMsg);
+
+// Per-column read state for one column selected in ORCFormatBeginORCFormatC.
+// valBuffer is allocated only for string, binary, timestamp and decimal
+// columns; it is nullptr for every other type.
+// NOTE(review): value/nulls/lens presumably point into the current tuple
+// batch — they are not assigned in the code visible here; confirm.
+typedef struct OrcColumnReader {
+ dbcommon::TypeKind type;
+ const char *value;
+ const bool *nulls;
+ const uint64_t *lens;
+ std::unique_ptr<dbcommon::ByteBuffer> valBuffer;
+} OrcColumnReader;
+
+// State behind the opaque ORCFormatC handle given to C callers: the
+// storage::ORCFormat instance plus everything the scan and insert entry
+// points in this file keep between calls.
+struct ORCFormatC {
+ std::unique_ptr<storage::ORCFormat> orcFormat; // NOLINT
+ // URL of the hidden insert directory; set in ORCFormatBeginInsertORCFormatC.
+ dbcommon::URL::uptr url;
+ // Holds "table.options"; its address is passed to the ORCFormat constructor.
+ dbcommon::Parameters params;
+ // Column names, types and type modifiers added by the Begin* functions.
+ dbcommon::TupleDesc desc;
+ // Last error recorded by ORCFormatSetErrorORCFormatC.
+ ORCFormatCatchedError error;
+ // One flag per table column, filled by ORCFormatBeginORCFormatC.
+ std::vector<bool> columnsToRead;
+ univplan::UnivPlanScanFileSplitListList splits;
+ // Batch being filled by inserts; reset to nullptr by the Begin* functions.
+ dbcommon::TupleBatch::uptr tb;
+ // "/<segno + 1>_" at creation; a UUID is appended when an insert begins.
+ std::string insertFileName;
+
+ bool needNewTupleBatch;
+ uint64_t rowRead;
+ uint64_t rowCount;
+ // Readers for the selected columns, parallel to colToReadIds.
+ std::vector<std::unique_ptr<OrcColumnReader>> columnReaders;
+ // Table column index of each entry in columnReaders.
+ std::vector<uint32_t> colToReadIds;
+};
+
+// Variable-length numeric value: three fixed header fields (NUMERIC_HDRSZ
+// bytes in total) followed by the digit array. `digits[0]` is a zero-length
+// trailing array, which is a compiler extension rather than standard C++.
+typedef struct NumericTransData {
+ int32_t varlen; // total size counted in byte
+ int16_t weight; // size of integral part, counted in int16_t
+ uint16_t sign_dscale; // sign and scale
+ int16_t digits[0];
+} NumericTransData;
+
+// Creates a new ORC format handle.
+//
+// tableOptions is stored under the "table.options" parameter and handed to
+// the storage::ORCFormat constructor; segno seeds the insert file name as
+// "/<segno + 1>_". The handle starts with no tuple batch, no URL and a
+// success error code. The caller releases it with ORCFormatFreeORCFormatC.
+// NOTE(review): nothing here catches exceptions, so a throwing ORCFormat
+// constructor would propagate into the C caller — confirm it cannot throw.
+ORCFormatC *ORCFormatNewORCFormatC(const char *tableOptions, int segno) {
+  ORCFormatC *instance = new ORCFormatC();
+
+  instance->params.set("table.options", tableOptions);
+  instance->orcFormat.reset(new storage::ORCFormat(&(instance->params)));
+  instance->orcFormat->setFileSystemManager(&FSManager);
+  instance->tb = nullptr;
+  instance->url = nullptr;
+  instance->error.errCode = ERRCODE_SUCCESSFUL_COMPLETION;
+  instance->insertFileName = "/" + std::to_string(segno + 1) + "_";
+  return instance;
+}
+
+// Prepares fmt for scanning.
+//
+// For each of the numColumns columns the name, type and type modifier are
+// added to fmt->desc and the read flag to fmt->columnsToRead. Every column
+// flagged in columnsToRead also gets an OrcColumnReader and its index is
+// appended to fmt->colToReadIds. All numSplits splits are then packed into a
+// single scan task and beginScan() is called. A TransactionAbortException is
+// recorded in fmt->error instead of being propagated.
+// NOTE(review): desc, columnsToRead, columnReaders and splits are appended to
+// without being cleared, so this appears to assume one call per handle —
+// confirm.
+void ORCFormatBeginORCFormatC(ORCFormatC *fmt, ORCFormatFileSplit *splits,
+ int numSplits, bool *columnsToRead,
+ char **columnName, int *columnDatatype,
+ uint64_t *columnDatatypeMod, int numColumns) {
+ try {
+ fmt->tb = nullptr;
+ fmt->needNewTupleBatch = true;
+ for (int i = 0; i < numColumns; ++i) {
+ fmt->columnsToRead.push_back(columnsToRead[i]);
+ fmt->desc.add(columnName[i],
+ (static_cast<dbcommon::TypeKind>(columnDatatype[i])),
+ columnDatatypeMod[i]);
+ if (columnsToRead[i]) {
+ std::unique_ptr<OrcColumnReader> columnReader(new OrcColumnReader);
+ columnReader->type = static_cast<dbcommon::TypeKind>(columnDatatype[i]);
+ // Only these types get a pre-reserved value buffer.
+ switch (columnReader->type) {
+ case dbcommon::TypeKind::STRINGID:
+ case dbcommon::TypeKind::CHARID:
+ case dbcommon::TypeKind::VARCHARID:
+ case dbcommon::TypeKind::BINARYID:
+ case dbcommon::TypeKind::TIMESTAMPID:
+ case dbcommon::TypeKind::TIMESTAMPTZID:
+ case dbcommon::TypeKind::DECIMALID:
+ columnReader->valBuffer.reset(new dbcommon::ByteBuffer(true));
+ columnReader->valBuffer->reserve(DEFAULT_RESERVED_SIZE_OF_STRING *
+ DEFAULT_NUMBER_TUPLES_PER_BATCH);
+ break;
+ default:
+ columnReader->valBuffer = nullptr;
+ break;
+ }
+ fmt->columnReaders.push_back(std::move(columnReader));
+ fmt->colToReadIds.push_back(i);
+ }
+ }
+
+ // create one scan task to contain all splits
+ univplan::UnivPlanBuilderScanTask scanTaskBld;
+ // add all splits into scan task
+ for (int j = 0; j < numSplits; ++j) {
+ scanTaskBld.addScanFileSplit(splits[j].fileName, splits[j].start,
+ splits[j].len, -1, -1); // no rangeid, rgid
+ }
+ // build scan task by transferring tb from this builder to fmt instance
+ std::unique_ptr<univplan::UnivPlanScanFileSplitListTb> newScanTask(
+ new univplan::UnivPlanScanFileSplitListTb(
+ std::move(scanTaskBld.releaseSplitsTb())));
+ fmt->splits.push_back(std::move(newScanTask));
+
+ fmt->orcFormat->beginScan(&(fmt->splits), &(fmt->desc),
+ &(fmt->columnsToRead), nullptr, nullptr, false);
+ } catch (dbcommon::TransactionAbortException &e) {
+ ORCFormatSetErrorORCFormatC(&(fmt->error), e.errCode(), e.what());
+ }
+}
+
+// Calls reScan() on the wrapped ORC format. A TransactionAbortException is
+// recorded in fmt->error.
+void ORCFormatRescanORCFormatC(ORCFormatC *fmt) {
+  try {
+    storage::ORCFormat *format = fmt->orcFormat.get();
+    format->reScan();
+  } catch (dbcommon::TransactionAbortException &ex) {
+    ORCFormatSetErrorORCFormatC(&fmt->error, ex.errCode(), ex.what());
+  }
+}
+
+// Calls endScan() on the wrapped ORC format. A TransactionAbortException is
+// recorded in fmt->error.
+void ORCFormatEndORCFormatC(ORCFormatC *fmt) {
+  try {
+    storage::ORCFormat *format = fmt->orcFormat.get();
+    format->endScan();
+  } catch (dbcommon::TransactionAbortException &ex) {
+    ORCFormatSetErrorORCFormatC(&fmt->error, ex.errCode(), ex.what());
+  }
+}
+
+// Prepares fmt for inserting rows.
+//
+// Adds the numColumns column descriptions to fmt->desc, points fmt->url at
+// dirFullPath + INSERT_HIDDEN_DIR, raises an error when that directory does
+// not exist, appends a time-based UUID to fmt->insertFileName and starts the
+// insert on <raw url> + insertFileName. A TransactionAbortException is
+// recorded in fmt->error.
+void ORCFormatBeginInsertORCFormatC(ORCFormatC *fmt, const char *dirFullPath,
+                                    char **columnName, int *columnDatatype,
+                                    uint64_t *columnDatatypeMod,
+                                    int numColumns) {
+  try {
+    fmt->tb = nullptr;
+    for (int col = 0; col < numColumns; ++col) {
+      const dbcommon::TypeKind kind =
+          static_cast<dbcommon::TypeKind>(columnDatatype[col]);
+      fmt->desc.add(columnName[col], kind, columnDatatypeMod[col]);
+    }
+
+    std::string hiddenDir(dirFullPath);
+    hiddenDir += INSERT_HIDDEN_DIR;
+    fmt->url.reset(new dbcommon::URL(hiddenDir));
+    dbcommon::FileSystem *fileSystem = FSManager.get(dirFullPath);
+    std::string targetPath = fmt->url->getPath();
+    std::string targetRawPath = fmt->url->getRawString();
+    if (!fileSystem->exists(targetPath.c_str())) {
+      LOG_ERROR(ERRCODE_DATA_EXCEPTION, "no data directory found: %s",
+                targetPath.c_str());
+    }
+
+    // Generate filename for current insertion.
+    uuid_t id;
+    char text[1024];
+    uuid_generate_time(id);
+    uuid_unparse(id, text);
+    fmt->insertFileName.append(text, strlen(text));
+
+    fmt->orcFormat->beginInsert(targetRawPath + fmt->insertFileName, fmt->desc);
+  } catch (dbcommon::TransactionAbortException &ex) {
+    ORCFormatSetErrorORCFormatC(&fmt->error, ex.errCode(), ex.what());
+  }
+}
+
+// Appends one row to the pending tuple batch of fmt.
+//
+// Arrays are indexed by column: datatypes[i] selects how values[i] is
+// appended, isNull[i] marks a NULL value, and lens/nullBitmap/dims are used
+// only for the array types. The batch is created on first use and handed to
+// doInsert() once it holds storage::Format::kTuplesPerBatch rows. A
+// TransactionAbortException is recorded in fmt->error.
+void ORCFormatInsertORCFormatC(ORCFormatC *fmt, int *datatypes, char **values,
+ uint64_t *lens, unsigned char **nullBitmap,
+ int32_t **dims, bool *isNull) {
+ try {
+ if (fmt->tb == nullptr)
+ fmt->tb.reset(new dbcommon::TupleBatch(fmt->desc, true));
+
+ dbcommon::TupleBatchWriter &writers = fmt->tb->getTupleBatchWriter();
+ int natts = fmt->desc.getNumOfColumns();
+
+ for (int i = 0; i < natts; ++i) {
+ dbcommon::TypeKind datatype =
+ (static_cast<dbcommon::TypeKind>(datatypes[i]));
+ switch (datatype) {
+ // Fixed-width types: the byte width is passed explicitly.
+ case dbcommon::TypeKind::BOOLEANID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]), sizeof(bool),
+ isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::TINYINTID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]),
+ sizeof(int8_t), isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::SMALLINTID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]),
+ sizeof(int16_t), isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::INTID:
+ case dbcommon::TypeKind::DATEID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]),
+ sizeof(int32_t), isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::BIGINTID:
+ case dbcommon::TypeKind::TIMEID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]),
+ sizeof(int64_t), isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::FLOATID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]), sizeof(float),
+ isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::DOUBLEID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]),
+ sizeof(double), isNull[i]);
+ break;
+
+ // Text-like and decimal values use the append overload without a length.
+ case dbcommon::TypeKind::CHARID:
+ case dbcommon::TypeKind::VARCHARID:
+ case dbcommon::TypeKind::STRINGID:
+ case dbcommon::TypeKind::BINARYID:
+ case dbcommon::TypeKind::DECIMALID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]), isNull[i]);
+ break;
+
+ // Timestamps are appended as two consecutive int64_t values.
+ case dbcommon::TypeKind::TIMESTAMPID:
+ case dbcommon::TypeKind::TIMESTAMPTZID:
+ writers[i]->append(reinterpret_cast<char *>(values[i]),
+ sizeof(int64_t) + sizeof(int64_t), isNull[i]);
+ break;
+
+ case dbcommon::TypeKind::SMALLINTARRAYID:
+ case dbcommon::TypeKind::INTARRAYID:
+ case dbcommon::TypeKind::BIGINTARRAYID:
+ case dbcommon::TypeKind::FLOATARRAYID:
+ case dbcommon::TypeKind::DOUBLEARRAYID: {
+ dbcommon::ListVector *lwriter =
+ reinterpret_cast<dbcommon::ListVector *>(writers[i].get());
+ lwriter->append(reinterpret_cast<char *>(values[i]), lens[i],
+ nullBitmap[i], dims[i], isNull[i], true);
+ break;
+ }
+ // NOTE(review): no break here; this falls through to default unless
+ // LOG_ERROR does not return (presumably it throws) — confirm.
+ case dbcommon::TypeKind::INVALIDTYPEID:
+ LOG_ERROR(ERRCODE_DATA_EXCEPTION, "data type with id %d is invalid",
+ static_cast<int>(datatype));
+
+ default:
+ LOG_ERROR(ERRCODE_DATA_EXCEPTION,
+ "data type with id %d is not supported yet",
+ static_cast<int>(datatype));
+ break;
+ }
+ }
+
+ fmt->tb->incNumOfRows(1);
+ // Flush a full batch and start a fresh one on the next call.
+ if (fmt->tb->getNumOfRows() >= storage::Format::kTuplesPerBatch) {
+ fmt->orcFormat->doInsert(std::move(fmt->tb));
+ fmt->tb = nullptr;
+ }
+ } catch (dbcommon::TransactionAbortException &e) {
+ ORCFormatSetErrorORCFormatC(&(fmt->error), e.errCode(), e.what());
+ }
+}
+
+// Finish an insert: hand any rows still buffered in fmt->tb to the ORC writer,
+// end the insert, then rename the written file from
+// <url path><insertFileName> to <url path>/..<insertFileName>.
+// A dbcommon::TransactionAbortException is not propagated; it is recorded in
+// fmt->error instead.
+void ORCFormatEndInsertORCFormatC(ORCFormatC *fmt) {
+  try {
+    // Flush the last, partially filled batch (if there is one)
+    if (fmt->tb) {
+      fmt->orcFormat->doInsert(std::move(fmt->tb));  // NOLINT
+    }
+    fmt->orcFormat->endInsert();
+
+    dbcommon::FileSystem *fs = FSManager.get(fmt->url->getRawString());
+    const auto writtenPath = fmt->url->getPath() + fmt->insertFileName;
+    const auto finalPath = fmt->url->getPath() + "/.." + fmt->insertFileName;
+    fs->rename(writtenPath.c_str(), finalPath.c_str());
+  } catch (dbcommon::TransactionAbortException &e) {
+    ORCFormatSetErrorORCFormatC(&(fmt->error), e.errCode(), e.what());
+  }
+}
+
+// Destroy the wrapper object *fmt points to and reset *fmt to nullptr.
+// Nothing happens when *fmt is already nullptr.
+void ORCFormatFreeORCFormatC(ORCFormatC **fmt) {
+  if (*fmt != nullptr) {
+    delete *fmt;
+    *fmt = nullptr;
+  }
+}
+
+// Return a pointer to the error slot stored inside `fmt`. The wrapper
+// functions in this file fill that slot when they catch a
+// dbcommon::TransactionAbortException. The pointer refers to a member of
+// `fmt`, so it is only usable while `fmt` is alive.
+ORCFormatCatchedError *ORCFormatGetErrorORCFormatC(ORCFormatC *fmt) {
+ return &(fmt->error);
+}
+
+// Store an error code and message in `ce`.
+// @param ce The error slot to fill; must not be nullptr
+// @param errCode The error code to record
+// @param errMsg The message to record; nullptr is stored as an empty message.
+//        Messages longer than the errMessage buffer are truncated.
+void ORCFormatSetErrorORCFormatC(ORCFormatCatchedError *ce, int errCode,
+                                 const char *errMsg) {
+  assert(ce != nullptr);
+  ce->errCode = errCode;
+  // The size limit must be the capacity of the destination buffer, not the
+  // length of the source: snprintf then truncates long messages and still
+  // NUL-terminates instead of writing past the end of errMessage.
+  snprintf(ce->errMessage, sizeof(ce->errMessage), "%s",
+           errMsg != nullptr ? errMsg : "");
+}
+
+// Flatten a string/binary column into reader->valBuffer. For every non-null
+// row among the first fmt->rowCount rows the buffer receives the value length
+// as a uint32_t followed by the value bytes; null rows add nothing.
+// Afterwards reader->lens points at the vector's length array, reader->nulls
+// at its null flags (nullptr when the vector has no nulls) and reader->value
+// at the start of the buffer.
+static void textRelatedGetValueBuffer(ORCFormatC *fmt, dbcommon::BytesVector *v,
+                                      OrcColumnReader *reader) {
+  const bool anyNull = v->hasNullValue();
+  const uint64_t *lengths = v->getLengths();
+  const char **ptrs = v->getValPtrs();
+
+  reader->valBuffer->clear();
+  reader->lens = lengths;
+
+  const bool *nullFlags = anyNull ? v->getNullBuffer()->getBools() : nullptr;
+  for (uint64_t row = 0; row < fmt->rowCount; ++row) {
+    if (nullFlags != nullptr && nullFlags[row]) continue;
+    uint32_t len = lengths[row];
+    reader->valBuffer->append(len);
+    reader->valBuffer->append(ptrs[row], len);
+  }
+
+  reader->nulls = nullFlags;
+  reader->value = reader->valBuffer->data();
+}
+
+// Flatten a timestamp column into reader->valBuffer as one int64_t per
+// non-null row, computed as
+//   (second - TIMESTAMP_EPOCH_JDATE) * 1000000 + nanosecond / 1000.
+// Null rows add nothing to the buffer. Afterwards reader->nulls points at the
+// vector's null flags (nullptr when the vector has no nulls) and
+// reader->value at the start of the buffer.
+static void timestampGetValueBuffer(ORCFormatC *fmt,
+                                    dbcommon::TimestampVector *v,
+                                    OrcColumnReader *reader) {
+  const bool anyNull = v->hasNullValue();
+  // NOTE(review): the result of getValPtrs() is never used below; the call is
+  // kept as-is until it is confirmed to be a side-effect-free getter.
+  const char **valPtrs = v->getValPtrs();
+  const int64_t *secs = reinterpret_cast<const int64_t *>(v->getValue());
+  const int64_t *nanos =
+      reinterpret_cast<const int64_t *>(v->getNanoseconds());
+
+  reader->valBuffer->clear();
+
+  const bool *nullFlags = anyNull ? v->getNullBuffer()->getBools() : nullptr;
+  for (uint64_t row = 0; row < fmt->rowCount; ++row) {
+    if (nullFlags != nullptr && nullFlags[row]) continue;
+    int64_t val =
+        (secs[row] - TIMESTAMP_EPOCH_JDATE) * 1000000 + nanos[row] / 1000;
+    reader->valBuffer->append(val);
+  }
+
+  reader->nulls = nullFlags;
+  reader->value = reader->valBuffer->data();
+}
+
+// Convert the decimals of srcVector into NumericTransData records stored back
+// to back in reader->valBuffer. Each record is a header followed by the digits
+// in groups of base 10000, one int16_t per group. Afterwards reader->value
+// points at the start of the buffer and reader->nulls at srcVector's nulls.
+static void decimalGetValueBuffer(dbcommon::DecimalVector *srcVector,
+ OrcColumnReader *reader) {
+ dbcommon::DecimalVectorRawData src(srcVector);
+
+ // Encode the value at plainIdx and append the record to reader->valBuffer
+ auto convertNumericTranData = [&](uint64_t plainIdx) {
+ NumericTransData numeric;
+ dbcommon::Int128 data(src.hightbits[plainIdx], src.lowbits[plainIdx]);
+
+ // Work on the magnitude from here on; the sign is kept in sign_dscale
+ numeric.sign_dscale = NUMERIC_POS;
+ if (data.isNegative()) {
+ numeric.sign_dscale = NUMERIC_NEG;
+ data = data.negate();
+ }
+
+ // Pad zero for fractional part in order to make it counted by int16_t
+ int16_t scaleDigitCount = src.scales[plainIdx];
+ int16_t paddingDigitCount =
+ (DEC_DIGITS - scaleDigitCount % DEC_DIGITS) % DEC_DIGITS;
+ int16_t significantDigitCount = data.getNumOfDigit();
+
+ // True when the value has more significant digits than its scale
+ bool isPaddingSuffix = significantDigitCount > scaleDigitCount;
+ int16_t totalDigitCount = isPaddingSuffix
+ ? significantDigitCount + paddingDigitCount
+ : scaleDigitCount + paddingDigitCount;
+
+ numeric.sign_dscale |= (scaleDigitCount & NUMERIC_DSCALE_MASK);
+ numeric.weight = isPaddingSuffix ? (totalDigitCount - scaleDigitCount -
+ paddingDigitCount + (DEC_DIGITS - 1)) /
+ DEC_DIGITS -
+ 1
+ : -1;
+ // Record size: header plus one int16_t per group of DEC_DIGITS digits
+ numeric.varlen =
+ NUMERIC_HDRSZ +
+ ((totalDigitCount + DEC_DIGITS - 1) / DEC_DIGITS) * sizeof(int16_t);
+
+ // Reserve buffer
+ reader->valBuffer->resize(reader->valBuffer->size() + numeric.varlen);
+
+ // Fill header
+ *reinterpret_cast<NumericTransData *>(reader->valBuffer->tail() -
+ numeric.varlen) = numeric;
+
+ // Fill digits
+ // The groups are written back to front: ptr starts at the buffer tail and
+ // each step stores the current least significant base-10000 group.
+ __int128_t dividend =
+ (__int128_t(data.getHighBits()) << 64) + __int128_t(data.getLowBits());
+ for (int i = 0; i < paddingDigitCount; i++) dividend *= 10;
+ int16_t *ptr = reinterpret_cast<int16_t *>(reader->valBuffer->tail());
+ for (int i = 0; i < (numeric.varlen - NUMERIC_HDRSZ) / sizeof(int16_t);
+ i++) {
+ int16_t remainder = dividend % 10000;
+ *--ptr = remainder;
+ dividend /= 10000;
+ }
+ // After the last group ptr must sit right behind the record header
+ assert(reinterpret_cast<char *>(ptr) ==
+ reader->valBuffer->tail() - numeric.varlen + NUMERIC_HDRSZ);
+ };
+ reader->valBuffer->clear();
+ // NOTE(review): presumably invokes the lambda once per selected, non-null
+ // row - confirm against dbcommon::transformVector
+ dbcommon::transformVector(src.plainSize, src.sel, src.nulls,
+ convertNumericTranData);
+
+ reader->nulls = srcVector->getNulls();
+ reader->value = reader->valBuffer->data();
+}
+
+// Prepare every column reader for the tuple batch currently held in fmt->tb.
+// String/binary, timestamp and decimal columns are flattened into the reader's
+// own value buffer by the matching *GetValueBuffer helper. The remaining
+// supported types are read in place: the reader simply points at the vector's
+// value storage and at its null flags (nullptr when it has no nulls).
+// Any other type raises ERRCODE_DATA_EXCEPTION through LOG_ERROR.
+static void columnReadGetContent(ORCFormatC *fmt) {
+  const dbcommon::TupleBatchReader &batchReader =
+      fmt->tb->getTupleBatchReader();
+  int32_t readerIdx = 0;
+  for (auto colId : fmt->colToReadIds) {
+    OrcColumnReader *target = fmt->columnReaders[readerIdx].get();
+    ++readerIdx;
+    switch (target->type) {
+      case dbcommon::TypeKind::STRINGID:
+      case dbcommon::TypeKind::CHARID:
+      case dbcommon::TypeKind::VARCHARID:
+      case dbcommon::TypeKind::BINARYID: {
+        auto *bytes =
+            dynamic_cast<dbcommon::BytesVector *>(batchReader[colId].get());
+        textRelatedGetValueBuffer(fmt, bytes, target);
+        break;
+      }
+      case dbcommon::TypeKind::TIMESTAMPID:
+      case dbcommon::TypeKind::TIMESTAMPTZID: {
+        auto *timestamps = dynamic_cast<dbcommon::TimestampVector *>(
+            batchReader[colId].get());
+        timestampGetValueBuffer(fmt, timestamps, target);
+        break;
+      }
+      case dbcommon::TypeKind::DECIMALID: {
+        auto *decimals =
+            dynamic_cast<dbcommon::DecimalVector *>(batchReader[colId].get());
+        decimalGetValueBuffer(decimals, target);
+        break;
+      }
+      case dbcommon::TypeKind::BOOLEANID:
+      case dbcommon::TypeKind::SMALLINTID:
+      case dbcommon::TypeKind::INTID:
+      case dbcommon::TypeKind::BIGINTID:
+      case dbcommon::TypeKind::FLOATID:
+      case dbcommon::TypeKind::DOUBLEID:
+      case dbcommon::TypeKind::DATEID:
+      case dbcommon::TypeKind::TIMEID: {
+        auto *plain = dynamic_cast<dbcommon::Vector *>(batchReader[colId].get());
+        target->nulls =
+            plain->hasNullValue() ? plain->getNullBuffer()->getBools() : nullptr;
+        target->value = plain->getValue();
+        break;
+      }
+      default: {
+        LOG_ERROR(ERRCODE_DATA_EXCEPTION, "not supported yet");
+        break;
+      }
+    }
+  }
+}
+
+// Fetch the next row of the scan.
+// @param fmt The wrapper object holding the scan state
+// @param values Output, indexed by column id: pointer to the column's value
+//        (left untouched for null values)
+// @param lens Output, indexed by column id: only written for non-null
+//        string/binary columns (data length + 4) and decimal columns (the
+//        record's varlen)
+// @param nulls Output, indexed by column id: whether the value is null
+// @return true when a row was produced; false when the scan is exhausted or
+//         when a dbcommon::TransactionAbortException was caught (the error is
+//         then recorded in fmt->error)
+bool ORCFormatNextORCFormatC(ORCFormatC *fmt, const char **values,
+                             uint64_t *lens, bool *nulls) {
+  try {
+    // Pull tuple batches until the current one still has an unread row
+    for (;;) {
+      if (fmt->needNewTupleBatch) {
+        fmt->tb = fmt->orcFormat->next();
+        if (fmt->tb == nullptr) {
+          return false;
+        }
+        fmt->needNewTupleBatch = false;
+        fmt->rowRead = 0;
+        fmt->rowCount = fmt->tb->getNumOfRows();
+        if (fmt->rowCount > 0) columnReadGetContent(fmt);
+      }
+      if (fmt->rowRead < fmt->rowCount) break;
+      fmt->needNewTupleBatch = true;
+    }
+
+    int32_t readerIdx = 0;
+    for (auto colId : fmt->colToReadIds) {
+      OrcColumnReader *reader = fmt->columnReaders[readerIdx++].get();
+
+      auto currentIsNull = [&]() {
+        return reader->nulls && reader->nulls[fmt->rowRead];
+      };
+      // Fixed-width columns: the read cursor moves on for null rows as well
+      auto readFixedWidth = [&](int width) {
+        if (currentIsNull()) {
+          nulls[colId] = true;
+        } else {
+          nulls[colId] = false;
+          values[colId] = reader->value;
+        }
+        reader->value += width;
+      };
+
+      switch (reader->type) {
+        case dbcommon::TypeKind::STRINGID:
+        case dbcommon::TypeKind::CHARID:
+        case dbcommon::TypeKind::VARCHARID:
+        case dbcommon::TypeKind::BINARYID: {
+          if (currentIsNull()) {
+            nulls[colId] = true;
+          } else {
+            nulls[colId] = false;
+            values[colId] = reader->value;
+            // 4 extra bytes for the length stored in front of the data
+            lens[colId] = reader->lens[fmt->rowRead] + 4;
+            reader->value += lens[colId];
+          }
+          break;
+        }
+        case dbcommon::TypeKind::BOOLEANID: {
+          readFixedWidth(1);
+          break;
+        }
+        case dbcommon::TypeKind::SMALLINTID: {
+          readFixedWidth(2);
+          break;
+        }
+        case dbcommon::TypeKind::INTID:
+        case dbcommon::TypeKind::FLOATID:
+        case dbcommon::TypeKind::DATEID: {
+          readFixedWidth(4);
+          break;
+        }
+        case dbcommon::TypeKind::BIGINTID:
+        case dbcommon::TypeKind::DOUBLEID:
+        case dbcommon::TypeKind::TIMEID: {
+          readFixedWidth(8);
+          break;
+        }
+        case dbcommon::TypeKind::TIMESTAMPID:
+        case dbcommon::TypeKind::TIMESTAMPTZID: {
+          // The flattened timestamp buffer holds non-null rows only, so the
+          // cursor moves only when a value was consumed
+          if (currentIsNull()) {
+            nulls[colId] = true;
+          } else {
+            nulls[colId] = false;
+            values[colId] = reader->value;
+            reader->value += 8;
+          }
+          break;
+        }
+        case dbcommon::TypeKind::DECIMALID: {
+          if (currentIsNull()) {
+            nulls[colId] = true;
+          } else {
+            nulls[colId] = false;
+            values[colId] = reader->value;
+            lens[colId] =
+                (reinterpret_cast<const NumericTransData *>(reader->value))
+                    ->varlen;
+            reader->value += lens[colId];
+          }
+          break;
+        }
+        default: {
+          LOG_ERROR(ERRCODE_DATA_EXCEPTION, "not supported yet");
+          break;
+        }
+      }
+    }
+    ++fmt->rowRead;
+    return true;
+  } catch (dbcommon::TransactionAbortException &e) {
+    ORCFormatSetErrorORCFormatC(&(fmt->error), e.errCode(), e.what());
+    return false;
+  }
+}
+
+#ifdef __cplusplus
+}
+#endif
diff --git a/depends/storage/src/storage/cwrapper/orc-format-c.h b/depends/storage/src/storage/cwrapper/orc-format-c.h
new file mode 100644
index 0000000..8b423da
--- /dev/null
+++ b/depends/storage/src/storage/cwrapper/orc-format-c.h
@@ -0,0 +1,81 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_CWRAPPER_ORC_FORMAT_C_H_
+#define STORAGE_SRC_STORAGE_CWRAPPER_ORC_FORMAT_C_H_
+
+#include <stdint.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#ifndef ERROR_MESSAGE_BUFFER_SIZE
+#define ERROR_MESSAGE_BUFFER_SIZE 4096
+#endif
+
+struct ORCFormatC;
+
+typedef struct ORCFormatC ORCFormatC;
+
+// Error slot reported back to C callers: an error code plus a message held in
+// a fixed buffer of ERROR_MESSAGE_BUFFER_SIZE bytes.
+typedef struct ORCFormatCatchedError {
+ int errCode;
+ char errMessage[ERROR_MESSAGE_BUFFER_SIZE];
+} ORCFormatCatchedError;
+
+// One file split handed to ORCFormatBeginORCFormatC.
+// NOTE(review): start/len are presumably the byte offset and byte length of
+// the split inside fileName - confirm against the implementation.
+typedef struct ORCFormatFileSplit {
+ char *fileName;
+ int64_t start;
+ int64_t len;
+} ORCFormatFileSplit;
+
+#define ORCFormatType 'o'
+
+// Create a wrapper object.
+// tableOptions in json format
+ORCFormatC *ORCFormatNewORCFormatC(const char *tableOptions, int segno);
+// Destroy *fmt and reset it to NULL; a NULL *fmt is ignored.
+void ORCFormatFreeORCFormatC(ORCFormatC **fmt);
+
+// Begin a scan over the given splits.
+void ORCFormatBeginORCFormatC(ORCFormatC *fmt, ORCFormatFileSplit *splits,
+ int numSplits, bool *columnsToRead,
+ char **columnName, int *columnDatatype,
+ uint64_t *columnDatatypeMod, int numColumns);
+
+// Fetch the next row. values/lens/nulls are indexed by column id; lens is
+// only written for non-null string/binary and decimal values.
+// Returns false when there are no more rows or when an error was caught; use
+// ORCFormatGetErrorORCFormatC to read the recorded error.
+bool ORCFormatNextORCFormatC(ORCFormatC *fmt, const char **values,
+ uint64_t *lens, bool *nulls);
+
+void ORCFormatRescanORCFormatC(ORCFormatC *fmt);
+
+void ORCFormatEndORCFormatC(ORCFormatC *fmt);
+
+void ORCFormatBeginInsertORCFormatC(ORCFormatC *fmt, const char *dirFullPath,
+ char **columnName, int *columnDatatype,
+ uint64_t *columnDatatypeMod,
+ int numColumns);
+// Append one row. Rows are buffered in a tuple batch that is handed to the
+// writer once it holds kTuplesPerBatch rows.
+void ORCFormatInsertORCFormatC(ORCFormatC *fmt, int *datatypes, char **values,
+ uint64_t *lens, unsigned char **nullBitmap,
+ int32_t **dims, bool *isNull);
+// Flush the rows still buffered, end the insert and rename the written file.
+void ORCFormatEndInsertORCFormatC(ORCFormatC *fmt);
+
+// Return the error slot stored inside fmt.
+ORCFormatCatchedError *ORCFormatGetErrorORCFormatC(ORCFormatC *fmt);
+
+#ifdef __cplusplus
+}
+#endif
+
+#endif // STORAGE_SRC_STORAGE_CWRAPPER_ORC_FORMAT_C_H_
diff --git a/depends/storage/src/storage/format/format.cc b/depends/storage/src/storage/format/format.cc
new file mode 100644
index 0000000..fc2c04e
--- /dev/null
+++ b/depends/storage/src/storage/format/format.cc
@@ -0,0 +1,94 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/utils/parameters.h"
+#include "dbcommon/utils/url.h"
+
+#include "storage/format/format.h"
+#include "storage/format/orc/orc-format.h"
+
+#include "univplan/univplanbuilder/univplanbuilder-scan-task.h"
+
+namespace storage {
+
+// Create a Format of the given type without parameters; forwards to the
+// two-argument overload with a nullptr parameter object.
+std::unique_ptr<Format> Format::createFormat(
+ univplan::UNIVPLANFORMATTYPE type) {
+ return createFormat(type, nullptr);
+}
+
+// Create a Format instance of the given type.
+// @param type The format type; only ORC_FORMAT is handled, any other value
+//        is reported through LOG_ERROR(ERRCODE_INVALID_PARAMETER_VALUE)
+// @param p The parameters passed on to the format's constructor; the
+//        one-argument overload passes nullptr here
+// @return The newly created format
+std::unique_ptr<Format> Format::createFormat(univplan::UNIVPLANFORMATTYPE type,
+                                             dbcommon::Parameters *p) {
+  std::unique_ptr<Format> format;
+  switch (type) {
+    case univplan::UNIVPLANFORMATTYPE::ORC_FORMAT: {
+      format.reset(new ORCFormat(p));
+      break;
+    }
+    default: {
+      // Cast explicitly: an enum must not be handed as-is to a %d conversion
+      LOG_ERROR(ERRCODE_INVALID_PARAMETER_VALUE, "invalid format %d",
+                static_cast<int>(type));
+    }
+  }
+  // A returned local is moved implicitly; wrapping it in std::move would only
+  // prevent copy elision.
+  return format;
+}
+
+//
+// We use file lengths as input because, once transactions are implemented,
+// we cannot get the file EOF correctly without knowing more information.
+//
+// For now, we simply allocate the average length of the total file size
+// (avgLength) to each task.
+// The splits in each task may contain several files (their total length
+// equals avgLength).
+//
+// Build the task list for the given inputs. This implementation creates a
+// single scan task that holds one split per file block location of every
+// input file.
+// @param inputs The input files; every element is treated as a FileInput
+// @param nWorker The number of workers; must be positive
+// @return A task list containing the single scan task
+std::unique_ptr<univplan::UnivPlanScanFileSplitListList> Format::createTasks(
+    const std::vector<std::unique_ptr<Input> > &inputs, int nWorker) {
+  LOG_INFO("createTasks is called");
+
+  assert(nWorker > 0);
+
+  std::unique_ptr<univplan::UnivPlanScanFileSplitListList> taskList(
+      new univplan::UnivPlanScanFileSplitListList);
+  // create one scan task to contain all splits
+  univplan::UnivPlanBuilderScanTask scanTaskBld;
+  for (const std::unique_ptr<Input> &file : inputs) {
+    FileInput *fi = static_cast<FileInput *>(file.get());
+    dbcommon::URL urlParser(fi->getName());
+    dbcommon::FileSystem *fs =
+        fsManager->get(urlParser.getNormalizedServiceName());
+    std::vector<std::unique_ptr<dbcommon::FileBlockLocation> > locations =
+        fs->getFileBlockLocation(urlParser.getPath().c_str(), 0, fi->getSize());
+    for (const std::unique_ptr<dbcommon::FileBlockLocation> &loc : locations) {
+      scanTaskBld.addScanFileSplit(fi->getName().c_str(), loc->offset,
+                                   loc->length, -1, -1);  // no rangeid, rgid
+    }
+  }
+  // build scan task by transferring tb from this builder to fmt instance
+  std::unique_ptr<univplan::UnivPlanScanFileSplitListTb> newScanTask(
+      new univplan::UnivPlanScanFileSplitListTb(
+          std::move(scanTaskBld.releaseSplitsTb())));
+
+  taskList->push_back(std::move(newScanTask));
+  // A returned local is moved implicitly; wrapping it in std::move would only
+  // prevent copy elision.
+  return taskList;
+}
+
+} // namespace storage
diff --git a/depends/storage/src/storage/format/format.h b/depends/storage/src/storage/format/format.h
new file mode 100644
index 0000000..63559ee
--- /dev/null
+++ b/depends/storage/src/storage/format/format.h
@@ -0,0 +1,223 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_FORMAT_H_
+#define STORAGE_SRC_STORAGE_FORMAT_FORMAT_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/utils/macro.h"
+#include "dbcommon/utils/parameters.h"
+#include "univplan/common/univplan-type.h"
+
+namespace storage {
+
+class FileSystemManagerInterface;
+
+// Abstract description of one input of a format: a name and a size.
+class Input {
+ public:
+ Input() {}
+ virtual ~Input() {}
+
+ // Get input name
+ // @return The input name
+ virtual const std::string &getName() const = 0;
+
+ // Get input size
+ // @return The input size
+ virtual int64_t getSize() const = 0;
+};
+
+// An Input that describes a file by its name and length.
+class FileInput : public Input {
+ public:
+  // Build from a C string name and a length
+  FileInput(const char *fileName, int64_t length)
+      : fileName(fileName), length(length) {}
+  // Build from a std::string name and a length
+  FileInput(const std::string &fileName, int64_t length)
+      : fileName(fileName), length(length) {}
+  // Move: takes over the name of `file`, copies its length
+  FileInput(FileInput &&file)  // NOLINT
+      : fileName(std::move(file.fileName)), length(file.length) {}
+  // Copy
+  FileInput(const FileInput &file)
+      : fileName(file.fileName), length(file.length) {}
+  FileInput &operator=(const FileInput &file) {
+    fileName = file.fileName;
+    length = file.length;
+    return *this;
+  }
+
+  virtual ~FileInput() {}
+
+  // @return The file name
+  const std::string &getName() const override { return fileName; }
+
+  // @return The file length
+  int64_t getSize() const override { return length; }
+
+ private:
+  std::string fileName;
+  int64_t length;
+};
+
+// This struct was added to make the format interface extensible: it is passed
+// to Format::beginScan and currently carries only the index expressions.
+typedef struct FormatContext {
+ univplan::UnivPlanExprPolyList indexExpr;
+} FormatContext;
+
+// Format "read" accepts a list of splits, and return TupleBatches one by one.
+// And Format "write" accept TupleBatches, and write them to storage. It is
+// quite like InputFormat/OutputFormat of MR.
+//
+// It is a general concept, not only about concrete file format.
+// It can be FAST format files on HDFS, Text files on HDFS,
+// even HBase format. So it is extensible.
+// Users should be able to write their own format.
+// For example, users can write a PostgresqlFormat to read and write data
+// to a postgresql server.
+
+// Abstract base class of all formats: scan, insert, update and delete entry
+// points are pure virtual; createTasks has a default implementation.
+class Format {
+ public:
+ Format() {}
+
+ virtual ~Format() {}
+
+ // Set the file system manager used to look up file systems. The pointer is
+ // stored as-is.
+ void setFileSystemManager(dbcommon::FileSystemManagerInterface *fsManager) {
+ this->fsManager = fsManager;
+ }
+
+ // Begin scan of the splits
+ // @param splits The file splits need to be scanned
+ // @param tupleDesc The tuple description for the target table
+ // @param projectionCols The project columns list
+ // @param filterExpr The filter expression
+ // @param formatContext Additional scan context (see FormatContext)
+ // @param readStatsOnly To indicate if read only statistics
+ // @return void
+ virtual void beginScan(const univplan::UnivPlanScanFileSplitListList *splits,
+ const dbcommon::TupleDesc *tupleDesc,
+ const std::vector<bool> *projectionCols,
+ const univplan::UnivPlanExprPolyList *filterExpr,
+ const FormatContext *formatContext,
+ bool readStatsOnly) = 0;
+
+ // Get next TupleBatch
+ // @return unique_ptr of dbcommon::TupleBatch
+ virtual std::unique_ptr<dbcommon::TupleBatch> next() = 0;
+
+ // End the scan
+ // @return void
+ virtual void endScan() = 0;
+
+ // Restart the scan
+ // @return void
+ virtual void reScan() = 0;
+
+ // Stop the scan
+ // @return void
+ virtual void stopScan() = 0;
+
+ // Begin insert
+ // @param targetName The target name. For 'fast' and 'text' format, it
+ // is the target file name. For 'hbase' format, it is the target table.
+ // @param tupleDesc The tuple description
+ // @return void
+ virtual void beginInsert(const std::string &targetName,
+ const dbcommon::TupleDesc &tupleDesc) = 0;
+
+ // Insert a tuple batch
+ // @param tb The tuple batch to be inserted
+ virtual void doInsert(std::unique_ptr<dbcommon::TupleBatch> tb) = 0;
+
+ // End insert
+ virtual void endInsert() = 0;
+
+ // Begin update
+ // @param targetName The target name. For 'fast' and 'text' format, it
+ // is the target file name. For 'hbase' format, it is the target table.
+ // @param tupleDesc The tuple description
+ // @return void
+ virtual void beginUpdate(const std::string &targetName,
+ const dbcommon::TupleDesc &tupleDesc) = 0;
+
+ // Update a tuple batch
+ // @param tb The tuple batch to be updated
+ virtual void doUpdate(std::unique_ptr<dbcommon::TupleBatch> tb) = 0;
+
+ // End update
+ virtual void endUpdate() = 0;
+
+ // Begin delete
+ // @param targetName The target name. For 'fast' and 'text' format, it
+ // is the target file name. For 'hbase' format, it is the target table.
+ // @param tupleDesc The tuple description
+ // @return void
+ virtual void beginDelete(const std::string &targetName,
+ const dbcommon::TupleDesc &tupleDesc) = 0;
+
+ // Delete a tuple batch
+ // @param tb The tuple batch to be deleted
+ virtual void doDelete(std::unique_ptr<dbcommon::TupleBatch> tb) = 0;
+
+ // End delete
+ virtual void endDelete() = 0;
+
+ // Create tasks given input and the number of workers
+ // @param inputs The input files
+ // @param nWorker The number of workers
+ // @return The list of tasks, each worker has one task assigned.
+ // it is possible if there is no splits in a task
+ // when there is no enough splits (each task has a split list)
+ // NOTE: the default implementation in format.cc puts all splits into one
+ // single task.
+ virtual std::unique_ptr<univplan::UnivPlanScanFileSplitListList> createTasks(
+ const std::vector<std::unique_ptr<Input> > &inputs, int nWorker);
+
+ // set & get user command for external table
+ std::string getUserCommand() const { return userCommand; }
+ void setUserCommand(std::string command) { userCommand = command; }
+ // Cancellation hook; does nothing unless a subclass overrides it
+ virtual void setCancelled() {}
+
+ // Create a format instance of the given type, without or with parameters
+ static std::unique_ptr<Format> createFormat(
+ univplan::UNIVPLANFORMATTYPE type);
+ static std::unique_ptr<Format> createFormat(univplan::UNIVPLANFORMATTYPE type,
+ dbcommon::Parameters *p);
+
+ static const int kTuplesPerBatch = DEFAULT_NUMBER_TUPLES_PER_BATCH;
+ static const int kBlockSize = DEFAULT_BLOCK_SIZE;
+
+ protected:
+ // Format does not own splits, so it does not delete it in destructor.
+ const univplan::UnivPlanScanFileSplitListList *splits = nullptr;
+ // The file system manager used to get the file system
+ dbcommon::FileSystemManagerInterface *fsManager = nullptr;
+ // user command for external table such as DBGEN
+ std::string userCommand = "";
+};
+
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_FORMAT_H_
diff --git a/depends/storage/src/storage/format/orc/README b/depends/storage/src/storage/format/orc/README
new file mode 100644
index 0000000..2ba5af3
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/README
@@ -0,0 +1,320 @@
+
+1. ORC read process:
+
+FileInputStream -> SnappyDecompressionStream -> RLE decoder -> raw bytes
+
+2. ORC write process
+
+raw bytes -> RLE encoder -> SnappyCompressionStream -> FileOutputStream
+
+3. TODOs
+
+1) url format “hdfs://localhost:8020/user/hive/warehouse/tsmallint/” or “hdfs://localhost:8020//user/hive/warehouse/tsmallint” are not correct
+
+2) currently supported types: tinyint, smallint, int, bigint, float, double, string, varchar;
+more types still need support: boolean/date/char(x)/timestamp/decimal/struct, etc.
+
+To add support for a type, we need to pay special attention to:
+a. TypeImpl::createRowBatch: this function returns correct ColumnVectorBatch
+b. buildReader -- this function returns reader of correct type
+
+3) null value handling performance enhancement
+
+4) writer
+
+ orc::WriterOptions opts;
+ std::unique_ptr<orc::Writer> writer;
+ dbcommon::URL url(filename);
+ dbcommon::FileSystemManager fsm;
+ dbcommon::FileSystem *fs = fsm.get(url.getNormalizedServiceName());
+ writer = orc::createWriter(orc::writeFile(fs, url.getPath()), opts);
+
+ std::unique_ptr<orc::ColumnVectorBatch> batch = writer->createRowBatch(1000);
+ for (uint32_t i = 0; i < 100; i++) {
+ writer->addRowBatch(batch);
+ }
+ writer->close();
+
+5) hasEnoughSpaceForBatch needs to be revisited
+ // estimated tuple batch size
+ // TODO(lei): might need to be revised after we figure out how to
+ // store other types
+
+6) read footer only once on master, then dispatch it to worker. this can potentially
+ avoid all opening footers at the same time
+
+7) add some boundary numbers tests for orc format: for example, max(int32_t) for different
+ encoding schemes - delta, direct, patchedbase, short repeat.
+
+8) need to compare the performance & compression ratio for lz4 and snappy
+
+9) write more information to the orc file: statistics, indexes. otherwise,
+   data will have to be reloaded once the reader supports these features.
+
+10) add tests for snappy (since lz4 is now default), and add tests for snappycompressor/lz4compressor
+
+4. How to use
+
+-- hive
+create table tcn(t tinyint, s smallint, i int, b bigint, f float, d double, str string, v varchar(10), c char(4), bin binary) stored as orc;
+
+insert into tcn values (1, 2, 3, 4, 1.1, 1.2, 'string', 'var', 'char', 'binary');
+
+select * from tcn;
+
+-- computenode
+create table tcn(t tinyint, s smallint, i int, b bigint, f float, d double, str string, v varchar(10), c string, bin string)
+with (format = orc, location= 'hdfs://localhost:8020/user/hive/warehouse/tcn');
+
+select * from tcn;
+
+5. micro benchmark: based on 2016 Oct 15 version (after analyze)
+
+NOTE: Analyze is very important for aggregation.
+
+ lz4 fast snappy orcnone postgres
+filesize 426MB 1042MB 402MB 696M 985MB
+load 20239ms 20784ms 20098ms 19944ms 22767ms(copy)
+count* 32ms 32ms 31ms 30ms 639ms
+countint 96ms 42ms 98ms 97ms 745ms
+countstring 179ms 85ms 235ms 120ms 1154ms
+count2int2string 444ms 165ms 501ms 365ms 1596ms
+tpch-Q1 1025ms 386ms 1072ms 896ms 3830ms
+
+
+The benchmark used is:
+1) schema
+CREATE TABLE e_LINEITEM ( L_ORDERKEY int,
+ L_PARTKEY int,
+ L_SUPPKEY int,
+ L_LINENUMBER int,
+ L_QUANTITY double,
+ L_EXTENDEDPRICE double,
+ L_DISCOUNT double,
+ L_TAX double,
+ L_RETURNFLAG string,
+ L_LINESTATUS string,
+ L_SHIPDATE string,
+ L_COMMITDATE string,
+ L_RECEIPTDATE string,
+ L_SHIPINSTRUCT string,
+ L_SHIPMODE string,
+ L_COMMENT string) with (FORMAT = command, COMMANDS = '/Users/ChangLei/curwork/dev/computenode/inst/bin/dbgen -b /Users/ChangLei/curwork/dev/computenode/inst/bin/dists.dss -T L -s 1 -C 2 -S $TASKNO', TaskCount = 2);
+
+CREATE TABLE lineitem_orc_lz4 ( L_ORDERKEY int,
+ L_PARTKEY int,
+ L_SUPPKEY int,
+ L_LINENUMBER int,
+ L_QUANTITY double,
+ L_EXTENDEDPRICE double,
+ L_DISCOUNT double,
+ L_TAX double,
+ L_RETURNFLAG string,
+ L_LINESTATUS string,
+ L_SHIPDATE string,
+ L_COMMITDATE string,
+ L_RECEIPTDATE string,
+ L_SHIPINSTRUCT string,
+ L_SHIPMODE string,
+ L_COMMENT string) with(format = orc, location='file:///tmp/lineitem_orc_lz4');
+
+CREATE TABLE lineitem_fast ( L_ORDERKEY int,
+ L_PARTKEY int,
+ L_SUPPKEY int,
+ L_LINENUMBER int,
+ L_QUANTITY double,
+ L_EXTENDEDPRICE double,
+ L_DISCOUNT double,
+ L_TAX double,
+ L_RETURNFLAG string,
+ L_LINESTATUS string,
+ L_SHIPDATE string,
+ L_COMMITDATE string,
+ L_RECEIPTDATE string,
+ L_SHIPINSTRUCT string,
+ L_SHIPMODE string,
+ L_COMMENT string) with(format = fast, location='file:///tmp/lineitem_fast');
+
+CREATE TABLE lineitem_orc_snappy ( L_ORDERKEY int,
+ L_PARTKEY int,
+ L_SUPPKEY int,
+ L_LINENUMBER int,
+ L_QUANTITY double,
+ L_EXTENDEDPRICE double,
+ L_DISCOUNT double,
+ L_TAX double,
+ L_RETURNFLAG string,
+ L_LINESTATUS string,
+ L_SHIPDATE string,
+ L_COMMITDATE string,
+ L_RECEIPTDATE string,
+ L_SHIPINSTRUCT string,
+ L_SHIPMODE string,
+ L_COMMENT string) with(format = orc, location='file:///tmp/lineitem_orc_snappy');
+
+CREATE TABLE lineitem_orc_none ( L_ORDERKEY int,
+ L_PARTKEY int,
+ L_SUPPKEY int,
+ L_LINENUMBER int,
+ L_QUANTITY double,
+ L_EXTENDEDPRICE double,
+ L_DISCOUNT double,
+ L_TAX double,
+ L_RETURNFLAG string,
+ L_LINESTATUS string,
+ L_SHIPDATE string,
+ L_COMMITDATE string,
+ L_RECEIPTDATE string,
+ L_SHIPINSTRUCT string,
+ L_SHIPMODE string,
+ L_COMMENT string) with(format = orc, location='file:///tmp/lineitem_orc_none');
+
+CREATE TABLE lineitem_pg ( L_ORDERKEY int,
+ L_PARTKEY int,
+ L_SUPPKEY int,
+ L_LINENUMBER int,
+ L_QUANTITY double precision,
+ L_EXTENDEDPRICE double precision,
+ L_DISCOUNT double precision,
+ L_TAX double precision,
+ L_RETURNFLAG varchar,
+ L_LINESTATUS varchar,
+ L_SHIPDATE varchar,
+ L_COMMITDATE varchar,
+ L_RECEIPTDATE varchar,
+ L_SHIPINSTRUCT varchar,
+ L_SHIPMODE varchar,
+ L_COMMENT varchar);
+
+2) loading
+insert into lineitem_orc_lz4 select * from e_lineitem;
+insert into lineitem_fast select * from e_lineitem;
+insert into lineitem_orc_snappy select * from e_lineitem;
+insert into lineitem_orc_none select * from e_lineitem;
+copy lineitem_pg from '/Users/ChangLei/curwork/dev/tpch-dbgen/lineitem.tbl' with delimiter '|';
+
+analyze lineitem_orc_lz4;
+analyze lineitem_fast;
+analyze lineitem_orc_snappy;
+analyze lineitem_orc_none;
+analyze lineitem_pg;
+
+
+3) count*
+select count(*) from lineitem_orc_lz4;
+select count(*) from lineitem_fast;
+select count(*) from lineitem_orc_snappy;
+select count(*) from lineitem_orc_none;
+select count(*) from lineitem_pg;
+
+4) countint
+select count(L_ORDERKEY) from lineitem_orc_lz4;
+select count(L_ORDERKEY) from lineitem_fast;
+select count(L_ORDERKEY) from lineitem_orc_snappy;
+select count(L_ORDERKEY) from lineitem_orc_none;
+select count(L_ORDERKEY) from lineitem_pg;
+
+5) countstring
+select count(L_COMMENT) from lineitem_orc_lz4;
+select count(L_COMMENT) from lineitem_fast;
+select count(L_COMMENT) from lineitem_orc_snappy;
+select count(L_COMMENT) from lineitem_orc_none;
+select count(L_COMMENT) from lineitem_pg;
+
+6) count2int2string
+select count(L_ORDERKEY), count(L_SUPPKEY), count(L_SHIPMODE), count(L_COMMENT) from lineitem_orc_lz4;
+select count(L_ORDERKEY), count(L_SUPPKEY), count(L_SHIPMODE), count(L_COMMENT) from lineitem_fast;
+select count(L_ORDERKEY), count(L_SUPPKEY), count(L_SHIPMODE), count(L_COMMENT) from lineitem_orc_snappy;
+select count(L_ORDERKEY), count(L_SUPPKEY), count(L_SHIPMODE), count(L_COMMENT) from lineitem_orc_none;
+select count(L_ORDERKEY), count(L_SUPPKEY), count(L_SHIPMODE), count(L_COMMENT) from lineitem_pg;
+
+7) tpch-Q1
+
+SELECT
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ sum(l_quantity) as avg_qty,
+ sum(l_extendedprice) as avg_price,
+ sum(l_discount) as avg_disc,
+ count(*) as count_order
+FROM
+ lineitem_orc_lz4
+GROUP BY
+ l_returnflag,
+ l_linestatus;
+
+SELECT
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ sum(l_quantity) as avg_qty,
+ sum(l_extendedprice) as avg_price,
+ sum(l_discount) as avg_disc,
+ count(*) as count_order
+FROM
+ lineitem_fast
+GROUP BY
+ l_returnflag,
+ l_linestatus;
+
+
+SELECT
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ sum(l_quantity) as avg_qty,
+ sum(l_extendedprice) as avg_price,
+ sum(l_discount) as avg_disc,
+ count(*) as count_order
+FROM
+ lineitem_orc_snappy
+GROUP BY
+ l_returnflag,
+ l_linestatus;
+
+
+SELECT
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ sum(l_quantity) as avg_qty,
+ sum(l_extendedprice) as avg_price,
+ sum(l_discount) as avg_disc,
+ count(*) as count_order
+FROM
+ lineitem_orc_none
+GROUP BY
+ l_returnflag,
+ l_linestatus;
+
+SELECT
+ l_returnflag,
+ l_linestatus,
+ sum(l_quantity) as sum_qty,
+ sum(l_extendedprice) as sum_base_price,
+ sum(l_extendedprice * (1 - l_discount)) as sum_disc_price,
+ sum(l_extendedprice * (1 - l_discount) * (1 + l_tax)) as sum_charge,
+ sum(l_quantity) as avg_qty,
+ sum(l_extendedprice) as avg_price,
+ sum(l_discount) as avg_disc,
+ count(*) as count_order
+FROM
+ lineitem_pg
+GROUP BY
+ l_returnflag,
+ l_linestatus;
+
+
diff --git a/depends/storage/src/storage/format/orc/byte-rle.cc b/depends/storage/src/storage/format/orc/byte-rle.cc
new file mode 100644
index 0000000..724a43a
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/byte-rle.cc
@@ -0,0 +1,476 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <immintrin.h>
+#include <string.h>
+
+#include <algorithm>
+#include <iostream>
+#include <utility>
+
+#include "storage/format/orc/byte-rle.h"
+#include "storage/format/orc/exceptions.h"
+
+namespace orc {
+
+// Offset added to a non-negative control byte to obtain the length of a
+// repeated run (see ByteRleDecoderImpl::readHeader).
+const size_t MINIMUM_REPEAT = 3;
+
+// Out-of-line definition of the interface destructor; there is nothing to
+// release at this level.
+ByteRleDecoder::~ByteRleDecoder() {}
+
+// Asks inputStream for its next chunk and points bufferStart/bufferEnd at it.
+// Reports ERRCODE_INTERNAL_ERROR through LOG_ERROR when Next() returns false.
+void ByteRleDecoderImpl::nextBuffer() {
+  const void *chunk = nullptr;
+  int chunkLength = 0;
+  if (!inputStream->Next(&chunk, &chunkLength)) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "bad read in nextBuffer");
+  }
+  bufferStart = static_cast<const char *>(chunk);
+  bufferEnd = bufferStart + chunkLength;
+}
+
+// Returns the next raw byte of the stream, refilling the buffer first when the
+// current one is exhausted.
+signed char ByteRleDecoderImpl::readByte() {
+  if (bufferStart == bufferEnd) {
+    nextBuffer();
+  }
+  const signed char byte = *bufferStart;
+  ++bufferStart;
+  return byte;
+}
+
+// Reads the control byte of the next run and updates the run state.
+//   header >= 0 : repeated run of (header + MINIMUM_REPEAT) copies of the byte
+//                 that follows, which is stored in `value`.
+//   header <  0 : literal run of (-header) bytes that follow in the stream.
+void ByteRleDecoderImpl::readHeader() {
+  const signed char header = readByte();
+  repeating = header >= 0;
+  if (repeating) {
+    remainingValues = static_cast<size_t>(header) + MINIMUM_REPEAT;
+    value = readByte();
+  } else {
+    remainingValues = static_cast<size_t>(-header);
+  }
+}
+
+// Takes ownership of the input stream and starts with no run and no buffered
+// bytes; the first read triggers readHeader()/nextBuffer().
+ByteRleDecoderImpl::ByteRleDecoderImpl(
+    std::unique_ptr<SeekableInputStream> input)
+    : inputStream(std::move(input)),
+      remainingValues(0),
+      value(0),
+      bufferStart(nullptr),
+      bufferEnd(nullptr),
+      repeating(false) {}
+
+// Nothing to do explicitly: inputStream is released by its unique_ptr.
+ByteRleDecoderImpl::~ByteRleDecoderImpl() {}
+
+// Repositions the decoder: seeks the underlying stream, drops whatever was
+// buffered, reads the header of the run at the new position and then skips the
+// number of values given by the next entry of `location`.
+void ByteRleDecoderImpl::seek(PositionProvider &location) {
+  inputStream->seek(location);
+  // Make the buffer look empty so the next read pulls from the stream.
+  bufferEnd = bufferStart;
+  readHeader();
+  const uint64_t valuesIntoRun = location.next();
+  ByteRleDecoderImpl::skip(valuesIntoRun);
+}
+
+// Discards `numValues` decoded values. Repeated runs only need their counter
+// decremented; literal runs also advance the read pointer over the skipped
+// bytes, refilling the buffer as needed.
+void ByteRleDecoderImpl::skip(uint64_t numValues) {
+  while (numValues > 0) {
+    if (remainingValues == 0) {
+      readHeader();
+    }
+    const size_t taken =
+        std::min(static_cast<size_t>(numValues), remainingValues);
+    remainingValues -= taken;
+    numValues -= taken;
+    if (repeating) {
+      continue;
+    }
+    // Literal run: the skipped values occupy `taken` bytes of the stream.
+    for (size_t pending = taken; pending > 0;) {
+      if (bufferStart == bufferEnd) {
+        nextBuffer();
+      }
+      const size_t step =
+          std::min(pending, static_cast<size_t>(bufferEnd - bufferStart));
+      bufferStart += step;
+      pending -= step;
+    }
+  }
+}
+
+// Decodes values into data[0 .. numValues).
+// @param data destination array with room for numValues bytes
+// @param numValues number of slots to process
+// @param notNull optional mask; when non-null, slots whose entry is 0 are
+// left untouched and consume no value from the stream
+void ByteRleDecoderImpl::next(char *data, uint64_t numValues,
+ const char *notNull) {
+ uint64_t position = 0;
+ // skip over null values
+ while (notNull && position < numValues && !notNull[position]) {
+ position += 1;
+ }
+ while (position < numValues) {
+ // if we are out of values, read more
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ // how many do we read out of this block?
+ // count is a number of slots (null slots included); consumed counts only
+ // the values actually taken from the current run.
+ size_t count =
+ std::min(static_cast<size_t>(numValues - position), remainingValues);
+ uint64_t consumed = 0;
+ if (repeating) {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = value;
+ consumed += 1;
+ }
+ }
+ } else {
+ memset(data + position, value, count);
+ consumed = count;
+ }
+ } else {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ data[position + i] = readByte();
+ consumed += 1;
+ }
+ }
+ } else {
+ // literal run without a mask: bulk-copy, possibly across several buffers
+ uint64_t i = 0;
+ while (i < count) {
+ if (bufferStart == bufferEnd) {
+ nextBuffer();
+ }
+ uint64_t copyBytes =
+ std::min(static_cast<uint64_t>(count - i),
+ static_cast<uint64_t>(bufferEnd - bufferStart));
+ memcpy(data + position + i, bufferStart, copyBytes);
+ bufferStart += copyBytes;
+ i += copyBytes;
+ }
+ consumed = count;
+ }
+ }
+ remainingValues -= consumed;
+ position += count;
+ // skip over any null values
+ while (notNull && position < numValues && !notNull[position]) {
+ position += 1;
+ }
+ }
+}
+
+// Builds a byte RLE decoder that owns `input`.
+std::unique_ptr<ByteRleDecoder> createByteRleDecoder(
+    std::unique_ptr<SeekableInputStream> input) {
+  std::unique_ptr<ByteRleDecoder> decoder(
+      new ByteRleDecoderImpl(std::move(input)));
+  return decoder;
+}
+
+// Boolean decoder layered on the byte RLE decoder: each decoded byte carries
+// eight boolean values, handed out starting from the most significant bit.
+class BooleanRleDecoderImpl : public ByteRleDecoderImpl {
+ public:
+ explicit BooleanRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
+
+ virtual ~BooleanRleDecoderImpl();
+
+ void seek(PositionProvider &) override;
+
+ void skip(uint64_t numValues) override;
+
+ void next(char *data, uint64_t numValues, const char *notNull) override;
+
+ protected:
+ // number of low-order bits of lastByte that have not been handed out yet
+ size_t remainingBits = 0;
+ // most recently decoded byte whose bits are still being consumed
+ char lastByte = 0;
+};
+
+// Forwards the stream to the byte decoder; the bit-level state uses the
+// in-class defaults.
+BooleanRleDecoderImpl::BooleanRleDecoderImpl(
+    std::unique_ptr<SeekableInputStream> input)
+    : ByteRleDecoderImpl(std::move(input)) {
+}
+
+// No resources beyond those of the base class.
+BooleanRleDecoderImpl::~BooleanRleDecoderImpl() {}
+
+// Repositions the decoder. After the byte-level seek, the next entry of
+// `location` is the number of bits already consumed in the byte at that
+// position (0..8).
+// Throws ParseError("bad position") when that count exceeds 8.
+void BooleanRleDecoderImpl::seek(PositionProvider &location) {
+  ByteRleDecoderImpl::seek(location);
+  uint64_t consumed = location.next();
+  // Forget bits buffered before the seek. Without this, a seek to a byte
+  // boundary (consumed == 0) would leave the old remainingBits in place and
+  // the following next()/skip() would serve bits from the stale lastByte.
+  remainingBits = 0;
+  if (consumed > 8) {
+    throw ParseError("bad position");
+  }
+  if (consumed != 0) {
+    remainingBits = 8 - consumed;
+    ByteRleDecoderImpl::next(&lastByte, 1, nullptr);
+  }
+}
+
+// Skips `numValues` boolean values: first the bits still buffered in lastByte,
+// then whole bytes, then the leading bits of one further byte if needed.
+void BooleanRleDecoderImpl::skip(uint64_t numValues) {
+  if (numValues <= remainingBits) {
+    remainingBits -= numValues;
+    return;
+  }
+  numValues -= remainingBits;
+  ByteRleDecoderImpl::skip(numValues / 8);
+  const uint64_t bitsIntoNextByte = numValues % 8;
+  if (bitsIntoNextByte != 0) {
+    ByteRleDecoderImpl::next(&lastByte, 1, nullptr);
+    remainingBits = 8 - bitsIntoNextByte;
+  } else {
+    // The skip ends exactly on a byte boundary: do not fetch another byte.
+    // Fetching one unconditionally fails when the skip reaches the end of the
+    // stream, because there is no byte left to read.
+    remainingBits = 0;
+  }
+}
+
+// Decodes `numValues` booleans into `data` as bytes holding 0 or 1.
+// @param data destination array with room for numValues bytes
+// @param numValues number of slots to fill
+// @param notNull optional mask; slots whose entry is 0 are written as 0 and
+//        consume no bit from the stream
+//
+// The packed bytes are first decoded into the front of the destination range
+// and then expanded in place from the back, so that unexpanded source bytes
+// are not overwritten before they are read.
+void BooleanRleDecoderImpl::next(char *__restrict__ data, uint64_t numValues,
+                                 const char *__restrict__ notNull) {
+  // next spot to fill in
+  uint64_t position = 0;
+
+  // use up any remaining bits
+  if (notNull) {
+    while (remainingBits > 0 && position < numValues) {
+      if (notNull[position]) {
+        remainingBits -= 1;
+        data[position] =
+            (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1;
+      } else {
+        data[position] = 0;
+      }
+      position += 1;
+    }
+  } else {
+    while (remainingBits > 0 && position < numValues) {
+      remainingBits -= 1;
+      data[position++] =
+          (static_cast<unsigned char>(lastByte) >> remainingBits) & 0x1;
+    }
+  }
+
+  // count the number of nonNulls remaining
+  uint64_t nonNulls = numValues - position;
+  if (notNull) {
+    for (uint64_t i = position; i < numValues; ++i) {
+      if (!notNull[i]) {
+        nonNulls -= 1;
+      }
+    }
+  }
+
+  // fill in the remaining values
+  if (nonNulls == 0) {
+    while (position < numValues) {
+      data[position++] = 0;
+    }
+  } else if (position < numValues) {
+    // read the new bytes into the array
+    uint64_t bytesRead = (nonNulls + 7) / 8;
+    ByteRleDecoderImpl::next(data + position, bytesRead, nullptr);
+    lastByte = data[position + bytesRead - 1];
+    remainingBits = bytesRead * 8 - nonNulls;
+    // expand the array backwards so that we don't clobber the data
+    uint64_t bitsLeft = bytesRead * 8 - remainingBits;
+    if (notNull) {
+      for (int64_t i = static_cast<int64_t>(numValues) - 1;
+           i >= static_cast<int64_t>(position); --i) {
+        if (notNull[i]) {
+          uint64_t shiftPosn = (-bitsLeft) % 8;
+          data[i] = (data[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
+          bitsLeft -= 1;
+        } else {
+          data[i] = 0;
+        }
+      }
+    } else {
+      // performance: edit the code below carefully
+      const char *__restrict__ dataSrc = data;
+      int64_t i = static_cast<int64_t>(numValues) - 1;
+#ifdef AVX_OPT
+      int64_t positionEnd = i - (i - position + 1) % 16;
+      assert((positionEnd - position + 1) % 16 == 0);
+#else
+      int64_t positionEnd = static_cast<int64_t>(position) - 1;
+#endif
+      // step 1: remove the back element to align to 16 byte e.g. 128 bit
+      // Bit-by-bit until the end of a source byte is reached (shiftPosn == 7
+      // is the most significant bit, i.e. the first value of that byte).
+      for (; i > positionEnd;) {
+        uint8_t shiftPosn = (-bitsLeft) % 8;
+        data[i] = (dataSrc[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
+        --i, --bitsLeft;
+        if (shiftPosn == 7) break;
+      }
+      // Then one whole source byte (eight output bytes) per iteration.
+      for (; i - 7 > positionEnd; i -= 8, bitsLeft -= 8) {
+        char tmpDataSrc = dataSrc[position + (bitsLeft - 1) / 8];
+        // Zero-initialized: the loop below shifts before it ORs, so reading
+        // an uninitialized value here would be undefined behavior. All 64
+        // bits are overwritten by the eight iterations either way.
+        uint64_t tmpBuf = 0;
+#ifdef IS_BIG_ENDIAN
+#pragma clang loop unroll(full)
+        for (int8_t shiftPosn = 7; shiftPosn >= 0; shiftPosn--) {
+          tmpBuf <<= 8;
+          tmpBuf |= (char)(tmpDataSrc >> shiftPosn) & 0x1;
+        }
+#else
+#pragma clang loop unroll(full)
+        for (int8_t shiftPosn = 0; shiftPosn <= 7; shiftPosn++) {
+          tmpBuf <<= 8;
+          tmpBuf |= (char)(tmpDataSrc >> shiftPosn) & 0x1;
+        }
+#endif
+        uint64_t *tmpPtr = (uint64_t *)&data[i - 7];
+        *tmpPtr = tmpBuf;
+      }
+// end of step 1
+#ifdef AVX_OPT
+      // step 2: simd
+      // intel cpus are all little endian
+      // 2 bytes src e.g. 16 bits expand to 16 bytes e.g. 128 bits
+      // todo: there could be more specific version for avx2, avx512
+      __m128i *tmpPtr = (__m128i *)&data[i - 15];
+      if ((uint64_t)tmpPtr % 16 == 0) {
+        // _mm128_store_si128 require aligned, otherwise exception
+        __m128i mask = _mm_set1_epi8(0x1);
+        for (; i - 15 >= static_cast<int64_t>(position);
+             i -= 16, bitsLeft -= 16) {
+          const char *tds = &dataSrc[position + (bitsLeft - 1) / 8 - 1];
+          __m128i src = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, tds[1], 0, 0, 0, 0, 0,
+                                     0, 0, tds[0]);
+          // high to low in register
+          __m128i res = _mm_set1_epi8(0x0);
+          {
+            __m128i tmp;
+            // pay attention to shift right logically
+            tmp = _mm_slli_si128(src, 0);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 7);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 1);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 6);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 2);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 5);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 3);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 4);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 4);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 3);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 5);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 2);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 6);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 1);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+
+            tmp = _mm_slli_si128(src, 7);  // shift the byte
+            tmp = _mm_srli_epi64(tmp, 0);  // shift the bit
+            res = _mm_or_si128(res, tmp);
+          }
+          res = _mm_and_si128(res, mask);
+          __m128i *tmpPtr = (__m128i *)&data[i - 15];
+          _mm_storeu_si128(tmpPtr, res);
+        }
+      } else {  // address not aligned
+        int64_t positionEnd = static_cast<int64_t>(position) - 1;
+        for (; i > positionEnd;) {
+          uint8_t shiftPosn = (-bitsLeft) % 8;
+          data[i] = (dataSrc[position + (bitsLeft - 1) / 8] >> shiftPosn) & 0x1;
+          --i, --bitsLeft;
+          if (shiftPosn == 7) break;
+        }
+        for (; i - 7 > positionEnd; i -= 8, bitsLeft -= 8) {
+          char tmpDataSrc = dataSrc[position + (bitsLeft - 1) / 8];
+          // Zero-initialized for the same reason as in step 1.
+          uint64_t tmpBuf = 0;
+#pragma clang loop unroll(full)
+          for (int8_t shiftPosn = 0; shiftPosn <= 7; shiftPosn++) {
+            tmpBuf <<= 8;
+            tmpBuf |= (char)(tmpDataSrc >> shiftPosn) & 0x1;
+          }
+          uint64_t *tmpPtr = (uint64_t *)&data[i - 7];
+          *tmpPtr = tmpBuf;
+        }
+      }
+#endif
+      assert(bitsLeft == 0);
+    }
+  }
+}
+
+// Builds a boolean RLE decoder that owns `input`.
+// The derived-to-base pointer conversion is done implicitly; a
+// reinterpret_cast is not the right tool for an upcast and would silently
+// produce a wrong pointer if the class layout ever required an adjustment.
+std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder(
+    std::unique_ptr<SeekableInputStream> input) {
+  return std::unique_ptr<ByteRleDecoder>(
+      new BooleanRleDecoderImpl(std::move(input)));
+}
+
+// Builds a byte RLE coder writing to a block compressor of the given kind.
+// The local is returned by name: wrapping it in std::move() would prevent
+// copy elision, and the move happens implicitly anyway.
+std::unique_ptr<ByteRleCoder> createByteRleCoder(CompressionKind kind) {
+  std::unique_ptr<ByteRleCoder> coder(
+      new ByteRleCoder(createBlockCompressor(kind)));
+  return coder;
+}
+
+// Starts with an empty accumulator byte: all 8 bit slots free.
+BooleanRleEncoderImpl::BooleanRleEncoderImpl(
+    std::unique_ptr<SeekableOutputStream> output)
+    : ByteRleCoder(std::move(output)),
+      bitsRemained(8),
+      current(static_cast<char>(0)) {}
+
+// No resources beyond those of the base class.
+BooleanRleEncoderImpl::~BooleanRleEncoderImpl() {
+}
+
+// Packs boolean values into bytes, most significant bit first, and hands each
+// completed byte to the byte RLE coder.
+// @param data values to encode; a null pointer means every value is true
+// @param numValues number of slots in data / notNull
+// @param notNull optional mask; slots whose entry is 0 emit no bit
+void BooleanRleEncoderImpl::write(const char *data, uint64_t numValues,
+                                  const char *notNull) {
+  for (uint64_t row = 0; row < numValues; ++row) {
+    if (bitsRemained == 0) {
+      ByteRleCoder::write(current);
+      current = static_cast<char>(0);
+      bitsRemained = 8;
+    }
+    if (notNull && !notNull[row]) {
+      continue;
+    }
+    if (!data || data[row]) {
+      current = static_cast<char>(current | (0x80 >> (8 - bitsRemained)));
+    }
+    --bitsRemained;
+  }
+  // Emit the accumulator right away if this call happened to fill it.
+  if (bitsRemained == 0) {
+    ByteRleCoder::write(current);
+    current = static_cast<char>(0);
+    bitsRemained = 8;
+  }
+}
+
+// Emits a partially filled accumulator byte (unused low bits stay 0), resets
+// the accumulator and flushes the byte RLE coder.
+void BooleanRleEncoderImpl::flush() {
+  const bool hasPartialByte = bitsRemained != 8;
+  if (hasPartialByte) {
+    ByteRleCoder::write(current);
+  }
+  current = static_cast<char>(0);
+  bitsRemained = 8;
+  ByteRleCoder::flush();
+}
+
+// Flushes pending bits, then lets the byte RLE coder write everything to
+// `stream`.
+void BooleanRleEncoderImpl::flushToStream(OutputStream *stream) {
+  this->flush();
+  ByteRleCoder::flushToStream(stream);
+}
+
+// Builds a boolean RLE encoder writing to a block compressor of the given
+// kind. The local is returned by name: std::move() on it would prevent copy
+// elision, and the move happens implicitly anyway.
+std::unique_ptr<BooleanRleEncoderImpl> createBooleanRleEncoderImpl(
+    CompressionKind kind) {
+  std::unique_ptr<BooleanRleEncoderImpl> coder(
+      new BooleanRleEncoderImpl(createBlockCompressor(kind)));
+  return coder;
+}
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/byte-rle.h b/depends/storage/src/storage/format/orc/byte-rle.h
new file mode 100644
index 0000000..719a4a0
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/byte-rle.h
@@ -0,0 +1,237 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_BYTE_RLE_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_BYTE_RLE_H_
+
+#include <memory>
+#include <vector>
+
+#include "storage/format/orc/rle.h"
+#include "storage/format/orc/seekable-input-stream.h"
+#include "storage/format/orc/seekable-output-stream.h"
+
+namespace orc {
+
+// Abstract interface of the byte-oriented run length decoders.
+class ByteRleDecoder {
+ public:
+ virtual ~ByteRleDecoder();
+
+ // Seek to a particular spot.
+ // @param pos The position to seek
+ // @return void
+ virtual void seek(PositionProvider& pos) = 0; // NOLINT
+
+ // Skip over a given number of values.
+ // @param numValues Skip the number of values
+ // @return void
+ virtual void skip(uint64_t numValues) = 0;
+
+ // Read a number of values into the batch.
+ // @param data the array to read into
+ // @param numValues the number of values to read
+ // @param notNull If the pointer is null, all values are read. If the
+ // pointer is not null, positions that are false are skipped.
+ virtual void next(char* data, uint64_t numValues, const char* notNull) = 0;
+};
+
+// Create a byte RLE decoder.
+// @param input the input stream to read from; ownership passes to the decoder
+// @return The decoder
+std::unique_ptr<ByteRleDecoder> createByteRleDecoder(
+ std::unique_ptr<SeekableInputStream> input);
+
+// Create a boolean RLE decoder.
+// Unlike the other RLE decoders, the boolean decoder sets the data to 0
+// if the value is masked by notNull. This is required for the notNull stream
+// processing to properly apply multiple masks from nested types.
+// @param input the input stream to read from; ownership passes to the decoder
+// @return The boolean RLE decoder
+std::unique_ptr<ByteRleDecoder> createBooleanRleDecoder(
+ std::unique_ptr<SeekableInputStream> input);
+
+// Byte RLE decoder reading from a SeekableInputStream.
+class ByteRleDecoderImpl : public ByteRleDecoder {
+ public:
+ explicit ByteRleDecoderImpl(std::unique_ptr<SeekableInputStream> input);
+
+ virtual ~ByteRleDecoderImpl();
+
+ void seek(PositionProvider&) override;
+
+ void skip(uint64_t numValues) override;
+
+ void next(char* data, uint64_t numValues, const char* notNull) override;
+
+ protected:
+ // NOTE(review): these are declared inline but defined in byte-rle.cc, so
+ // they are only callable from that translation unit -- confirm no other
+ // file needs them.
+ inline void nextBuffer();
+ inline signed char readByte();
+ inline void readHeader();
+
+ // stream the encoded bytes are read from
+ std::unique_ptr<SeekableInputStream> inputStream;
+ // values left in the current run
+ size_t remainingValues;
+ // the repeated byte when the current run is a repeat
+ char value;
+ // unread part of the chunk most recently obtained from inputStream
+ const char* bufferStart;
+ const char* bufferEnd;
+ // true when the current run is a repeat, false for a literal run
+ bool repeating;
+};
+
+// Run length byte encoder. A control byte is written before
+// each run with values 0 to 127 meaning 3 to 130 repetitions of the single
+// byte that follows. If the control byte is -1 to -128, 1 to 128 literal byte
+// values follow.
+class ByteRleCoder : public RleCoder {
+ public:
+ explicit ByteRleCoder(std::unique_ptr<SeekableOutputStream> stream)
+ : output(std::move(stream)), literals(MAX_LITERAL_SIZE) {}
+ ~ByteRleCoder() {}
+
+ // Writes the pending run, then flushes the output stream into os.
+ void flushToStream(OutputStream* os) override {
+ writeValues();
+ output->flushToStream(os);
+ }
+
+ // Writes the pending run to the output stream.
+ void flush() { writeValues(); }
+
+ uint64_t getStreamSize() override { return output->getStreamSize(); }
+
+ // Resets the output stream and discards any pending run.
+ void reset() override {
+ output->reset();
+ repeat = false;
+ tailRunLength = 0;
+ numLiterals = 0;
+ }
+
+ uint64_t getEstimatedSpaceNeeded() override {
+ // This is the maximal space used.
+ // It might not be accurate.
+ return output->getEstimatedSpaceNeeded() + sizeof(int8_t) * numLiterals +
+ sizeof(int8_t) /* control byte*/;
+ }
+
+ // Encodes numValues bytes starting at data; when notNull is given, entries
+ // whose mask byte is 0 are not encoded.
+ void write(void* data, uint64_t numValues, const char* notNull) override {
+ int8_t* d = reinterpret_cast<int8_t*>(data);
+
+ if (notNull) {
+ for (uint64_t i = 0; i < numValues; i++) {
+ if (notNull[i]) {
+ write(d[i]);
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < numValues; i++) {
+ write(d[i]);
+ }
+ }
+ }
+
+ // Adds one byte to the pending run. In repeat mode numLiterals is the run
+ // length and only literals[0] is meaningful; otherwise literals[0 ..
+ // numLiterals) holds the pending literal bytes.
+ void write(int8_t value) {
+ if (numLiterals == 0) {
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0]) {
+ numLiterals += 1;
+ if (numLiterals == MAX_REPEAT_SIZE) {
+ writeValues();
+ }
+ } else {
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ }
+ } else {
+ if (value == literals[numLiterals - 1]) {
+ tailRunLength += 1;
+ } else {
+ tailRunLength = 1;
+ }
+ if (tailRunLength == MIN_REPEAT_SIZE) {
+ if (numLiterals + 1 == MIN_REPEAT_SIZE) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ // The last literals are the start of a repeat: write the literals
+ // before them, then restart in repeat mode with this value.
+ numLiterals -= MIN_REPEAT_SIZE - 1;
+ writeValues();
+ literals[0] = value;
+ repeat = true;
+ numLiterals = MIN_REPEAT_SIZE;
+ }
+ } else {
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
+ }
+ }
+ }
+
+ private:
+ // Writes the pending run (control byte plus payload) and clears the state.
+ void writeValues() {
+ if (numLiterals != 0) {
+ if (repeat) {
+ assert(numLiterals - MIN_REPEAT_SIZE >= 0);
+ output->write<int8_t>(numLiterals - MIN_REPEAT_SIZE);
+ output->write(reinterpret_cast<const char*>(literals.data()),
+ sizeof(int8_t) * 1);
+ } else {
+ output->write<int8_t>(-numLiterals);
+ output->write(reinterpret_cast<const char*>(literals.data()),
+ numLiterals * sizeof(int8_t));
+ }
+ repeat = false;
+ tailRunLength = 0;
+ numLiterals = 0;
+ }
+ }
+
+ private:
+ // shortest run written as a repeat
+ const int32_t MIN_REPEAT_SIZE = 3;
+ // longest literal run
+ const int32_t MAX_LITERAL_SIZE = 128;
+ // longest repeated run
+ const int32_t MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
+
+ std::unique_ptr<SeekableOutputStream> output;
+ std::vector<int8_t> literals;
+ int32_t numLiterals = 0;
+ bool repeat = false;
+ // number of equal bytes at the end of the pending literals
+ int32_t tailRunLength = 0;
+};
+
+// Create a byte RLE coder.
+// @param kind The compression kind of the block compressor the coder
+// writes to
+// @return The coder
+std::unique_ptr<ByteRleCoder> createByteRleCoder(CompressionKind kind);
+
+// Boolean encoder layered on the byte RLE coder: values are packed eight per
+// byte, most significant bit first, and the bytes are run length encoded.
+class BooleanRleEncoderImpl : public ByteRleCoder {
+ public:
+ BooleanRleEncoderImpl(std::unique_ptr<SeekableOutputStream> output);
+ virtual ~BooleanRleEncoderImpl() override;
+
+ // Encode numValues booleans; a null data pointer means all values are
+ // true, and entries masked out by notNull are not encoded.
+ virtual void write(const char* data, uint64_t numValues, const char* notNull);
+
+ // NOTE(review): ByteRleCoder::flush() is not virtual, so a call through a
+ // ByteRleCoder pointer will not reach this override -- confirm callers.
+ virtual void flush();
+ virtual void flushToStream(OutputStream* stream) override;
+
+ private:
+ // free bit slots left in `current` (8 means empty)
+ int bitsRemained;
+ // byte being assembled
+ char current;
+};
+// Create a boolean RLE encoder.
+// @param kind The compression kind of the block compressor the encoder
+// writes to
+// @return The encoder
+std::unique_ptr<BooleanRleEncoderImpl> createBooleanRleEncoderImpl(
+ CompressionKind kind);
+
+} // end of namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_BYTE_RLE_H_
diff --git a/depends/storage/src/storage/format/orc/column-printer.cc b/depends/storage/src/storage/format/orc/column-printer.cc
new file mode 100644
index 0000000..896d1db
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/column-printer.cc
@@ -0,0 +1,613 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/column-printer.h"
+
+#include <time.h>
+
+#include <limits>
+#include <sstream>
+#include <stdexcept>
+#include <typeinfo>
+
+#ifdef __clang__
+#pragma clang diagnostic ignored "-Wformat-security"
+#endif
+
+namespace orc {
+
+// Appends a single character to the output buffer.
+static void writeChar(std::string& file, char ch) {  // NOLINT
+  file.push_back(ch);
+}
+
+// Appends a NUL-terminated string to the output buffer.
+void writeString(std::string& file, const char* ptr) {  // NOLINT
+  const size_t length = strlen(ptr);
+  file.append(ptr, length);
+}
+
+// Binds the printer to the string it appends to; no batch is attached until
+// reset() is called.
+ColumnPrinter::ColumnPrinter(std::string& _buffer)  // NOLINT
+    : buffer(_buffer) {
+  hasNulls = false;
+  notNull = nullptr;
+}
+
+// The buffer is only referenced, so there is nothing to release.
+ColumnPrinter::~ColumnPrinter() {}
+
+// Attaches the null information of `batch`: notNull points at the batch's
+// mask only when the batch reports that it contains nulls.
+void ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  hasNulls = batch.hasNulls;
+  notNull = hasNulls ? batch.notNull.data() : nullptr;
+}
+
+// Creates the printer matching `type`, recursing into subtypes for compound
+// types (through the printers' constructors).
+// @param buffer string the printer appends its output to
+// @param type column type; a null pointer yields a printer that writes "null"
+// @return the printer; LOG_ERROR is invoked for an unknown kind
+// NOTE(review): if LOG_ERROR returns instead of throwing, the result for an
+// unknown kind is an empty unique_ptr -- confirm.
+std::unique_ptr<ColumnPrinter> createColumnPrinter(
+ std::string& buffer, // NOLINT
+ const Type* type) {
+ ColumnPrinter* result = nullptr;
+ if (type == nullptr) {
+ result = new VoidColumnPrinter(buffer);
+ } else {
+ switch (static_cast<int64_t>(type->getKind())) {
+ case BOOLEAN:
+ result = new BooleanColumnPrinter(buffer);
+ break;
+
+ case SHORT:
+ result = new ShortColumnPrinter(buffer);
+ break;
+
+ case INT:
+ result = new IntColumnPrinter(buffer);
+ break;
+
+ case BYTE:
+ case LONG:
+ result = new LongColumnPrinter(buffer);
+ break;
+
+ case FLOAT:
+ result = new FloatColumnPrinter(buffer, *type);
+ break;
+
+ case DOUBLE:
+ result = new DoubleColumnPrinter(buffer, *type);
+ break;
+
+ case STRING:
+ case VARCHAR:
+ case CHAR:
+ result = new StringColumnPrinter(buffer);
+ break;
+
+ case BINARY:
+ result = new BinaryColumnPrinter(buffer);
+ break;
+
+ case TIMESTAMP:
+ result = new TimestampColumnPrinter(buffer);
+ break;
+
+ case LIST:
+ result = new ListColumnPrinter(buffer, *type);
+ break;
+
+ case MAP:
+ result = new MapColumnPrinter(buffer, *type);
+ break;
+
+ case STRUCT:
+ result = new StructColumnPrinter(buffer, *type);
+ break;
+
+ case DECIMAL:
+ // precision 0 or above 18 uses the 128-bit printer
+ if (type->getPrecision() == 0 || type->getPrecision() > 18) {
+ result = new Decimal128ColumnPrinter(buffer);
+ } else {
+ result = new Decimal64ColumnPrinter(buffer);
+ }
+ break;
+
+ case DATE:
+ result = new DateColumnPrinter(buffer);
+ break;
+
+ case TIME:
+ result = new TimeColumnPrinter(buffer);
+ break;
+
+ case UNION:
+ result = new UnionColumnPrinter(buffer, *type);
+ break;
+
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "unknown batch type");
+ }
+ }
+ return std::unique_ptr<ColumnPrinter>(result);
+}
+
+// Printer used when no type is available; it carries no state of its own.
+VoidColumnPrinter::VoidColumnPrinter(std::string& buffer)
+    : ColumnPrinter(buffer) {}
+
+// Ignores the batch: every row is printed as "null" regardless of its content.
+void VoidColumnPrinter::reset(const ColumnVectorBatch&) {}
+
+// Always prints "null", whatever the row.
+void VoidColumnPrinter::printRow(uint64_t) {
+  writeString(buffer, "null");
+}
+
+// `type` is accepted for signature symmetry with the other printers and is
+// not used here.
+FloatColumnPrinter::FloatColumnPrinter(std::string& buffer,
+                                       const Type& type)  // NOLINT
+    : ColumnPrinter(buffer), data(nullptr) {}
+
+// Attaches a FloatVectorBatch; the dynamic_cast throws std::bad_cast for any
+// other batch type.
+void FloatColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const FloatVectorBatch& floats = dynamic_cast<const FloatVectorBatch&>(batch);
+  data = floats.data.data();
+}
+
+// Prints the value with 7 significant digits, or "null" for a null row.
+void FloatColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  char text[64];
+  snprintf(text, sizeof(text), "%.7g", data[rowId]);
+  writeString(buffer, text);
+}
+
+// `type` is accepted for signature symmetry with the other printers and is
+// not used here.
+DoubleColumnPrinter::DoubleColumnPrinter(std::string& buffer,
+                                         const Type& type)  // NOLINT
+    : ColumnPrinter(buffer), data(nullptr) {}
+
+// Attaches a DoubleVectorBatch; the dynamic_cast throws std::bad_cast for any
+// other batch type.
+void DoubleColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const DoubleVectorBatch& doubles =
+      dynamic_cast<const DoubleVectorBatch&>(batch);
+  data = doubles.data.data();
+}
+
+// Prints the value with 14 significant digits, or "null" for a null row.
+void DoubleColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  char text[64];
+  snprintf(text, sizeof(text), "%.14g", data[rowId]);
+  writeString(buffer, text);
+}
+
+// Starts detached from any batch, with scale 0.
+Decimal64ColumnPrinter::Decimal64ColumnPrinter(std::string& buffer)
+    : ColumnPrinter(buffer), data(nullptr), scale(0) {}
+
+// Attaches a Decimal64VectorBatch and picks up its values and scale; the
+// dynamic_cast throws std::bad_cast for any other batch type.
+void Decimal64ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const Decimal64VectorBatch& decimals =
+      dynamic_cast<const Decimal64VectorBatch&>(batch);
+  data = decimals.values.data();
+  scale = decimals.scale;
+}
+
+// Formats an unscaled decimal as text.
+// @param value the unscaled integer value
+// @param scale number of digits after the decimal point; 0 prints the plain
+//        integer
+// @return e.g. (12345, 2) -> "123.45", (-5, 3) -> "-0.005"
+std::string toDecimalString(int64_t value, int32_t scale) {
+  std::stringstream buffer;
+  if (scale == 0) {
+    buffer << value;
+    return buffer.str();
+  }
+  std::string sign = "";
+  // Work on the magnitude as an unsigned value: negating INT64_MIN as a signed
+  // integer overflows, which previously produced a doubled minus sign.
+  uint64_t magnitude = static_cast<uint64_t>(value);
+  if (value < 0) {
+    sign = "-";
+    magnitude = 0 - magnitude;
+  }
+  buffer << magnitude;
+  std::string str = buffer.str();
+  int32_t len = static_cast<int32_t>(str.length());
+  if (len > scale) {
+    return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
+           str.substr(static_cast<size_t>(len - scale),
+                      static_cast<size_t>(scale));
+  } else if (len == scale) {
+    return sign + "0." + str;
+  } else {
+    // Fewer digits than the scale: pad with zeros after the decimal point.
+    std::string result = sign + "0.";
+    for (int32_t i = 0; i < scale - len; ++i) {
+      result += "0";
+    }
+    return result + str;
+  }
+}
+
+// Prints the row as a decimal with the batch's scale, or "null".
+void Decimal64ColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  const std::string text = toDecimalString(data[rowId], scale);
+  writeString(buffer, text.c_str());
+}
+
+// Starts detached from any batch, with scale 0.
+Decimal128ColumnPrinter::Decimal128ColumnPrinter(std::string& buffer)
+    : ColumnPrinter(buffer), data(nullptr), scale(0) {}
+
+// Attaches a Decimal128VectorBatch and picks up its values and scale; the
+// dynamic_cast throws std::bad_cast for any other batch type.
+void Decimal128ColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const Decimal128VectorBatch& decimals =
+      dynamic_cast<const Decimal128VectorBatch&>(batch);
+  data = decimals.values.data();
+  scale = decimals.scale;
+}
+
+// Prints the row using the value's own toDecimalString(scale), or "null".
+void Decimal128ColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  writeString(buffer, data[rowId].toDecimalString(scale).c_str());
+}
+
+// Starts detached from any batch.
+StringColumnPrinter::StringColumnPrinter(std::string& buffer)
+    : ColumnPrinter(buffer), start(nullptr), length(nullptr) {}
+
+// Attaches a BytesVectorBatch (per-row start pointers and lengths); the
+// dynamic_cast throws std::bad_cast for any other batch type.
+void StringColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const BytesVectorBatch& bytes = dynamic_cast<const BytesVectorBatch&>(batch);
+  start = bytes.data.data();
+  length = bytes.length.data();
+}
+
+// Prints the row as a double-quoted string, or "null". Backslash, double
+// quote and the \b \f \n \r \t control characters are written as two-character
+// escapes; every other byte is copied unchanged.
+void StringColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  writeChar(buffer, '"');
+  for (int64_t i = 0; i < length[rowId]; ++i) {
+    const char ch = static_cast<char>(start[rowId][i]);
+    const char* escaped = nullptr;
+    switch (ch) {
+      case '\\':
+        escaped = "\\\\";
+        break;
+      case '\b':
+        escaped = "\\b";
+        break;
+      case '\f':
+        escaped = "\\f";
+        break;
+      case '\n':
+        escaped = "\\n";
+        break;
+      case '\r':
+        escaped = "\\r";
+        break;
+      case '\t':
+        escaped = "\\t";
+        break;
+      case '"':
+        escaped = "\\\"";
+        break;
+      default:
+        break;
+    }
+    if (escaped != nullptr) {
+      writeString(buffer, escaped);
+    } else {
+      writeChar(buffer, ch);
+    }
+  }
+  writeChar(buffer, '"');
+}
+
+// Creates the printer for the element type (subtype 0), sharing the same
+// output buffer.
+ListColumnPrinter::ListColumnPrinter(std::string& buffer, const Type& type)
+    : ColumnPrinter(buffer), offsets(nullptr) {
+  const Type* elementType = type.getSubtype(0);
+  elementPrinter = createColumnPrinter(buffer, elementType);
+}
+
+// Attaches a ListVectorBatch: its offsets here, its elements batch to the
+// element printer. The dynamic_cast throws std::bad_cast for other batches.
+void ListColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const ListVectorBatch& lists = dynamic_cast<const ListVectorBatch&>(batch);
+  offsets = lists.offsets.data();
+  elementPrinter->reset(*lists.elements);
+}
+
+// Prints the row as "[e0, e1, ...]" using the elements in
+// [offsets[rowId], offsets[rowId + 1]), or "null".
+void ListColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  writeChar(buffer, '[');
+  for (int64_t element = offsets[rowId]; element < offsets[rowId + 1];
+       ++element) {
+    const bool isFirst = (element == offsets[rowId]);
+    if (!isFirst) {
+      writeString(buffer, ", ");
+    }
+    elementPrinter->printRow(static_cast<uint64_t>(element));
+  }
+  writeChar(buffer, ']');
+}
+
+// Creates the printers for the key type (subtype 0) and the value type
+// (subtype 1), sharing the same output buffer.
+MapColumnPrinter::MapColumnPrinter(std::string& buffer, const Type& type)
+    : ColumnPrinter(buffer), offsets(nullptr) {
+  const Type* keyType = type.getSubtype(0);
+  keyPrinter = createColumnPrinter(buffer, keyType);
+  const Type* valueType = type.getSubtype(1);
+  elementPrinter = createColumnPrinter(buffer, valueType);
+}
+
+// Attaches a MapVectorBatch: offsets here, keys and elements to the child
+// printers. The dynamic_cast throws std::bad_cast for other batches.
+void MapColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const MapVectorBatch& maps = dynamic_cast<const MapVectorBatch&>(batch);
+  offsets = maps.offsets.data();
+  keyPrinter->reset(*maps.keys);
+  elementPrinter->reset(*maps.elements);
+}
+
+// Prints the row as a list of {"key": k, "value": v} objects built from the
+// entries in [offsets[rowId], offsets[rowId + 1]), or "null".
+void MapColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  writeChar(buffer, '[');
+  for (int64_t entry = offsets[rowId]; entry < offsets[rowId + 1]; ++entry) {
+    const bool isFirst = (entry == offsets[rowId]);
+    if (!isFirst) {
+      writeString(buffer, ", ");
+    }
+    const uint64_t childRow = static_cast<uint64_t>(entry);
+    writeString(buffer, "{\"key\": ");
+    keyPrinter->printRow(childRow);
+    writeString(buffer, ", \"value\": ");
+    elementPrinter->printRow(childRow);
+    writeChar(buffer, '}');
+  }
+  writeChar(buffer, ']');
+}
+
+// Creates one child printer per subtype; the raw pointers stored in
+// fieldPrinter are owned by this object and deleted in the destructor.
+UnionColumnPrinter::UnionColumnPrinter(std::string& buffer, const Type& type)
+    : ColumnPrinter(buffer), tags(nullptr), offsets(nullptr) {
+  for (unsigned int child = 0; child < type.getSubtypeCount(); ++child) {
+    std::unique_ptr<ColumnPrinter> printer =
+        createColumnPrinter(buffer, type.getSubtype(child));
+    fieldPrinter.push_back(printer.release());
+  }
+}
+
+// Deletes the child printers created in the constructor.
+UnionColumnPrinter::~UnionColumnPrinter() {
+  for (ColumnPrinter* printer : fieldPrinter) {
+    delete printer;
+  }
+}
+
+// Attaches a UnionVectorBatch: tags and offsets here, child i to child
+// printer i. The dynamic_cast throws std::bad_cast for other batches.
+void UnionColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  const UnionVectorBatch& unions = dynamic_cast<const UnionVectorBatch&>(batch);
+  tags = unions.tags.data();
+  offsets = unions.offsets.data();
+  size_t child = 0;
+  while (child < fieldPrinter.size()) {
+    fieldPrinter[child]->reset(*(unions.children[child]));
+    ++child;
+  }
+}
+
+// Prints the row as {"tag": t, "value": v}, where v is printed by the child
+// printer selected by the tag at the row's offset in that child, or "null".
+void UnionColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+  } else {
+    writeString(buffer, "{\"tag\": ");
+    char numBuffer[64];
+    // %lld requires a long long argument; int64_t is `long` on LP64 Linux,
+    // so cast to the exact type the conversion specifier expects.
+    snprintf(numBuffer, sizeof(numBuffer), "%lld",
+             static_cast<long long>(tags[rowId]));
+    writeString(buffer, numBuffer);
+    writeString(buffer, ", \"value\": ");
+    fieldPrinter[tags[rowId]]->printRow(offsets[rowId]);
+    writeChar(buffer, '}');
+  }
+}
+
+// Records the name and type string of every field and creates one child
+// printer per field; the raw pointers stored in fieldPrinter are owned by
+// this object and deleted in the destructor.
+StructColumnPrinter::StructColumnPrinter(std::string& buffer,
+                                         const Type& type)  // NOLINT
+    : ColumnPrinter(buffer) {
+  for (unsigned int field = 0; field < type.getSubtypeCount(); ++field) {
+    fieldNames.push_back(type.getFieldName(field));
+    fieldTypes.push_back(type.getSubtype(field)->toString());
+    std::unique_ptr<ColumnPrinter> printer =
+        createColumnPrinter(buffer, type.getSubtype(field));
+    fieldPrinter.push_back(printer.release());
+  }
+}
+
+StructColumnPrinter::~StructColumnPrinter() {
+  for (ColumnPrinter* child : fieldPrinter) {
+    delete child;  // field printers are held here as owned raw pointers
+  }
+}
+
+void StructColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  // dynamic_cast on a reference throws std::bad_cast for a non-struct batch.
+  const auto& sb = dynamic_cast<const StructVectorBatch&>(batch);
+  // Printer k is re-pointed at field k of the incoming struct batch.
+  size_t k = 0;
+  for (ColumnPrinter* child : fieldPrinter) child->reset(*sb.fields[k++]);
+}
+
+void StructColumnPrinter::printRow(uint64_t rowId) {
+  // A null struct prints as the bare word null.
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  // Otherwise: {"<name>-<type>": <value>, ...}, one entry per field.
+  writeChar(buffer, '{');
+  for (unsigned int f = 0; f < fieldPrinter.size(); ++f) {
+    if (f > 0) writeString(buffer, ", ");
+    writeChar(buffer, '"');
+    writeString(buffer, fieldNames[f].c_str());
+    writeChar(buffer, '-');
+    writeString(buffer, fieldTypes[f].c_str());
+    writeString(buffer, "\": ");
+    fieldPrinter[f]->printRow(rowId);
+  }
+  writeChar(buffer, '}');
+}
+
+DateColumnPrinter::DateColumnPrinter(std::string& buffer)
+ : ColumnPrinter(buffer), data(nullptr) {
+ // Nothing else to do: data stays null until reset() points it at a batch.
+}
+
+void DateColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  const time_t epochSeconds = data[rowId] * 24 * 60 * 60;  // value * 86400
+  struct tm utc;
+  gmtime_r(&epochSeconds, &utc);
+  char text[11];  // "YYYY-MM-DD" plus the terminating NUL
+  strftime(text, sizeof(text), "%Y-%m-%d", &utc);
+  writeChar(buffer, '"');
+  writeString(buffer, text);
+  writeChar(buffer, '"');
+}
+
+void DateColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);  // base-class state first
+ data = dynamic_cast<const LongVectorBatch&>(batch).data.data();  // throws std::bad_cast if batch is not a LongVectorBatch
+}
+
+TimeColumnPrinter::TimeColumnPrinter(std::string& buffer)
+ : ColumnPrinter(buffer), data(nullptr) {
+ // Nothing else to do: data stays null until reset() points it at a batch.
+}
+
+void TimeColumnPrinter::printRow(uint64_t rowId) {
+ if (hasNulls && !notNull[rowId]) {
+ writeString(buffer, "null");
+ } else {  // NOTE(review): same body as DateColumnPrinter::printRow - confirm this is intended for time values
+ const time_t timeValue = data[rowId] * 24 * 60 * 60;  // value * 86400
+ struct tm tmValue;
+ gmtime_r(&timeValue, &tmValue);  // broken-down UTC time
+ char timeBuffer[11];  // room for "YYYY-MM-DD" and the NUL
+ strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d", &tmValue);  // date part only
+ writeChar(buffer, '"');
+ writeString(buffer, timeBuffer);
+ writeChar(buffer, '"');
+ }
+}
+
+void TimeColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);  // base-class state first
+ data = dynamic_cast<const LongVectorBatch&>(batch).data.data();  // throws std::bad_cast if batch is not a LongVectorBatch
+}
+
+BooleanColumnPrinter::BooleanColumnPrinter(std::string& buffer)
+ : ColumnPrinter(buffer), data(nullptr) {
+ // Nothing else to do: data stays null until reset() points it at a batch.
+}
+
+void BooleanColumnPrinter::printRow(uint64_t rowId) {
+  // Prints exactly one of: null, true (non-zero stored value), false (zero).
+  // data[rowId] is only read when the row is not null.
+  const bool isNull = hasNulls && !notNull[rowId];
+  const char* text = isNull ? "null" : (data[rowId] ? "true" : "false");
+  writeString(buffer, text);
+}
+
+void BooleanColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);  // base-class state first
+ data = dynamic_cast<const LongVectorBatch&>(batch).data.data();  // booleans arrive as int64_t values of a LongVectorBatch
+}
+
+BinaryColumnPrinter::BinaryColumnPrinter(std::string& buffer)
+ : ColumnPrinter(buffer), start(nullptr), length(nullptr) {
+ // Nothing else to do: start/length stay null until reset() is called.
+}
+
+void BinaryColumnPrinter::printRow(uint64_t rowId) {
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+    return;
+  }
+  // Render the row as a bracketed, comma-separated list of byte values.
+  char numBuffer[64];
+  writeChar(buffer, '[');
+  for (int64_t pos = 0; pos < length[rowId]; ++pos) {
+    if (pos > 0) writeString(buffer, ", ");
+    // Mask to 0..255 so bytes with the high bit set do not print negative.
+    const int byteValue = static_cast<int>(start[rowId][pos]) & 0xff;
+    snprintf(numBuffer, sizeof(numBuffer), "%d", byteValue);
+    writeString(buffer, numBuffer);
+  }
+  writeChar(buffer, ']');
+}
+
+void BinaryColumnPrinter::reset(const ColumnVectorBatch& batch) {
+ ColumnPrinter::reset(batch);  // base-class state first
+ start = dynamic_cast<const BytesVectorBatch&>(batch).data.data();  // per-row pointers to the bytes
+ length = dynamic_cast<const BytesVectorBatch&>(batch).length.data();  // per-row byte counts
+}
+
+TimestampColumnPrinter::TimestampColumnPrinter(std::string& buffer)
+ : ColumnPrinter(buffer), seconds(nullptr), nanoseconds(nullptr) {
+ // Nothing else to do: both pointers stay null until reset() is called.
+}
+
+void TimestampColumnPrinter::printRow(uint64_t rowId) {
+  // Prints "YYYY-MM-DD HH:MM:SS.f" (UTC, quoted) or null; trailing zeros of
+  // the nanosecond fraction are dropped, keeping at least one digit.
+  const int64_t NANO_DIGITS = 9;
+  if (hasNulls && !notNull[rowId]) {
+    writeString(buffer, "null");
+  } else {
+    int64_t nanos = nanoseconds[rowId];
+    time_t secs = static_cast<time_t>(seconds[rowId]);
+    struct tm tmValue;
+    gmtime_r(&secs, &tmValue);
+    char timeBuffer[20];
+    strftime(timeBuffer, sizeof(timeBuffer), "%Y-%m-%d %H:%M:%S", &tmValue);
+    writeChar(buffer, '"');
+    writeString(buffer, timeBuffer);
+    writeChar(buffer, '.');
+    // remove trailing zeros off the back of the nanos value.
+    int64_t zeroDigits = 0;
+    if (nanos == 0) {
+      zeroDigits = 8;  // keep a single "0" digit after the decimal point
+    } else {
+      while (nanos % 10 == 0) {
+        nanos /= 10;
+        zeroDigits += 1;
+      }
+    }
+    char numBuffer[64];
+    // %lld needs a long long argument; int64_t may be plain long, so cast.
+    snprintf(numBuffer, sizeof(numBuffer), "%0*lld\"",
+             static_cast<int>(NANO_DIGITS - zeroDigits),
+             static_cast<long long>(nanos));  // NOLINT(runtime/int)
+    writeString(buffer, numBuffer);
+  }
+}
+
+void TimestampColumnPrinter::reset(const ColumnVectorBatch& batch) {
+  ColumnPrinter::reset(batch);
+  // Seconds and nanoseconds come from two separate buffers of the batch.
+  const auto& stamped = dynamic_cast<const TimestampVectorBatch&>(batch);
+  seconds = stamped.data.data();
+  nanoseconds = stamped.nanoseconds.data();
+}
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/column-printer.h b/depends/storage/src/storage/format/orc/column-printer.h
new file mode 100644
index 0000000..01b31cd
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/column-printer.h
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_COLUMN_PRINTER_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_COLUMN_PRINTER_H_
+
+#include <stdio.h>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "storage/format/orc/input-stream.h"
+#include "storage/format/orc/type.h"
+#include "storage/format/orc/vector.h"
+
+namespace orc {
+
+extern void writeString(std::string& file, const char* ptr); // NOLINT
+
+class ColumnPrinter {  // abstract base: derived printers append row text to buffer
+ protected:
+ std::string& buffer;  // output string, held by reference
+ bool hasNulls;  // when false, notNull is not consulted by the printers
+ const char* notNull;  // per-row flags; a zero entry makes the row print as null
+
+ public:
+ explicit ColumnPrinter(std::string&);
+ virtual ~ColumnPrinter();
+ virtual void printRow(uint64_t rowId) = 0;  // rowId indexes the batch given to reset()
+ // should be called once at the start of each batch of rows
+ virtual void reset(const ColumnVectorBatch& batch);
+};
+
+std::unique_ptr<ColumnPrinter> createColumnPrinter(std::string&,
+ const Type* type);
+
+class VoidColumnPrinter : public ColumnPrinter {  // NOTE(review): presumably for columns without data - confirm in column-printer.cc
+ public:
+ explicit VoidColumnPrinter(std::string&);
+ ~VoidColumnPrinter() {}
+ void printRow(uint64_t rowId) override;  // defined out of line
+ void reset(const ColumnVectorBatch& batch) override;  // defined out of line
+};
+
+class BooleanColumnPrinter : public ColumnPrinter {  // prints true, false or null
+ private:
+ const int64_t* data;  // LongVectorBatch values; non-zero prints as true
+
+ public:
+ explicit BooleanColumnPrinter(std::string&);
+ ~BooleanColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;  // expects a LongVectorBatch
+};
+
+// Printer for columns read as a flat array of ElementType; each value is
+// rendered with operator<< on a string stream.
+template <class ElementType>
+class FixedSizeColumnPrinter : public ColumnPrinter {
+ private:
+  const ElementType* data;
+
+ public:
+  explicit FixedSizeColumnPrinter(std::string& buffer)  // NOLINT
+      : ColumnPrinter(buffer), data(nullptr) {}
+  ~FixedSizeColumnPrinter() {}
+
+  void reset(const ColumnVectorBatch& batch) override {
+    ColumnPrinter::reset(batch);
+    data = reinterpret_cast<const ElementType*>(batch.getData());
+  }
+
+  void printRow(uint64_t rowId) override {
+    if (hasNulls && !notNull[rowId]) {
+      writeString(buffer, "null");
+      return;
+    }
+    std::stringstream text;
+    text << data[rowId];
+    writeString(buffer, text.str().c_str());
+  }
+};
+
+// FixedSizeColumnPrinter instantiated for int64_t values.
+class LongColumnPrinter : public FixedSizeColumnPrinter<int64_t> {
+ public:
+  explicit LongColumnPrinter(std::string& buffer)  // NOLINT
+      : FixedSizeColumnPrinter<int64_t>(buffer) {}
+  ~LongColumnPrinter() {}
+};
+
+// FixedSizeColumnPrinter instantiated for int32_t values.
+class IntColumnPrinter : public FixedSizeColumnPrinter<int32_t> {
+ public:
+  explicit IntColumnPrinter(std::string& buffer)  // NOLINT
+      : FixedSizeColumnPrinter<int32_t>(buffer) {}
+  ~IntColumnPrinter() {}
+};
+
+// FixedSizeColumnPrinter instantiated for int16_t values.
+class ShortColumnPrinter : public FixedSizeColumnPrinter<int16_t> {
+ public:
+  explicit ShortColumnPrinter(std::string& buffer)  // NOLINT
+      : FixedSizeColumnPrinter<int16_t>(buffer) {}
+  ~ShortColumnPrinter() {}
+};
+
+class FloatColumnPrinter : public ColumnPrinter {  // members defined out of line
+ private:
+ const float* data;  // NOTE(review): presumably re-pointed by reset() - confirm
+
+ public:
+ explicit FloatColumnPrinter(std::string&, const Type& type);
+ virtual ~FloatColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class DoubleColumnPrinter : public ColumnPrinter {  // members defined out of line
+ private:
+ const double* data;  // NOTE(review): presumably re-pointed by reset() - confirm
+
+ public:
+ explicit DoubleColumnPrinter(std::string&, const Type& type);
+ virtual ~DoubleColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class TimestampColumnPrinter : public ColumnPrinter {  // prints "YYYY-MM-DD HH:MM:SS.fraction" in UTC
+ private:
+ const int64_t* seconds;  // TimestampVectorBatch::data, handed to gmtime_r as time_t
+ const int64_t* nanoseconds;  // TimestampVectorBatch::nanoseconds, printed as the fraction
+
+ public:
+ explicit TimestampColumnPrinter(std::string&);
+ ~TimestampColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;  // expects a TimestampVectorBatch
+};
+
+class DateColumnPrinter : public ColumnPrinter {  // prints a quoted "YYYY-MM-DD" or null
+ private:
+ const int64_t* data;  // LongVectorBatch values, multiplied by 86400 before gmtime_r
+
+ public:
+ explicit DateColumnPrinter(std::string&);
+ ~DateColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;  // expects a LongVectorBatch
+};
+
+class TimeColumnPrinter : public ColumnPrinter {  // NOTE(review): printRow matches DateColumnPrinter's - confirm
+ private:
+ const int64_t* data;  // LongVectorBatch values, multiplied by 86400 before gmtime_r
+
+ public:
+ explicit TimeColumnPrinter(std::string&);
+ ~TimeColumnPrinter() {}
+ void printRow(uint64_t rowId) override;  // emits a quoted "%Y-%m-%d" string or null
+ void reset(const ColumnVectorBatch& batch) override;  // expects a LongVectorBatch
+};
+
+class Decimal64ColumnPrinter : public ColumnPrinter {  // members defined out of line
+ private:
+ const int64_t* data;  // NOTE(review): presumably the unscaled values - confirm
+ int32_t scale;  // NOTE(review): presumably taken from the batch in reset() - confirm
+
+ public:
+ explicit Decimal64ColumnPrinter(std::string&);
+ ~Decimal64ColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class Decimal128ColumnPrinter : public ColumnPrinter {  // members defined out of line
+ private:
+ const Int128* data;  // NOTE(review): presumably the unscaled values - confirm
+ int32_t scale;  // NOTE(review): presumably taken from the batch in reset() - confirm
+
+ public:
+ explicit Decimal128ColumnPrinter(std::string&);
+ ~Decimal128ColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class StringColumnPrinter : public ColumnPrinter {  // members defined out of line
+ private:
+ const char* const* start;  // NOTE(review): presumably per-row pointers, as in BinaryColumnPrinter - confirm
+ const int64_t* length;  // NOTE(review): presumably per-row byte counts - confirm
+
+ public:
+ explicit StringColumnPrinter(std::string&);
+ virtual ~StringColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class BinaryColumnPrinter : public ColumnPrinter {  // prints each row as [b0, b1, ...] with bytes in 0..255
+ private:
+ const char* const* start;  // BytesVectorBatch::data: per-row pointer to the bytes
+ const int64_t* length;  // BytesVectorBatch::length: per-row byte count
+
+ public:
+ explicit BinaryColumnPrinter(std::string&);
+ virtual ~BinaryColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;  // expects a BytesVectorBatch
+};
+
+class ListColumnPrinter : public ColumnPrinter {  // members defined out of line
+ private:
+ const int64_t* offsets;  // NOTE(review): presumably row i spans [offsets[i], offsets[i+1]) - confirm
+ std::unique_ptr<ColumnPrinter> elementPrinter;  // owned printer for the element column
+
+ public:
+ ListColumnPrinter(std::string&, const Type& type);
+ virtual ~ListColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class MapColumnPrinter : public ColumnPrinter {  // prints [{"key": k, "value": v}, ...] or null
+ private:
+ const int64_t* offsets;  // entries of row i are [offsets[i], offsets[i + 1])
+ std::unique_ptr<ColumnPrinter> keyPrinter;  // prints the key of each entry
+ std::unique_ptr<ColumnPrinter> elementPrinter;  // prints the value of each entry
+
+ public:
+ MapColumnPrinter(std::string&, const Type& type);
+ virtual ~MapColumnPrinter() {}
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;
+};
+
+class UnionColumnPrinter : public ColumnPrinter {  // prints {"tag": t, "value": v} or null
+ private:
+ const unsigned char* tags;  // per-row index into fieldPrinter
+ const uint64_t* offsets;  // per-row row id passed to the selected child printer
+ std::vector<ColumnPrinter*> fieldPrinter;  // owned; deleted in the destructor
+
+ public:
+ UnionColumnPrinter(std::string&, const Type& type);
+ virtual ~UnionColumnPrinter();
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;  // expects a UnionVectorBatch
+};
+
+class StructColumnPrinter : public ColumnPrinter {  // prints {"<name>-<type>": value, ...} or null
+ private:
+ std::vector<ColumnPrinter*> fieldPrinter;  // owned; one per field, deleted in the destructor
+ std::vector<std::string> fieldNames;  // Type::getFieldName(i) for each field
+ std::vector<std::string> fieldTypes;  // toString() of each field's type
+
+ public:
+ StructColumnPrinter(std::string&, const Type& type);
+ virtual ~StructColumnPrinter();
+ void printRow(uint64_t rowId) override;
+ void reset(const ColumnVectorBatch& batch) override;  // expects a StructVectorBatch
+};
+
+} // namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_COLUMN_PRINTER_H_
diff --git a/depends/storage/src/storage/format/orc/data-buffer.cc b/depends/storage/src/storage/format/orc/data-buffer.cc
new file mode 100644
index 0000000..5018e02
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/data-buffer.cc
@@ -0,0 +1,67 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/data-buffer.h"
+
+#include "dbcommon/utils/cutils.h"
+#include "dbcommon/utils/global.h"
+
+#include "storage/format/orc/int128.h"
+
+namespace orc {
+
+template <class T>
+DataBuffer<T>::DataBuffer(dbcommon::MemoryPool& pool,  // NOLINT
+                          uint64_t newSize)
+    : memoryPool(pool), buf(nullptr), currentSize(0), currentCapacity(0) {
+  if (newSize != 0) resize(newSize);  // a zero size allocates nothing
+}
+
+template <class T>
+DataBuffer<T>::~DataBuffer() {
+  if (buf != nullptr) memoryPool.free(buf);  // hand the block back to the pool
+}
+
+template <class T>
+void DataBuffer<T>::resize(uint64_t newSize) {
+  // Reallocate an existing block, or make the first allocation.
+  const uint64_t bytes = sizeof(T) * newSize;
+  buf = buf ? memoryPool.realloc<T>(buf, bytes) : memoryPool.malloc<T>(bytes);
+  // NOTE(review): getSpace() presumably reports the usable size of the block
+  // just (re)allocated - confirm against dbcommon::MemoryPool.
+  currentCapacity = memoryPool.getSpace() / sizeof(T);
+  currentSize = newSize;
+}
+
+template class DataBuffer<bool>;  // explicit instantiations: the member definitions live in this .cc file
+template class DataBuffer<char>;
+template class DataBuffer<char*>;
+template class DataBuffer<float>;
+template class DataBuffer<double>;
+template class DataBuffer<Int128>;
+template class DataBuffer<int64_t>;
+template class DataBuffer<uint64_t>;
+template class DataBuffer<int32_t>;
+template class DataBuffer<uint32_t>;
+template class DataBuffer<int16_t>;
+template class DataBuffer<uint16_t>;
+template class DataBuffer<int8_t>;
+template class DataBuffer<uint8_t>;
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/data-buffer.h b/depends/storage/src/storage/format/orc/data-buffer.h
new file mode 100644
index 0000000..d32989d
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/data-buffer.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_DATA_BUFFER_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_DATA_BUFFER_H_
+
+#include <memory>
+
+#include "dbcommon/utils/memory-pool.h"
+
+namespace orc {
+
+// Resizable array of T backed by a dbcommon::MemoryPool; not copyable.
+template <class T>
+class DataBuffer {
+ private:
+  dbcommon::MemoryPool& memoryPool;
+  T* buf;
+  // current size
+  uint64_t currentSize;
+  // maximal capacity (actual allocated memory)
+  uint64_t currentCapacity;
+
+  // not implemented
+  DataBuffer(DataBuffer& buffer);             // NOLINT
+  DataBuffer& operator=(DataBuffer& buffer);  // NOLINT
+
+ public:
+  explicit DataBuffer(dbcommon::MemoryPool& pool,  // NOLINT
+                      uint64_t _size = 0);
+  virtual ~DataBuffer();
+
+  T* data() { return buf; }
+  const T* data() const { return buf; }
+
+  // Read-only queries are const so they also work through a const DataBuffer&.
+  uint64_t size() const { return currentSize; }
+  uint64_t capacity() const { return currentCapacity; }
+
+  T& operator[](uint64_t i) { return buf[i]; }  // no bounds checking
+
+  void resize(uint64_t _size);
+};
+
+} // namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_DATA_BUFFER_H_
diff --git a/depends/storage/src/storage/format/orc/exceptions.cc b/depends/storage/src/storage/format/orc/exceptions.cc
new file mode 100644
index 0000000..6d1ecc5
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/exceptions.cc
@@ -0,0 +1,58 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/exceptions.h"
+
+namespace orc {
+
+NotImplementedYet::NotImplementedYet(const std::string& what_arg)
+ : logic_error(what_arg) {
+ // Nothing to do beyond forwarding the message to std::logic_error.
+}
+
+NotImplementedYet::NotImplementedYet(const char* what_arg)
+ : logic_error(what_arg) {
+ // Nothing to do beyond forwarding the message to std::logic_error.
+}
+
+NotImplementedYet::NotImplementedYet(const NotImplementedYet& error)
+ : logic_error(error) {
+ // Copying is delegated entirely to the std::logic_error copy constructor.
+}
+
+NotImplementedYet::~NotImplementedYet() noexcept {
+ // No state of its own to release.
+}
+
+ParseError::ParseError(const std::string& what_arg) : runtime_error(what_arg) {
+ // Nothing to do beyond forwarding the message to std::runtime_error.
+}
+
+ParseError::ParseError(const char* what_arg) : runtime_error(what_arg) {
+ // Nothing to do beyond forwarding the message to std::runtime_error.
+}
+
+ParseError::ParseError(const ParseError& error) : runtime_error(error) {
+ // Copying is delegated entirely to the std::runtime_error copy constructor.
+}
+
+ParseError::~ParseError() noexcept {
+ // No state of its own to release.
+}
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/exceptions.h b/depends/storage/src/storage/format/orc/exceptions.h
new file mode 100644
index 0000000..98beecf
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/exceptions.h
@@ -0,0 +1,51 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_EXCEPTIONS_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_EXCEPTIONS_H_
+
+#include <stdexcept>
+#include <string>
+
+namespace orc {
+
+class NotImplementedYet : public std::logic_error {  // a std::logic_error subtype carrying only a message
+ public:
+ explicit NotImplementedYet(const std::string& what_arg);
+ explicit NotImplementedYet(const char* what_arg);
+ virtual ~NotImplementedYet() noexcept;
+ NotImplementedYet(const NotImplementedYet&);  // copyable
+
+ private:
+ NotImplementedYet& operator=(const NotImplementedYet&);  // private: assignment is not available to callers
+};
+
+class ParseError : public std::runtime_error {  // a std::runtime_error subtype carrying only a message
+ public:
+ explicit ParseError(const std::string& what_arg);
+ explicit ParseError(const char* what_arg);
+ virtual ~ParseError() noexcept;
+ ParseError(const ParseError&);  // copyable
+
+ private:
+ ParseError& operator=(const ParseError&);  // private: assignment is not available to callers
+};
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_EXCEPTIONS_H_
diff --git a/depends/storage/src/storage/format/orc/file-version.h b/depends/storage/src/storage/format/orc/file-version.h
new file mode 100644
index 0000000..25d2e89
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/file-version.h
@@ -0,0 +1,56 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_FILE_VERSION_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_FILE_VERSION_H_
+
+#include <string>
+
+namespace orc {
+
+// Immutable major/minor version pair that can be compared and printed.
+class FileVersion {
+ private:
+  uint32_t majorVersion;
+  uint32_t minorVersion;
+
+ public:
+  FileVersion(uint32_t major, uint32_t minor)
+      : majorVersion(major), minorVersion(minor) {}
+
+  uint32_t getMajor() const { return this->majorVersion; }
+  uint32_t getMinor() const { return this->minorVersion; }
+
+  bool operator==(const FileVersion& right) const {
+    return this->majorVersion == right.getMajor() &&
+           this->minorVersion == right.getMinor();
+  }
+
+  bool operator!=(const FileVersion& right) const { return !(*this == right); }
+
+  // Renders "<major>.<minor>". Built with std::to_string because this header
+  // includes only <string>; std::stringstream would also need <sstream>.
+  std::string toString() const {
+    return std::to_string(getMajor()) + '.' + std::to_string(getMinor());
+  }
+};
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_FILE_VERSION_H_
diff --git a/depends/storage/src/storage/format/orc/input-stream.cc b/depends/storage/src/storage/format/orc/input-stream.cc
new file mode 100644
index 0000000..c523c01
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/input-stream.cc
@@ -0,0 +1,40 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cerrno>
+
+#include "dbcommon/filesystem/file-system.h"
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/input-stream.h"
+
+namespace orc {
+
+std::unique_ptr<InputStream> readFile(dbcommon::FileSystem *fs,  // fs is handed on to the stream, not owned here
+ const std::string &path) {
+ return std::unique_ptr<InputStream>(new GeneralFileInputStream(fs, path));  // always the FileSystem-backed stream
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/input-stream.h b/depends/storage/src/storage/format/orc/input-stream.h
new file mode 100644
index 0000000..1d0550f
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/input-stream.h
@@ -0,0 +1,112 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_INPUT_STREAM_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_INPUT_STREAM_H_
+
+#include <fcntl.h>
+#include <stdio.h>
+#include <sys/mman.h>
+#include <sys/stat.h>
+#include <sys/types.h>
+#include <unistd.h>
+
+#include <cassert>
+#include <cerrno>
+#include <memory>
+#include <string>
+
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "storage/format/orc/exceptions.h"
+
+// The top level interface to ORC.
+namespace orc {
+
+// An abstract interface for providing ORC readers a stream of bytes.
+class InputStream {
+ public:
+ InputStream() {}
+ virtual ~InputStream() {}
+
+ // Get the total length of the file in bytes.
+ virtual uint64_t getLength() const = 0;
+
+ // Get the natural size for reads.
+ // @return the number of bytes that should be read at once
+ virtual uint64_t getNaturalReadSize() const = 0;
+
+ // Read length bytes from the file starting at offset into
+ // the buffer starting at buf.
+ // @param buf the starting position of a buffer.
+ // @param length the number of bytes to read.
+ // @param offset the position in the stream to read from.
+ virtual void read(void* buf, uint64_t length, uint64_t offset) = 0;
+
+ // Get the name of the stream for error messages.
+ virtual const std::string& getName() const = 0;
+ // Same parameters as read(); GeneralFileInputStream serves this call
+ // from a separate file handle.
+ virtual void readBloomFilter(void* buf, uint64_t length, uint64_t offset) = 0;
+};
+
+// InputStream that reads a file through a dbcommon::FileSystem.
+class GeneralFileInputStream : public InputStream {
+ public:
+  GeneralFileInputStream(dbcommon::FileSystem* fs, std::string fileName)
+      : fileName(fileName), fs(fs) {  // listed in member declaration order
+    file = fs->open(fileName.c_str(), O_RDONLY);
+    totalLength = fs->getFileLength(fileName.c_str());
+  }
+
+  virtual ~GeneralFileInputStream() {}
+  uint64_t getLength() const override { return totalLength; }
+  uint64_t getNaturalReadSize() const override { return 128 * 1024; }
+
+  void read(void* buf, uint64_t length, uint64_t offset) override {
+    assert(buf != nullptr);
+    // NOTE(review): fs->read()'s byte count is ignored; short reads pass.
+    fs->seek(file.get(), offset);
+    fs->read(file.get(), buf, length);
+  }
+
+  void readBloomFilter(void* buf, uint64_t length, uint64_t offset) override {
+    assert(buf != nullptr);
+    // Uses its own handle, opened on first call, separate from read()'s.
+    if (!bloomFilterHandler)
+      bloomFilterHandler = fs->open(fileName.c_str(), O_RDONLY);
+    fs->seek(bloomFilterHandler.get(), offset);
+    fs->read(bloomFilterHandler.get(), buf, length);
+  }
+
+  const std::string& getName() const override { return fileName; }
+
+ private:
+  std::string fileName;
+  std::unique_ptr<dbcommon::File> file = nullptr;
+  std::unique_ptr<dbcommon::File> bloomFilterHandler = nullptr;
+  uint64_t totalLength = 0;
+  dbcommon::FileSystem* fs = nullptr;
+};
+
+std::unique_ptr<InputStream> readFile(dbcommon::FileSystem* fs,
+ const std::string& path);
+
+} // end of namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_INPUT_STREAM_H_
diff --git a/depends/storage/src/storage/format/orc/int128.cc b/depends/storage/src/storage/format/orc/int128.cc
new file mode 100644
index 0000000..fa3b0de
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/int128.cc
@@ -0,0 +1,480 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+
+#include "dbcommon/log/logger.h"
+#include "storage/format/orc/int128.h"
+
+namespace orc {
+
+// Returns the largest representable value, 2**127 - 1: the high word holds
+// every bit except the sign bit and the low word holds all 64 bits.
+Int128 Int128::maximumValue() {
+  // The low word needs sixteen hex 'f' digits; one digit fewer leaves its
+  // top four bits clear and yields a value well below the real maximum.
+  return Int128(0x7fffffffffffffff, 0xffffffffffffffff);
+}
+
+// Returns the smallest representable value, -2**127: only the sign bit of
+// the high word is set.
+Int128 Int128::minimumValue() {
+  const int64_t signBitOnly = static_cast<int64_t>(0x8000000000000000);
+  const uint64_t emptyLowWord = 0x0;
+  return Int128(signBitOnly, emptyLowWord);
+}
+
+// Parses a base 10 string, optionally starting with '-', into an Int128.
+// An empty string yields zero. The digits are consumed in groups of at most
+// 18 so that every group fits in an int64_t; each group is converted with
+// std::stoll.
+// @param str the decimal representation to parse
+Int128::Int128(const std::string& str) {
+  lowbits = 0;
+  highbits = 0;
+  size_t length = str.length();
+  if (length > 0) {
+    bool isNegative = str[0] == '-';
+    size_t posn = isNegative ? 1 : 0;
+    while (posn < length) {
+      // Name the type explicitly: std::min needs both arguments to have the
+      // same type and size_t is not unsigned long on every platform.
+      size_t group = std::min<size_t>(18, length - posn);
+      int64_t chunk = std::stoll(str.substr(posn, group));
+      // multiple = 10**group, the weight of the digits accumulated so far.
+      int64_t multiple = 1;
+      for (size_t i = 0; i < group; ++i) {
+        multiple *= 10;
+      }
+      *this *= multiple;
+      *this += chunk;
+      posn += group;
+    }
+    if (isNegative) {
+      negate();
+    }
+  }
+}
+
+// Multiplies this value by right in place. The product is truncated to
+// 128 bits.
+// @param right the number to multiply by
+// @return *this
+Int128& Int128::operator*=(const Int128& right) {
+  const uint64_t INT_MASK = 0xffffffff;
+  // 2**32, derived from a 64-bit operand: shifting the long literal 1l by
+  // 32 is undefined on platforms where long is only 32 bits wide.
+  const uint64_t CARRY_BIT = INT_MASK + 1;
+
+  // Break the left and right numbers into 32 bit chunks
+  // so that we can multiply them without overflow.
+  uint64_t L0 = static_cast<uint64_t>(highbits) >> 32;
+  uint64_t L1 = static_cast<uint64_t>(highbits) & INT_MASK;
+  uint64_t L2 = lowbits >> 32;
+  uint64_t L3 = lowbits & INT_MASK;
+  uint64_t R0 = static_cast<uint64_t>(right.highbits) >> 32;
+  uint64_t R1 = static_cast<uint64_t>(right.highbits) & INT_MASK;
+  uint64_t R2 = right.lowbits >> 32;
+  uint64_t R3 = right.lowbits & INT_MASK;
+
+  // Low 64 bits: L3*R3 plus the two cross products shifted up by 32 bits.
+  // A carry out of either 64-bit addition is worth 2**32 in the high word.
+  uint64_t product = L3 * R3;
+  lowbits = product & INT_MASK;
+  uint64_t sum = product >> 32;
+  product = L2 * R3;
+  sum += product;
+  highbits = sum < product ? CARRY_BIT : 0;
+  product = L3 * R2;
+  sum += product;
+  if (sum < product) {
+    highbits += CARRY_BIT;
+  }
+  lowbits += sum << 32;
+  highbits += static_cast<int64_t>(sum >> 32);
+  // High 64 bits: the partial products whose weight is 2**64 and 2**96.
+  // Anything at or above 2**128 is dropped.
+  highbits += L1 * R3 + L2 * R2 + L3 * R1;
+  highbits += (L0 * R3 + L1 * R2 + L2 * R1 + L3 * R0) << 32;
+  return *this;
+}
+
+// Expands the given value into an array of ints so that we can work on
+// it. The array will be converted to an absolute value and the wasNegative
+// flag will be set appropriately. The array will remove leading zeros from
+// the value, most significant 32-bit digit first.
+// @param array an array of length 4 to set with the value
+// @param wasNegative a flag for whether the value was original negative
+// @result the output length of the array (0 for the value zero, up to 4)
+int64_t Int128::fillInArray(uint32_t* array, bool& wasNegative) const {
+  uint64_t high;
+  uint64_t low;
+  if (highbits < 0) {
+    // Two's complement negation; a zero low word carries into the high word.
+    low = ~lowbits + 1;
+    high = static_cast<uint64_t>(~highbits);
+    if (low == 0) {
+      high += 1;
+    }
+    wasNegative = true;
+  } else {
+    low = lowbits;
+    high = static_cast<uint64_t>(highbits);
+    wasNegative = false;
+  }
+  if (high != 0) {
+    if (high > UINT32_MAX) {
+      array[0] = static_cast<uint32_t>(high >> 32);
+      array[1] = static_cast<uint32_t>(high);
+      array[2] = static_cast<uint32_t>(low >> 32);
+      array[3] = static_cast<uint32_t>(low);
+      return 4;
+    } else {
+      array[0] = static_cast<uint32_t>(high);
+      array[1] = static_cast<uint32_t>(low >> 32);
+      array[2] = static_cast<uint32_t>(low);
+      return 3;
+    }
+  } else if (low > UINT32_MAX) {
+    // Strictly greater: exactly UINT32_MAX still fits in one digit, and
+    // reporting it as two digits would leave a zero leading digit.
+    array[0] = static_cast<uint32_t>(low >> 32);
+    array[1] = static_cast<uint32_t>(low);
+    return 2;
+  } else if (low == 0) {
+    return 0;
+  } else {
+    array[0] = static_cast<uint32_t>(low);
+    return 1;
+  }
+}
+
+// Returns the 1-based position of the highest set bit of x: 1 is the LSB,
+// 32 is the MSB, and 0 means that no bit is set. We can replace this with
+// bsrq asm instruction on x64.
+int64_t fls(uint32_t x) {
+  int64_t position = 0;
+  for (uint32_t rest = x; rest != 0; rest >>= 1) {
+    ++position;
+  }
+  return position;
+}
+
+// Shifts a multi-digit number left in place. array[0] is the most
+// significant digit; bits shifted out of array[0] are lost.
+// @param array the number to shift, must have length elements
+// @param length the number of entries in the array
+// @param bits the number of bits to shift (0 <= bits < 32)
+void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits) {
+  if (length <= 0 || bits == 0) {
+    return;
+  }
+  const int64_t carryShift = 32 - bits;
+  const int64_t last = length - 1;
+  for (int64_t idx = 0; idx < last; ++idx) {
+    const uint32_t kept = array[idx] << bits;
+    const uint32_t carriedIn = array[idx + 1] >> carryShift;
+    array[idx] = kept | carriedIn;
+  }
+  array[last] <<= bits;
+}
+
+// Shifts a multi-digit number right in place. array[0] is the most
+// significant digit; bits shifted out of the last digit are lost.
+// @param array the number to shift, must have length elements
+// @param length the number of entries in the array
+// @param bits the number of bits to shift (0 <= bits < 32)
+void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits) {
+  if (length <= 0 || bits == 0) {
+    return;
+  }
+  const int64_t carryShift = 32 - bits;
+  for (int64_t idx = length - 1; idx >= 1; --idx) {
+    const uint32_t kept = array[idx] >> bits;
+    const uint32_t carriedIn = array[idx - 1] << carryShift;
+    array[idx] = kept | carriedIn;
+  }
+  array[0] >>= bits;
+}
+
+// Applies the signs to a quotient and remainder that were computed on
+// absolute values: the quotient is negated when exactly one operand was
+// negative, the remainder when the dividend was negative.
+void fixDivisionSigns(Int128& result, Int128& remainder,  // NOLINT
+                      bool dividendWasNegative, bool divisorWasNegative) {
+  const bool signsDiffer = dividendWasNegative != divisorWasNegative;
+  if (signsDiffer) {
+    result.negate();
+  }
+  if (dividendWasNegative) {
+    remainder.negate();
+  }
+}
+
+// Builds a non-negative Int128 from up to five 32-bit digits, most
+// significant digit first. With five digits the leading one has to be zero.
+// Any other length is reported through LOG_ERROR.
+void buildFromArray(Int128& value, uint32_t* array, int64_t length) {  // NOLINT
+  if (length == 0) {
+    value = 0;
+  } else if (length == 1) {
+    value = array[0];
+  } else if (length == 2) {
+    value = Int128(0, (static_cast<uint64_t>(array[0]) << 32) + array[1]);
+  } else if (length == 3) {
+    value =
+        Int128(array[0], (static_cast<uint64_t>(array[1]) << 32) + array[2]);
+  } else if (length == 4) {
+    value = Int128((static_cast<int64_t>(array[0]) << 32) + array[1],
+                   (static_cast<uint64_t>(array[2]) << 32) + array[3]);
+  } else if (length == 5) {
+    if (array[0] != 0) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Can't build Int128 with 5 ints.");
+    }
+    value = Int128((static_cast<int64_t>(array[1]) << 32) + array[2],
+                   (static_cast<uint64_t>(array[3]) << 32) + array[4]);
+  } else {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "Unsupported length for building Int128");
+  }
+}
+
+// Schoolbook long division of a multi-digit dividend by a divisor that fits
+// in one 32-bit digit. Works on absolute values and fixes the signs of the
+// quotient and remainder at the end.
+Int128 singleDivide(uint32_t* dividend, int64_t dividendLength,
+                    uint32_t divisor, Int128& remainder,  // NOLINT
+                    bool dividendWasNegative,             // NOLINT
+                    bool divisorWasNegative) {
+  uint32_t quotientDigits[5];
+  uint64_t running = 0;
+  for (int64_t pos = 0; pos < dividendLength; ++pos) {
+    // Bring down the next digit, then split off one quotient digit.
+    running = (running << 32) + dividend[pos];
+    quotientDigits[pos] = static_cast<uint32_t>(running / divisor);
+    running = running % divisor;
+  }
+  Int128 quotient;
+  buildFromArray(quotient, quotientDigits, dividendLength);
+  remainder = static_cast<int64_t>(running);
+  fixDivisionSigns(quotient, remainder, dividendWasNegative,
+                   divisorWasNegative);
+  return quotient;
+}
+
+// Divides this value by divisor without modifying this value.
+// Both operands are expanded into arrays of 32-bit digits holding their
+// absolute values, the division is carried out digit by digit, and the
+// signs are applied at the end by fixDivisionSigns.
+// @param divisor the number to divide by; zero is reported via LOG_ERROR
+// @param remainder set to the remainder of the division
+// @return the quotient
+Int128 Int128::divide(const Int128& divisor, Int128& remainder) const {
+ // Split the dividend and divisor into integer pieces so that we can
+ // work on them.
+ uint32_t dividendArray[5];
+ uint32_t divisorArray[4];
+ bool dividendWasNegative;
+ bool divisorWasNegative;
+ // leave an extra zero before the dividend
+ dividendArray[0] = 0;
+ int64_t dividendLength =
+ fillInArray(dividendArray + 1, dividendWasNegative) + 1;
+ int64_t divisorLength = divisor.fillInArray(divisorArray, divisorWasNegative);
+
+ // Handle some of the easy cases.
+ // dividendLength counts the extra leading zero, so this branch is taken
+ // when the dividend has fewer digits than the divisor: the quotient is 0
+ // and the whole dividend is the remainder.
+ if (dividendLength <= divisorLength) {
+ remainder = *this;
+ return 0;
+ } else if (divisorLength == 0) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Division by 0 in Int128");
+ } else if (divisorLength == 1) {
+ return singleDivide(dividendArray, dividendLength, divisorArray[0],
+ remainder, dividendWasNegative, divisorWasNegative);
+ }
+
+ int64_t resultLength = dividendLength - divisorLength;
+ uint32_t resultArray[4];
+
+ // Normalize by shifting both by a multiple of 2 so that
+ // the digit guessing is better. The requirement is that
+ // the top bit of divisorArray[0] is set (it is at least 2**31).
+ int64_t normalizeBits = 32 - fls(divisorArray[0]);
+ shiftArrayLeft(divisorArray, divisorLength, normalizeBits);
+ shiftArrayLeft(dividendArray, dividendLength, normalizeBits);
+
+ // compute each digit in the result
+ for (int64_t j = 0; j < resultLength; ++j) {
+ // Guess the next digit. At worst it is two too large
+ uint32_t guess = UINT32_MAX;
+ uint64_t highDividend =
+ static_cast<uint64_t>(dividendArray[j]) << 32 | dividendArray[j + 1];
+ // When the leading digits are equal the guess stays at UINT32_MAX,
+ // because the division below would not fit in 32 bits.
+ if (dividendArray[j] != divisorArray[0]) {
+ guess = static_cast<uint32_t>(highDividend / divisorArray[0]);
+ }
+
+ // catch all of the cases where guess is two too large and most of the
+ // cases where it is one too large
+ uint32_t rhat = static_cast<uint32_t>(
+ highDividend - guess * static_cast<uint64_t>(divisorArray[0]));
+ while (static_cast<uint64_t>(divisorArray[1]) * guess >
+ (static_cast<uint64_t>(rhat) << 32) + dividendArray[j + 2]) {
+ guess -= 1;
+ rhat += divisorArray[0];
+ // rhat wrapped around 32 bits, so the comparison above can no longer
+ // be true; stop correcting the guess.
+ if (static_cast<uint64_t>(rhat) < divisorArray[0]) {
+ break;
+ }
+ }
+
+ // subtract off the guess * divisor from the dividend
+ // mult carries the running product into the next more significant digit;
+ // a borrow from the 32-bit subtraction is folded into it as well.
+ uint64_t mult = 0;
+ for (int64_t i = divisorLength - 1; i >= 0; --i) {
+ mult += static_cast<uint64_t>(guess) * divisorArray[i];
+ uint32_t prev = dividendArray[j + i + 1];
+ dividendArray[j + i + 1] -= static_cast<uint32_t>(mult);
+ mult >>= 32;
+ if (dividendArray[j + i + 1] > prev) {
+ mult += 1;
+ }
+ }
+ uint32_t prev = dividendArray[j];
+ dividendArray[j] -= static_cast<uint32_t>(mult);
+
+ // if guess was too big, we add back divisor
+ // (detected by the subtraction above wrapping around)
+ if (dividendArray[j] > prev) {
+ guess -= 1;
+ uint32_t carry = 0;
+ for (int64_t i = divisorLength - 1; i >= 0; --i) {
+ uint64_t sum = static_cast<uint64_t>(divisorArray[i]) +
+ dividendArray[j + i + 1] + carry;
+ dividendArray[j + i + 1] = static_cast<uint32_t>(sum);
+ carry = static_cast<uint32_t>(sum >> 32);
+ }
+ dividendArray[j] += carry;
+ }
+
+ resultArray[j] = guess;
+ }
+
+ // denormalize the remainder
+ // (what is left in dividendArray is the remainder, still shifted left by
+ // normalizeBits)
+ shiftArrayRight(dividendArray, dividendLength, normalizeBits);
+
+ // return result and remainder
+ Int128 result;
+ buildFromArray(result, resultArray, resultLength);
+ buildFromArray(remainder, dividendArray, dividendLength);
+ fixDivisionSigns(result, remainder, dividendWasNegative, divisorWasNegative);
+ return result;
+}
+
+// Returns the base 10 representation of the value. The number is split
+// into up to three groups (above 10**36, above 10**18 and the rest). The
+// first non-zero group carries the sign; every group after it is printed
+// as an absolute value, zero padded to 18 digits.
+std::string Int128::toString() const {
+  // 10**18 - the largest power of 10 less than 63 bits
+  const Int128 tenTo18(0xde0b6b3a7640000);
+  // 10**36
+  const Int128 tenTo36(0xc097ce7bc90715, 0xb34b9f1000000000);
+  std::stringstream out;
+  bool padWithZeros = false;
+
+  // group above 10**36
+  Int128 belowTenTo36;
+  Int128 leading = divide(tenTo36, belowTenTo36);
+  if (leading != 0) {
+    out << leading.toLong();
+    belowTenTo36.abs();
+    padWithZeros = true;
+  }
+
+  // group above 10**18
+  Int128 lowest;
+  Int128 middle = belowTenTo36.divide(tenTo18, lowest);
+  if (padWithZeros) {
+    out << std::setw(18) << std::setfill('0');
+    out << middle.toLong();
+  } else if (middle != 0) {
+    padWithZeros = true;
+    lowest.abs();
+    out << middle.toLong();
+  }
+
+  // remaining group, which is less than 10**18
+  if (padWithZeros) {
+    out << std::setw(18) << std::setfill('0');
+  }
+  out << lowest.toLong();
+  return out.str();
+}
+
+// Returns the base 10 representation with a decimal point inserted so that
+// scale digits follow it. Values with at most scale digits get a leading
+// "0." and as many zeros as needed. A scale of zero returns the plain
+// integer string.
+std::string Int128::toDecimalString(int32_t scale) const {
+  const std::string str = toString();
+  if (scale == 0) {
+    return str;
+  }
+
+  // Separate the sign from the digits so that both signs share one path.
+  const bool negative = *this < 0;
+  const std::string sign = negative ? "-" : "";
+  const std::string digits = negative ? str.substr(1, std::string::npos) : str;
+  const int32_t count = static_cast<int32_t>(digits.length());
+
+  if (count > scale) {
+    const size_t pointAt = static_cast<size_t>(count - scale);
+    return sign + digits.substr(0, pointAt) + "." +
+           digits.substr(pointAt, static_cast<size_t>(scale));
+  }
+
+  // No integer part: pad between the point and the digits with zeros
+  // (none at all when count equals scale).
+  std::string result = sign + "0.";
+  for (int32_t pad = scale - count; pad > 0; --pad) {
+    result += "0";
+  }
+  return result + digits;
+}
+
+// Returns "0x" followed by the high and then the low word, each printed in
+// hexadecimal and zero padded to 16 digits.
+std::string Int128::toHexString() const {
+  std::stringstream out;
+  out << std::hex << "0x";
+  out << std::setw(16) << std::setfill('0') << highbits;
+  out << std::setw(16) << std::setfill('0') << lowbits;
+  return out.str();
+}
+
+// Largest exponent n for which 10**n is listed in POWERS_OF_TEN.
+static const int32_t MAX_PRECISION_64 = 18;
+// POWERS_OF_TEN[n] == 10**n for 0 <= n <= MAX_PRECISION_64.
+static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {
+    1,                    // 10**0
+    10,                   // 10**1
+    100,                  // 10**2
+    1000,                 // 10**3
+    10000,                // 10**4
+    100000,               // 10**5
+    1000000,              // 10**6
+    10000000,             // 10**7
+    100000000,            // 10**8
+    1000000000,           // 10**9
+    10000000000,          // 10**10
+    100000000000,         // 10**11
+    1000000000000,        // 10**12
+    10000000000000,       // 10**13
+    100000000000000,      // 10**14
+    1000000000000000,     // 10**15
+    10000000000000000,    // 10**16
+    100000000000000000,   // 10**17
+    1000000000000000000}; // 10**18
+
+// Multiplies value by 10**power in steps of at most 10**18. Before each
+// step the value is compared against the largest (or smallest) value that
+// can still be multiplied by the step factor; if it is beyond that bound,
+// overflow is set and maximumValue() (or minimumValue()) is returned. A
+// power of zero or less leaves the value unchanged.
+Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power,
+                                 bool& overflow) {  // NOLINT
+  overflow = false;
+  Int128 unusedRemainder;
+
+  for (int32_t remaining = power; remaining > 0;) {
+    const int32_t step = std::min(remaining, MAX_PRECISION_64);
+
+    if (value > 0 &&
+        Int128::maximumValue().divide(POWERS_OF_TEN[step], unusedRemainder) <
+            value) {
+      overflow = true;
+      return Int128::maximumValue();
+    }
+    if (value < 0 &&
+        Int128::minimumValue().divide(POWERS_OF_TEN[step], unusedRemainder) >
+            value) {
+      overflow = true;
+      return Int128::minimumValue();
+    }
+
+    value *= POWERS_OF_TEN[step];
+    remaining -= step;
+  }
+
+  return value;
+}
+
+// Divides value by 10**power in steps of at most 10**18, discarding the
+// remainder of every step. A power of zero or less leaves the value
+// unchanged.
+// @param value the Int128 value to scale
+// @param power the number of decimal digits to drop
+// @return the scaled value
+Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power) {
+  Int128 remainder;
+  while (power > 0) {
+    // power is positive inside the loop, so no absolute value is needed
+    // (std::abs for integers lives in <cstdlib>, which is not included).
+    int32_t step = std::min(power, MAX_PRECISION_64);
+    value = value.divide(POWERS_OF_TEN[step], remainder);
+    power -= step;
+  }
+  return value;
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/int128.h b/depends/storage/src/storage/format/orc/int128.h
new file mode 100644
index 0000000..5949dee
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/int128.h
@@ -0,0 +1,304 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_INT128_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_INT128_H_
+
+#include <stdexcept>
+#include <string>
+
+#include "dbcommon/log/logger.h"
+
+namespace orc {
+
+// Represents a signed 128-bit integer in two's complement.
+// Calculations wrap around and overflow is ignored.
+//
+// For a discussion of the algorithms, look at Knuth's volume 2,
+// Semi-numerical Algorithms section 4.3.1.
+class Int128 {
+ public:
+  // Creates the value zero.
+  Int128() {
+    highbits = 0;
+    lowbits = 0;
+  }
+
+  // Convert a signed 64 bit value into an Int128. The high word is filled
+  // with copies of the sign bit.
+  Int128(int64_t right) {  // NOLINT
+    if (right >= 0) {
+      highbits = 0;
+      lowbits = static_cast<uint64_t>(right);
+    } else {
+      highbits = -1;
+      lowbits = static_cast<uint64_t>(right);
+    }
+  }
+
+  // Create from the twos complement representation.
+  // @param high the most significant 64 bits, including the sign
+  // @param low the least significant 64 bits
+  Int128(int64_t high, uint64_t low) {
+    highbits = high;
+    lowbits = low;
+  }
+
+  // Parse the number from a base 10 string representation.
+  explicit Int128(const std::string&);
+
+  // Maximum positive value allowed by the type.
+  static Int128 maximumValue();
+
+  // Minimum negative value allowed by the type.
+  static Int128 minimumValue();
+
+  // Replace the value with its two's complement negation.
+  // @return *this
+  Int128& negate() {
+    lowbits = ~lowbits + 1;
+    highbits = ~highbits;
+    // a zero low word means the +1 carried into the high word
+    if (lowbits == 0) {
+      highbits += 1;
+    }
+    return *this;
+  }
+
+  // Replace the value with its absolute value.
+  // @return *this
+  Int128& abs() {
+    if (highbits < 0) {
+      negate();
+    }
+    return *this;
+  }
+
+  // Flip every bit of the value.
+  // @return *this
+  Int128& invert() {
+    lowbits = ~lowbits;
+    highbits = ~highbits;
+    return *this;
+  }
+
+  // Add a number to this one. The result is truncated to 128 bits.
+  // @param right the number to add
+  // @return *this
+  Int128& operator+=(const Int128& right) {
+    uint64_t sum = lowbits + right.lowbits;
+    highbits += right.highbits;
+    // the low word wrapped around: carry into the high word
+    if (sum < lowbits) {
+      highbits += 1;
+    }
+    lowbits = sum;
+    return *this;
+  }
+
+  // Subtract a number from this one. The result is truncated to 128 bits.
+  // @param right the number to subtract
+  // @return *this
+  Int128& operator-=(const Int128& right) {
+    uint64_t diff = lowbits - right.lowbits;
+    highbits -= right.highbits;
+    // the low word wrapped around: borrow from the high word
+    if (diff > lowbits) {
+      highbits -= 1;
+    }
+    lowbits = diff;
+    return *this;
+  }
+
+  // Multiply this number by a number. The result is truncated to 128 bits.
+  // @param right the number to multiply by
+  // @return *this
+  Int128& operator*=(const Int128& right);
+
+  // Divide this number by right and return the result. This operation is
+  // not destructive.
+  //
+  // The answer rounds to zero. Signs work like:
+  //    21 /  5 ->  4,  1
+  //   -21 /  5 -> -4, -1
+  //    21 / -5 -> -4,  1
+  //   -21 / -5 ->  4, -1
+  // @param right the number to divide by
+  // @param remainder the remainder after the division
+  Int128 divide(const Int128& right, Int128& remainder) const;  // NOLINT
+
+  // Logical or between two Int128.
+  // @param right the number to or in
+  // @return *this
+  Int128& operator|=(const Int128& right) {
+    lowbits |= right.lowbits;
+    highbits |= right.highbits;
+    return *this;
+  }
+
+  // Logical and between two Int128.
+  // @param right the number to and in
+  // @return *this
+  Int128& operator&=(const Int128& right) {
+    lowbits &= right.lowbits;
+    highbits &= right.highbits;
+    return *this;
+  }
+
+  // Shift left by the given number of bits.
+  // Values larger than 2**127 will shift into the sign bit.
+  Int128& operator<<=(uint32_t bits) {
+    if (bits != 0) {
+      if (bits < 64) {
+        // Shift the high word as unsigned: left-shifting a negative signed
+        // value is not portable.
+        highbits = static_cast<int64_t>(
+            (static_cast<uint64_t>(highbits) << bits) |
+            (lowbits >> (64 - bits)));
+        lowbits <<= bits;
+      } else if (bits < 128) {
+        highbits = static_cast<int64_t>(lowbits << (bits - 64));
+        lowbits = 0;
+      } else {
+        highbits = 0;
+        lowbits = 0;
+      }
+    }
+    return *this;
+  }
+
+  // Shift right by the given number of bits.
+  // For shifts of 64 bits or more, negative values sign extend and fill
+  // with one bits. For shifts of fewer than 64 bits the vacated bits of the
+  // high word are filled with zeros instead.
+  // NOTE(review): the two ranges are inconsistent; the existing behavior is
+  // kept because callers may rely on it - confirm before unifying.
+  Int128& operator>>=(uint32_t bits) {
+    if (bits != 0) {
+      if (bits < 64) {
+        lowbits >>= bits;
+        // Move the bits leaving the high word as unsigned: left-shifting a
+        // negative signed value is not portable.
+        lowbits |= static_cast<uint64_t>(highbits) << (64 - bits);
+        highbits =
+            static_cast<int64_t>(static_cast<uint64_t>(highbits) >> bits);
+      } else if (bits < 128) {
+        lowbits = static_cast<uint64_t>(highbits >> (bits - 64));
+        highbits = highbits >= 0 ? 0 : -1l;
+      } else {
+        highbits = highbits >= 0 ? 0 : -1l;
+        lowbits = static_cast<uint64_t>(highbits);
+      }
+    }
+    return *this;
+  }
+
+  bool operator==(const Int128& right) const {
+    return highbits == right.highbits && lowbits == right.lowbits;
+  }
+
+  bool operator!=(const Int128& right) const {
+    return highbits != right.highbits || lowbits != right.lowbits;
+  }
+
+  // The ordering operators compare the signed high words first and fall
+  // back to the unsigned low words when the high words are equal.
+  bool operator<(const Int128& right) const {
+    if (highbits == right.highbits) {
+      return lowbits < right.lowbits;
+    } else {
+      return highbits < right.highbits;
+    }
+  }
+
+  bool operator<=(const Int128& right) const {
+    if (highbits == right.highbits) {
+      return lowbits <= right.lowbits;
+    } else {
+      return highbits <= right.highbits;
+    }
+  }
+
+  bool operator>(const Int128& right) const {
+    if (highbits == right.highbits) {
+      return lowbits > right.lowbits;
+    } else {
+      return highbits > right.highbits;
+    }
+  }
+
+  bool operator>=(const Int128& right) const {
+    if (highbits == right.highbits) {
+      return lowbits >= right.lowbits;
+    } else {
+      return highbits >= right.highbits;
+    }
+  }
+
+  // Fold the four 32-bit quarters of the value together with xor.
+  uint32_t hash() const {
+    return static_cast<uint32_t>(highbits >> 32) ^
+           static_cast<uint32_t>(highbits) ^
+           static_cast<uint32_t>(lowbits >> 32) ^
+           static_cast<uint32_t>(lowbits);
+  }
+
+  // Does this value fit into a long? True when the high word is just the
+  // sign extension of the low word.
+  bool fitsInLong() const {
+    switch (highbits) {
+      case 0:
+        return !(lowbits & LONG_SIGN_BIT);
+      case -1:
+        return lowbits & LONG_SIGN_BIT;
+      default:
+        return false;
+    }
+  }
+
+  // Convert the value to a long. A value that does not fit is reported
+  // through LOG_ERROR.
+  // NOTE(review): there is no return after LOG_ERROR, so this relies on
+  // LOG_ERROR not returning to the caller - confirm.
+  int64_t toLong() const {
+    if (fitsInLong()) {
+      return static_cast<int64_t>(lowbits);
+    }
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Int128 too large to convert to long");
+  }
+
+  // Return the base 10 string representation of the integer.
+  std::string toString() const;
+
+  // Return the base 10 string representation with a decimal point,
+  // the given number of places after the decimal.
+  std::string toDecimalString(int32_t scale = 0) const;
+
+  // Return the base 16 string representation of the two's complement with
+  // a prefix of "0x".
+  // Int128(-1).toHexString() = "0xffffffffffffffffffffffffffffffff".
+  std::string toHexString() const;
+
+  // Get the high bits of the twos complement representation of the number.
+  int64_t getHighBits() const { return highbits; }
+
+  // Get the low bits of the twos complement representation of the number.
+  uint64_t getLowBits() const { return lowbits; }
+
+  // Represent the absolute number as a list of uint32.
+  // Visible for testing only.
+  // @param array the array that is set to the value of the number
+  // @param wasNegative set to true if the original number was negative
+  // @return the number of elements that were set in the array (0 to 4;
+  //         0 means the value is zero)
+  int64_t fillInArray(uint32_t* array, bool& wasNegative) const;  // NOLINT
+
+ private:
+  // Mask selecting the sign bit of a 64-bit word.
+  static const uint64_t LONG_SIGN_BIT = 0x8000000000000000u;
+  int64_t highbits;
+  uint64_t lowbits;
+};
+
+/**
+ * Scales up an Int128 value, i.e. multiplies it by 10**power.
+ * If the result would overflow, the maximum value (for a positive input) or
+ * the minimum value (for a negative input) is returned instead.
+ * @param value the Int128 value to scale
+ * @param power the scale offset. Result of a negative factor is undefined.
+ * @param overflow returns whether the result overflows or not
+ * @return the scaled value
+ */
+Int128 scaleUpInt128ByPowerOfTen(Int128 value, int32_t power, bool& overflow);
+/**
+ * Scales down an Int128 value, i.e. divides it by 10**power.
+ * The remainder of the division is discarded.
+ * @param value the Int128 value to scale
+ * @param power the scale offset. Result of a negative factor is undefined.
+ * @return the scaled value
+ */
+Int128 scaleDownInt128ByPowerOfTen(Int128 value, int32_t power);
+} // end of namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_INT128_H_
diff --git a/depends/storage/src/storage/format/orc/lzo-decompressor.cc b/depends/storage/src/storage/format/orc/lzo-decompressor.cc
new file mode 100644
index 0000000..2752fec
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/lzo-decompressor.cc
@@ -0,0 +1,396 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <sstream>
+#include <string>
+
+#include "dbcommon/log/logger.h"
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/lzo-decompressor.h"
+
+namespace orc {
+
+// Pointer adjustments, indexed by the match offset, that lzoDecompress
+// applies to the match address when the offset is below 8 bytes, so that
+// the rest of the match can be copied 8 bytes at a time.
+static const int32_t DEC_32_TABLE[] = {4, 1, 2, 1, 4, 4, 4, 4};
+static const int32_t DEC_64_TABLE[] = {0, 0, 0, -1, 0, 1, 2, 3};
+
+// Byte widths used by lzoDecompress for bounds checks and pointer steps.
+static const int32_t SIZE_OF_SHORT = 2;
+static const int32_t SIZE_OF_INT = 4;
+static const int32_t SIZE_OF_LONG = 8;
+
+// Formats val in lower-case hexadecimal with a "0x" prefix and no padding.
+static std::string toHex(uint64_t val) {
+  std::ostringstream text;
+  text << "0x";
+  text << std::hex << val;
+  return text.str();
+}
+
+// Formats val as a decimal number.
+static std::string toString(int64_t val) {
+  std::ostringstream text;
+  text << val;
+  const std::string formatted = text.str();
+  return formatted;
+}
+
+// ParseError raised for corrupt LZO input. The message always contains the
+// input offset at which the problem was detected.
+class MalformedInputException : public ParseError {
+ public:
+  // @param off the offset into the input at which decoding failed
+  explicit MalformedInputException(int64_t off)
+      : ParseError("MalformedInputException at " + toString(off)) {}
+
+  // @param off the offset into the input at which decoding failed
+  // @param msg additional detail placed before the offset
+  MalformedInputException(int64_t off, const std::string &msg)
+      : ParseError("MalformedInputException " + msg + " at " + toString(off)) {}
+
+  // Copies only the message text of the other exception.
+  MalformedInputException(const MalformedInputException &other)
+      : ParseError(other.what()) {}
+
+  virtual ~MalformedInputException() noexcept;
+};
+
+// Defined out of line; there is nothing to release.
+MalformedInputException::~MalformedInputException() noexcept {}
+
+// Decompresses LZO data from [inputAddress, inputLimit) into
+// [outputAddress, outputLimit).
+//
+// The input is a sequence of blocks. Each block is a list of commands that
+// copy a match from already produced output and/or a literal from the
+// input, and ends with the stop command 0x11 followed by two zero bytes.
+// Malformed input is reported through LOG_ERROR or, for an invalid match,
+// by throwing MalformedInputException.
+// @param inputAddress the start of the input
+// @param inputLimit one past the last byte of the input
+// @param outputAddress the start of the output buffer
+// @param outputLimit one past the last byte of the output buffer
+// @result the number of bytes written to the output buffer
+uint64_t lzoDecompress(const char *inputAddress, const char *inputLimit,
+                       char *outputAddress, char *outputLimit) {
+  // nothing compresses to nothing
+  if (inputAddress == inputLimit) {
+    return 0;
+  }
+
+  // maximum offset in buffers to which it's safe to write long-at-a-time
+  char *const fastOutputLimit = outputLimit - SIZE_OF_LONG;
+
+  // LZO can concat two blocks together so, decode until the input data is
+  // consumed
+  const char *input = inputAddress;
+  char *output = outputAddress;
+  while (input < inputLimit) {
+    //
+    // Note: For safety some of the code below may stop decoding early or
+    // skip decoding, because input is not available.  This makes the code
+    // safe, and since LZO requires an explicit "stop" command, the decoder
+    // will still throw a exception.
+    //
+
+    bool firstCommand = true;
+    uint32_t lastLiteralLength = 0;
+    while (true) {
+      if (input >= inputLimit) {
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                  input - inputAddress);
+      }
+      uint32_t command = *(input++) & 0xFF;
+      if (command == 0x11) {
+        break;
+      }
+
+      // Commands are described using a bit pattern notation:
+      // 0: bit is not set
+      // 1: bit is set
+      // L: part of literal length
+      // P: part of match offset position
+      // M: part of match length
+      // ?: see documentation in command decoder
+
+      int32_t matchLength;
+      int32_t matchOffset;
+      uint32_t literalLength;
+      if ((command & 0xf0) == 0) {
+        if (lastLiteralLength == 0) {
+          // 0b0000_LLLL (0bLLLL_LLLL)*
+
+          // copy length :: fixed
+          //   0
+          matchOffset = 0;
+
+          // copy offset :: fixed
+          //   0
+          matchLength = 0;
+
+          // literal length - 3 :: variable bits :: valid range [4..]
+          //   3 + variableLength(command bits [0..3], 4)
+          literalLength = command & 0xf;
+          if (literalLength == 0) {
+            literalLength = 0xf;
+
+            uint32_t nextByte = 0;
+            while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) {
+              literalLength += 0xff;
+            }
+            literalLength += nextByte;
+          }
+          literalLength += 3;
+        } else if (lastLiteralLength <= 3) {
+          // 0b0000_PPLL 0bPPPP_PPPP
+
+          // copy length: fixed
+          //   3
+          matchLength = 3;
+
+          // copy offset :: 12 bits :: valid range [2048..3071]
+          //   [0..1] from command [2..3]
+          //   [2..9] from trailer [0..7]
+          //   [10] unset
+          //   [11] set
+          if (input >= inputLimit) {
+            LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                      input - inputAddress);
+          }
+          matchOffset = (command & 0xc) >> 2;
+          matchOffset |= (*(input++) & 0xFF) << 2;
+          matchOffset |= 0x800;
+
+          // literal length :: 2 bits :: valid range [0..3]
+          //   [0..1] from command [0..1]
+          literalLength = (command & 0x3);
+        } else {
+          // 0b0000_PPLL 0bPPPP_PPPP
+
+          // copy length :: fixed
+          //   2
+          matchLength = 2;
+
+          // copy offset :: 10 bits :: valid range [0..1023]
+          //   [0..1] from command [2..3]
+          //   [2..9] from trailer [0..7]
+          if (input >= inputLimit) {
+            LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                      input - inputAddress);
+          }
+          matchOffset = (command & 0xc) >> 2;
+          matchOffset |= (*(input++) & 0xFF) << 2;
+
+          // literal length :: 2 bits :: valid range [0..3]
+          //   [0..1] from command [0..1]
+          literalLength = (command & 0x3);
+        }
+      } else if (firstCommand) {
+        // first command has special handling when high nibble is set
+        matchLength = 0;
+        matchOffset = 0;
+        literalLength = command - 17;
+      } else if ((command & 0xf0) == 0x10) {
+        // 0b0001_?MMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
+
+        // copy length - 2 :: variable bits :: valid range [3..]
+        //   2 + variableLength(command bits [0..2], 3)
+        matchLength = command & 0x7;
+        if (matchLength == 0) {
+          matchLength = 0x7;
+
+          int32_t nextByte = 0;
+          while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) {
+            matchLength += 0xff;
+          }
+          matchLength += nextByte;
+        }
+        matchLength += 2;
+
+        // read trailer
+        if (input + SIZE_OF_SHORT > inputLimit) {
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                    input - inputAddress);
+        }
+        uint32_t trailer = *reinterpret_cast<const uint16_t *>(input) & 0xFFFF;
+        input += SIZE_OF_SHORT;
+
+        // copy offset :: 16 bits :: valid range [32767..49151]
+        //   [0..13] from trailer [2..15]
+        //   [14] if command bit [3] unset
+        //   [15] if command bit [3] set
+        matchOffset = trailer >> 2;
+        if ((command & 0x8) == 0) {
+          matchOffset |= 0x4000;
+        } else {
+          matchOffset |= 0x8000;
+        }
+        matchOffset--;
+
+        // literal length :: 2 bits :: valid range [0..3]
+        //   [0..1] from trailer [0..1]
+        literalLength = trailer & 0x3;
+      } else if ((command & 0xe0) == 0x20) {
+        // 0b001M_MMMM (0bMMMM_MMMM)* 0bPPPP_PPPP_PPPP_PPLL
+
+        // copy length - 2 :: variable bits :: valid range [3..]
+        //   2 + variableLength(command bits [0..4], 5)
+        matchLength = command & 0x1f;
+        if (matchLength == 0) {
+          matchLength = 0x1f;
+
+          int nextByte = 0;
+          while (input < inputLimit && (nextByte = *(input++) & 0xFF) == 0) {
+            matchLength += 0xff;
+          }
+          matchLength += nextByte;
+        }
+        matchLength += 2;
+
+        // read trailer
+        if (input + SIZE_OF_SHORT > inputLimit) {
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                    input - inputAddress);
+        }
+        int32_t trailer = *reinterpret_cast<const int16_t *>(input) & 0xFFFF;
+        input += SIZE_OF_SHORT;
+
+        // copy offset :: 14 bits :: valid range [0..16383]
+        //  [0..13] from trailer [2..15]
+        matchOffset = trailer >> 2;
+
+        // literal length :: 2 bits :: valid range [0..3]
+        //   [0..1] from trailer [0..1]
+        literalLength = trailer & 0x3;
+      } else if ((command & 0xc0) != 0) {
+        // 0bMMMP_PPLL 0bPPPP_PPPP
+
+        // copy length - 1 :: 3 bits :: valid range [1..8]
+        //   [0..2] from command [5..7]
+        //   add 1
+        matchLength = (command & 0xe0) >> 5;
+        matchLength += 1;
+
+        // copy offset :: 11 bits :: valid range [0..4095]
+        //   [0..2] from command [2..4]
+        //   [3..10] from trailer [0..7]
+        if (input >= inputLimit) {
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                    input - inputAddress);
+        }
+        matchOffset = (command & 0x1c) >> 2;
+        matchOffset |= (*(input++) & 0xFF) << 3;
+
+        // literal length :: 2 bits :: valid range [0..3]
+        //   [0..1] from command [0..1]
+        literalLength = (command & 0x3);
+      } else {
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                  "MalformedInputException: Invalid LZO command %s at %ld",
+                  toHex(command).c_str(), input - inputAddress - 1);
+      }
+      firstCommand = false;
+
+      // copy match
+      if (matchLength != 0) {
+        // lzo encodes match offset minus one
+        matchOffset++;
+
+        // The match must start inside the output produced so far and must
+        // end inside the output buffer.
+        char *matchAddress = output - matchOffset;
+        if (matchAddress < outputAddress ||
+            output + matchLength > outputLimit) {
+          throw MalformedInputException(input - inputAddress);
+        }
+        char *matchOutputLimit = output + matchLength;
+
+        if (output > fastOutputLimit) {
+          // slow match copy
+          while (output < matchOutputLimit) {
+            *(output++) = *(matchAddress++);
+          }
+        } else {
+          // copy repeated sequence
+          if (matchOffset < SIZE_OF_LONG) {
+            // 8 bytes apart so that we can copy long-at-a-time below
+            int32_t increment32 = DEC_32_TABLE[matchOffset];
+            int32_t decrement64 = DEC_64_TABLE[matchOffset];
+
+            output[0] = *matchAddress;
+            output[1] = *(matchAddress + 1);
+            output[2] = *(matchAddress + 2);
+            output[3] = *(matchAddress + 3);
+            output += SIZE_OF_INT;
+            matchAddress += increment32;
+
+            *reinterpret_cast<int32_t *>(output) =
+                *reinterpret_cast<int32_t *>(matchAddress);
+            output += SIZE_OF_INT;
+            matchAddress -= decrement64;
+          } else {
+            *reinterpret_cast<int64_t *>(output) =
+                *reinterpret_cast<int64_t *>(matchAddress);
+            matchAddress += SIZE_OF_LONG;
+            output += SIZE_OF_LONG;
+          }
+
+          if (matchOutputLimit >= fastOutputLimit) {
+            if (matchOutputLimit > outputLimit) {
+              LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                        "MalformedInputException at %ld", input - inputAddress);
+            }
+
+            while (output < fastOutputLimit) {
+              *reinterpret_cast<int64_t *>(output) =
+                  *reinterpret_cast<int64_t *>(matchAddress);
+              matchAddress += SIZE_OF_LONG;
+              output += SIZE_OF_LONG;
+            }
+
+            while (output < matchOutputLimit) {
+              *(output++) = *(matchAddress++);
+            }
+          } else {
+            while (output < matchOutputLimit) {
+              *reinterpret_cast<int64_t *>(output) =
+                  *reinterpret_cast<int64_t *>(matchAddress);
+              matchAddress += SIZE_OF_LONG;
+              output += SIZE_OF_LONG;
+            }
+          }
+        }
+        output = matchOutputLimit;  // correction in case we over-copied
+      }
+
+      // copy literal
+      char *literalOutputLimit = output + literalLength;
+      if (literalOutputLimit > fastOutputLimit ||
+          input + literalLength > inputLimit - SIZE_OF_LONG) {
+        // The literal has to fit in what is left of both buffers. Without
+        // the input check a truncated stream makes the copy below read past
+        // inputLimit.
+        if (literalOutputLimit > outputLimit ||
+            literalLength > static_cast<uint64_t>(inputLimit - input)) {
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                    input - inputAddress);
+        }
+
+        // slow, precise copy
+        memcpy(output, input, literalLength);
+        input += literalLength;
+        output += literalLength;
+      } else {
+        // fast copy. We may over-copy but there's enough room in input
+        // and output to not overrun them
+        do {
+          *reinterpret_cast<int64_t *>(output) =
+              *reinterpret_cast<const int64_t *>(input);
+          input += SIZE_OF_LONG;
+          output += SIZE_OF_LONG;
+        } while (output < literalOutputLimit);
+        // adjust index if we over-copied
+        input -= (output - literalOutputLimit);
+        output = literalOutputLimit;
+      }
+      lastLiteralLength = literalLength;
+    }
+
+    // The stop command is followed by two zero bytes. Make sure both bytes
+    // are inside the input before reading them, then check their value.
+    if (input + SIZE_OF_SHORT > inputLimit ||
+        *reinterpret_cast<const int16_t *>(input) != 0) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "MalformedInputException at %ld",
+                input - inputAddress);
+    }
+    input += SIZE_OF_SHORT;
+  }
+
+  return static_cast<uint64_t>(output - outputAddress);
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/lzo-decompressor.h b/depends/storage/src/storage/format/orc/lzo-decompressor.h
new file mode 100644
index 0000000..f163f3e
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/lzo-decompressor.h
@@ -0,0 +1,35 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_LZO_DECOMPRESSOR_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_LZO_DECOMPRESSOR_H_
+
+#include <cstdint>  // uint64_t; keeps this header self-contained
+
+namespace orc {
+
+// Decompress LZO-compressed bytes into the output buffer.
+// @param inputAddress the start of the input
+// @param inputLimit one past the last byte of the input
+// @param outputAddress the start of the output buffer
+// @param outputLimit one past the last byte of the output buffer
+// @return the number of bytes written to the output buffer
+uint64_t lzoDecompress(const char *inputAddress, const char *inputLimit,
+                       char *outputAddress, char *outputLimit);
+}  // namespace orc
+
+#endif  // STORAGE_SRC_STORAGE_FORMAT_ORC_LZO_DECOMPRESSOR_H_
diff --git a/depends/storage/src/storage/format/orc/orc-format-reader.cc b/depends/storage/src/storage/format/orc/orc-format-reader.cc
new file mode 100644
index 0000000..071623e
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-format-reader.cc
@@ -0,0 +1,278 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/orc-format-reader.h"
+
+#include <list>
+#include <memory>
+#include <utility>
+
+#include "dbcommon/common/vector.h"
+#include "dbcommon/common/vector/decimal-vector.h"
+#include "dbcommon/common/vector/list-vector.h"
+#include "dbcommon/common/vector/struct-vector.h"
+#include "dbcommon/common/vector/variable-length-vector.h"
+#include "dbcommon/utils/global.h"
+#include "dbcommon/utils/url.h"
+
+#include "storage/format/orc/input-stream.h"
+
+namespace storage {
+
+// Stores the scan inputs on this reader and configures `opts` for the scan.
+// Nothing is opened here; a file is opened when read() starts a split.
+//
+// @param fsManager used later to resolve the file system of each split
+// @param splits the splits to scan; only splits->front() is consumed
+// @param columnsToRead per-column selection flags, or nullptr when no
+//        explicit column selection is requested
+// @param nTuplesPerBatch row capacity requested for the ORC row batch
+// @param predicateExprs forwarded to opts.setPredicateExprs()
+// @param td forwarded to opts.setTupleDesc()
+// @param readStatsOnly forwarded to opts.setReadStatsOnlyFlag()
+void ORCFormatReader::beginRead(
+    dbcommon::FileSystemManagerInterface *fsManager,
+    const univplan::UnivPlanScanFileSplitListList *splits,
+    std::vector<bool> *columnsToRead, uint32_t nTuplesPerBatch,
+    const univplan::UnivPlanExprPolyList *predicateExprs,
+    const dbcommon::TupleDesc *td, bool readStatsOnly) {
+  assert(fsManager != nullptr && splits != nullptr);
+
+  this->fsManager = fsManager;
+  this->splits = splits;
+  this->nTuplesPerBatch = nTuplesPerBatch;
+  this->columnsToRead = columnsToRead;
+
+  // Translate the boolean mask into the list of selected column indexes.
+  if (columnsToRead != nullptr) {
+    std::list<uint64_t> selected;
+    const uint32_t nCols = columnsToRead->size();
+    for (uint32_t col = 0; col != nCols; ++col) {
+      if ((*columnsToRead)[col]) selected.push_back(col);
+    }
+    opts.include(selected);
+  }
+
+  opts.setPredicateExprs(predicateExprs);
+  opts.setTupleDesc(td);
+  opts.setReadStatsOnlyFlag(readStatsOnly);
+}
+
+// Points orcReader at the split selected by currentSplitIdx and, on first
+// use, allocates the row batch. The input stream of the previous reader is
+// reused when it already refers to the same file; otherwise the file is
+// opened through the file system resolved from the split's URL.
+void ORCFormatReader::startNewSplit() {
+  assert(currentSplitIdx < splits->front()->splits_size());
+
+  std::string splitFilename;
+  splits->front()->splits_filename(currentSplitIdx, &splitFilename);
+
+  // hasSomethingToRead() skips empty splits, so the stream held by orcReader
+  // was opened for the closest preceding NON-EMPTY split, which is not
+  // necessarily currentSplitIdx - 1. Compare against that split's file name,
+  // and never attempt reuse when there is no reader to take a stream from.
+  bool reuseInputStream = false;
+  if (orcReader) {
+    for (int32_t prevIdx = currentSplitIdx - 1; prevIdx >= 0; --prevIdx) {
+      if (splits->front()->splits_len(prevIdx) > 0) {
+        std::string splitFilenamePrev;
+        splits->front()->splits_filename(prevIdx, &splitFilenamePrev);
+        reuseInputStream = (splitFilename == splitFilenamePrev);
+        break;
+      }
+    }
+  }
+
+  std::unique_ptr<orc::InputStream> inputStream;
+  if (orcReader) {
+    // Take the stream back from the old reader; drop it unless reusable.
+    inputStream = orcReader->ownInputStream();
+    if (!reuseInputStream) inputStream.reset(nullptr);
+  }
+  if (!inputStream) {
+    dbcommon::URL url(splitFilename);
+    inputStream = orc::readFile(fsManager->get(url.getNormalizedServiceName()),
+                                url.getPath());
+  }
+
+  opts.range(splits->front()->splits_start(currentSplitIdx),
+             splits->front()->splits_len(currentSplitIdx));
+  orcReader = orc::createReader(std::move(inputStream), opts);
+
+  // The row batch is created once and kept until read() releases it.
+  if (batch == nullptr) {
+    batch = orcReader->createRowBatch(this->nTuplesPerBatch);
+  }
+}
+
+// Returns true while there may be more rows to fetch. When a split switch was
+// requested (startAnotherSplit), advances currentSplitIdx to the next split
+// with a non-zero length and starts it; returns false once no such split is
+// left.
+bool ORCFormatReader::hasSomethingToRead() {
+  // No switch requested: keep reading from the current split.
+  if (!startAnotherSplit) return true;
+
+  startAnotherSplit = false;
+  for (++currentSplitIdx; currentSplitIdx < splits->front()->splits_size();
+       ++currentSplitIdx) {
+    // skip empty split
+    if (splits->front()->splits_len(currentSplitIdx) > 0) {
+      startNewSplit();
+      return true;
+    }
+  }
+  return false;
+}
+
+// Returns the next tuple batch of the scan, moving on to the following split
+// whenever the current one yields no more rows. Returns a null pointer when
+// all splits are exhausted; the reader and the row batch are released then.
+dbcommon::TupleBatch::uptr ORCFormatReader::read() {
+  while (hasSomethingToRead()) {
+    const bool gotRows = batch && orcReader->next(*batch);
+    if (gotRows) return createTupleBatch(batch.get());
+
+    // Current split is drained: request the next one and, when predicates
+    // were supplied, collect the reader's stripe counters.
+    startAnotherSplit = true;
+    if (orcReader && opts.getPredicateExprs()) {
+      orcReader->collectPredicateStats(&scannedStripe, &skippedStripe);
+    }
+  }
+
+  orcReader.reset(nullptr);
+  batch.reset(nullptr);
+  return dbcommon::TupleBatch::uptr(nullptr);
+}
+
+// Logs how many stripes were scanned and skipped. Nothing is logged when no
+// predicate expressions were supplied.
+void ORCFormatReader::endRead() {
+  if (!opts.getPredicateExprs()) return;
+  LOG_INFO("Predicate Info: current qe scan %u stripes, skip %u stripes",
+           scannedStripe, skippedStripe);
+}
+
+// Rewinds the reader so that the next read() starts again from the first
+// split, and clears the stripe counters.
+void ORCFormatReader::reset() {
+  // -1 because hasSomethingToRead() increments before using the index.
+  currentSplitIdx = -1;
+  startAnotherSplit = true;
+
+  scannedStripe = 0;
+  skippedStripe = 0;
+}
+
+// Converts one decoded ORC row batch into a dbcommon::TupleBatch.
+//
+// `batch` must be an orc::StructVectorBatch. Its fields are consumed in
+// order, one per selected column (every column when columnsToRead is null);
+// columns that are not selected get no vector in the result.
+// When a field carries statistics (b->hasStats) and is not a TIMESTAMP, only
+// the statistics plus one dummy value are attached and the tuple batch is
+// reported as having a single row.
+//
+// @param batch the ORC batch filled by orc::Reader::next()
+// @return the tuple batch; column data is handed over via setValue() /
+//         setValPtrs() on the batch's buffers
+//         NOTE(review): whether those calls copy or alias is not visible
+//         here - confirm buffer ownership.
+dbcommon::TupleBatch::uptr ORCFormatReader::createTupleBatch(
+    orc::ColumnVectorBatch *batch) {
+  orc::StructVectorBatch *structBatch =
+      dynamic_cast<orc::StructVectorBatch *>(batch);
+  assert(structBatch != nullptr);
+
+  uint32_t nCols = columnsToRead != nullptr ? columnsToRead->size()
+                                            : structBatch->fields.size();
+  dbcommon::TupleBatch::uptr tbatch(new dbcommon::TupleBatch(nCols));
+
+  std::vector<orc::ColumnVectorBatch *>::iterator it =
+      structBatch->fields.begin();
+
+  tbatch->setNumOfRows(structBatch->numElements);
+
+  for (uint32_t colIdx = 0; colIdx < nCols; colIdx++) {
+    if (columnsToRead && !columnsToRead->at(colIdx)) {
+      continue;
+    }
+
+    // Each selected column consumes the next field of the struct batch.
+    assert(it != structBatch->fields.end());
+    orc::ColumnVectorBatch *b = *it++;
+
+    // LIST vectors are built from the ORC type of their elements; DECIMAL
+    // and TIMESTAMP vectors from the column type in the tuple descriptor.
+    std::unique_ptr<dbcommon::Vector> v;
+    if (b->getType() == orc::ORCTypeKind::LIST) {
+      orc::ListVectorBatch *lb = dynamic_cast<orc::ListVectorBatch *>(b);
+      v = lb->buildVector(lb->elements->getType());
+    } else if (b->getType() == orc::ORCTypeKind::DECIMAL ||
+               b->getType() == orc::ORCTypeKind::TIMESTAMP) {
+      v = b->buildVector((opts.getTupleDesc())->getColumnType(colIdx));
+    } else {
+      v = b->buildVector();
+    }
+
+    if (b->hasStats && b->getType() != orc::ORCTypeKind::TIMESTAMP) {
+      v->setVectorStatistics(b->stats);
+      // append one dummy item
+      v->append("1", false);
+      tbatch->setColumn(colIdx, std::move(v), false);
+      tbatch->setNumOfRows(1);
+      continue;
+    }
+
+    switch (b->getType()) {
+      case orc::ORCTypeKind::BOOLEAN:
+      case orc::ORCTypeKind::BYTE:
+      case orc::ORCTypeKind::SHORT:
+      case orc::ORCTypeKind::INT:
+      case orc::ORCTypeKind::LONG:
+      case orc::ORCTypeKind::FLOAT:
+      case orc::ORCTypeKind::DOUBLE:
+      case orc::ORCTypeKind::DATE:
+      case orc::ORCTypeKind::TIME: {
+        v->setValue(b->getData(), b->numElements * b->getWidth());
+        v->setHasNull(b->hasNulls);
+        if (b->hasNulls) v->setNotNulls(b->getNotNull(), b->numElements);
+        assert(v->isValid());
+        break;
+      }
+      case orc::ORCTypeKind::TIMESTAMP: {
+        // NOTE(review): getWidth() presumably covers both the value and the
+        // nanoseconds buffers, hence the division by 2 - confirm.
+        v->setValue(b->getData(), b->numElements * b->getWidth() / 2);
+        v->setNanoseconds(b->getNanoseconds(),
+                          b->numElements * b->getWidth() / 2);
+        v->setHasNull(b->hasNulls);
+        if (b->hasNulls) v->setNotNulls(b->getNotNull(), b->numElements);
+        assert(v->isValid());
+        break;
+      }
+      case orc::ORCTypeKind::DECIMAL: {
+        assert(dynamic_cast<dbcommon::DecimalVector *>(v.get()));
+        // NOTE(review): three buffers (auxiliary, value, scale) appear to
+        // share getWidth(), hence the division by 3 - confirm.
+        v->setAuxiliaryValue(b->getAuxiliaryData(),
+                             b->numElements * b->getWidth() / 3);
+        v->setValue(b->getData(), b->numElements * b->getWidth() / 3);
+        v->setScaleValue(b->getScaleData(),
+                         b->numElements * b->getWidth() / 3);
+        v->setHasNull(b->hasNulls);
+        if (b->hasNulls) v->setNotNulls(b->getNotNull(), b->numElements);
+        assert(v->isValid());
+        break;
+      }
+      case orc::ORCTypeKind::CHAR:
+      case orc::ORCTypeKind::VARCHAR:
+      case orc::ORCTypeKind::STRING:
+      case orc::ORCTypeKind::BINARY: {
+        orc::BytesVectorBatch *sb = dynamic_cast<orc::BytesVectorBatch *>(b);
+        v->setLengths(reinterpret_cast<uint64_t *>(sb->length.data()),
+                      sb->numElements);
+        // TODO: clarify who owns the value buffers passed here (possible
+        // memory leak).
+        v->setValPtrs((const char **)sb->data.data(), sb->numElements);
+        v->setHasNull(b->hasNulls);
+        if (b->hasNulls) v->setNotNulls(b->getNotNull(), b->numElements);
+        reinterpret_cast<dbcommon::StringVector *>(v.get())->setDirectEncoding(
+            sb->isDirectEncoding);
+
+        assert(v->isValid());
+        break;
+      }
+      case orc::ORCTypeKind::LIST: {
+        orc::ListVectorBatch *lb = dynamic_cast<orc::ListVectorBatch *>(b);
+        dbcommon::ListVector *lv =
+            dynamic_cast<dbcommon::ListVector *>(v.get());
+        // One more offset than elements is passed to setOffsets().
+        lv->setOffsets(reinterpret_cast<uint64_t *>(lb->offsets.data()),
+                       lb->numElements + 1);
+        orc::ColumnVectorBatch *clb = lb->elements.get();
+        std::unique_ptr<dbcommon::Vector> clv = clb->buildVector();
+        // Now only support fixed-length type list
+        clv->setValue(clb->getData(), clb->numElements * clb->getWidth());
+        clv->setHasNull(clb->hasNulls);
+        if (clb->hasNulls)
+          clv->setNotNulls(clb->getNotNull(), clb->numElements);
+        assert(clv->isValid());
+        lv->addChildVector(std::move(clv));
+        lv->setHasNull(lb->hasNulls);
+        if (lb->hasNulls) lv->setNotNulls(lb->getNotNull(), lb->numElements);
+        assert(lv->isValid());
+        break;
+      }
+      default:
+        // Cast explicitly: an enumeration must not be passed to a "%d"
+        // conversion as-is.
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "type %d not supported yet",
+                  static_cast<int>(b->getType()));
+        break;
+    }
+
+    tbatch->setColumn(colIdx, std::move(v), false);
+  }
+
+  // Returning the local by name lets the compiler move or elide it.
+  return tbatch;
+}
+
+} // namespace storage
diff --git a/depends/storage/src/storage/format/orc/orc-format-reader.h b/depends/storage/src/storage/format/orc/orc-format-reader.h
new file mode 100644
index 0000000..53807f8
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-format-reader.h
@@ -0,0 +1,77 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_READER_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_READER_H_
+
+#include <string>
+#include <vector>
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/utils/byte-buffer.h"
+
+#include "storage/format/format.h"
+#include "storage/format/orc/column-printer.h"
+#include "storage/format/orc/reader.h"
+#include "storage/format/orc/seekable-input-stream.h"
+#include "storage/format/orc/vector.h"
+
+namespace storage {
+
+// Reads the file splits of one scan and returns their rows as
+// dbcommon::TupleBatch objects. Call order: beginRead(), then read() until
+// it returns a null pointer, then endRead(); reset() rewinds to the first
+// split.
+class ORCFormatReader {
+ public:
+  ORCFormatReader() {}
+  virtual ~ORCFormatReader() {}
+
+  // Stores the scan inputs and configures the ORC reader options.
+  void beginRead(dbcommon::FileSystemManagerInterface *fsManager,
+                 const univplan::UnivPlanScanFileSplitListList *splits,
+                 std::vector<bool> *columnsToRead, uint32_t nTuplesPerBatch,
+                 const univplan::UnivPlanExprPolyList *predicateExprs,
+                 const dbcommon::TupleDesc *td, bool readStatsOnly);
+  // Returns the next tuple batch, or a null pointer when no split is left.
+  dbcommon::TupleBatch::uptr read();
+  // Logs the scanned/skipped stripe counters when predicates were supplied.
+  void endRead();
+  // Rewinds to before the first split and clears the stripe counters.
+  void reset();
+
+ private:
+  // Creates orcReader (and the row batch on first use) for currentSplitIdx.
+  void startNewSplit();
+  // Converts an ORC struct batch into a TupleBatch.
+  dbcommon::TupleBatch::uptr createTupleBatch(orc::ColumnVectorBatch *batch);
+  // Advances to the next non-empty split when one was requested.
+  bool hasSomethingToRead();
+
+ private:
+  // Scan inputs recorded by beginRead(); none of them is owned here.
+  const univplan::UnivPlanScanFileSplitListList *splits = nullptr;
+  std::vector<bool> *columnsToRead = nullptr;
+  dbcommon::FileSystemManagerInterface *fsManager = nullptr;
+  uint32_t nTuplesPerBatch = Format::kTuplesPerBatch;
+
+  std::unique_ptr<orc::Reader> orcReader;  // reader of the current split
+  orc::ReaderOptions opts;
+  std::unique_ptr<orc::ColumnVectorBatch> batch;  // row batch filled by next()
+  bool startAnotherSplit = true;  // set when the current split is drained
+  int32_t currentSplitIdx = -1;   // -1 means "before the first split"
+
+  // count for filter push down
+  uint32_t skippedStripe = 0;
+  uint32_t scannedStripe = 0;
+};
+
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_READER_H_
diff --git a/depends/storage/src/storage/format/orc/orc-format-writer.cc b/depends/storage/src/storage/format/orc/orc-format-writer.cc
new file mode 100644
index 0000000..c5fd10d
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-format-writer.cc
@@ -0,0 +1,208 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "json/json.h"
+
+// #include "kv/common/cn-global.h"
+// #include "kv/common/configuration.h"
+#include "dbcommon/utils/parameters.h"
+#include "dbcommon/utils/url.h"
+#include "storage/format/orc/orc-format-writer.h"
+
+namespace storage {
+// Builds the ORC writer for `fileName`.
+//
+// The file system is resolved from the URL in `fileName`, the schema is
+// derived from `td`, and the remaining writer options are taken from the
+// JSON document stored under the "table.options" key of `p`. Recognised
+// keys: "compresstype", "rlecoder", "dicthreshold", "bloomfilter" and
+// "writestats". A document that fails to parse is reported via LOG_ERROR.
+ORCFormatWriter::ORCFormatWriter(
+    dbcommon::FileSystemManagerInterface *fsManager, dbcommon::TupleDesc *td,
+    const char *fileName, uint32_t blockAlignSize, dbcommon::Parameters *p) {
+  this->fsManager = fsManager;
+  this->fileName = fileName;
+
+  dbcommon::URL url(fileName);
+  this->fileSystem = fsManager->get(url.getNormalizedServiceName());
+
+  opts.setSchema(buildSchema(td));
+  opts.setBlockSize(blockAlignSize);
+
+  assert(p != nullptr);
+  std::string tableOptionStr = p->get("table.options", "");
+  Json::Reader jsonReader;
+  Json::Value tableOptions;
+  if (!jsonReader.parse(tableOptionStr, tableOptions)) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "jsoncpp failed to parse \'%s\'",
+              tableOptionStr.c_str());
+  }
+
+  if (tableOptions.isMember("compresstype")) {
+    opts.setCompressionKind(tableOptions["compresstype"].asCString());
+  }
+  if (tableOptions.isMember("rlecoder")) {
+    opts.setRleVersion(tableOptions["rlecoder"].asCString());
+  }
+  if (tableOptions.isMember("dicthreshold")) {
+    // The threshold is stored as a string and converted with atof().
+    opts.setDictKeySizeThreshold(
+        atof(tableOptions["dicthreshold"].asCString()));
+  }
+  if (tableOptions.isMember("bloomfilter")) {
+    // "bloomfilter" is an array of integers identifying columns.
+    const Json::Value &bloomCols = tableOptions["bloomfilter"];
+    std::vector<int> columns;
+    const int nBloomCols = bloomCols.size();
+    for (int i = 0; i < nBloomCols; ++i) {
+      columns.push_back(bloomCols[i].asInt());
+    }
+    opts.setColumnsToBloomFilter(columns, td->getNumOfColumns());
+  }
+  if (tableOptions.isMember("writestats")) {
+    opts.setWriteStats(tableOptions["writestats"].asBool());
+  }
+
+  writer = orc::createWriter(orc::writeFile(fileSystem, url.getPath()), &opts);
+}
+
+// Starts writing by delegating to the underlying orc::Writer's begin().
+void ORCFormatWriter::beginWrite() { writer->begin(); }
+
+// Hands one tuple batch to the underlying orc::Writer. `tb` is passed as a
+// raw pointer, so the caller keeps ownership.
+void ORCFormatWriter::write(dbcommon::TupleBatch *tb) {
+  writer->addTupleBatch(tb);
+}
+
+// Finishes writing by delegating to the underlying orc::Writer's end().
+void ORCFormatWriter::endWrite() { writer->end(); }
+
+// Builds the ORC schema for the table described by `td`.
+//
+// The result is a STRUCT with one field per column, in column order, named
+// after the column. VARCHAR/CHAR take their maximum length and DECIMAL its
+// precision and scale from the column's type modifier. Array columns become
+// a LIST whose single child is the element type. A column type without an
+// ORC mapping is reported via LOG_ERROR. Column ids are assigned starting
+// from 0 before the schema is returned.
+//
+// @param td tuple descriptor of the table; must not be null
+// @return the root STRUCT type
+std::unique_ptr<orc::Type> ORCFormatWriter::buildSchema(
+    dbcommon::TupleDesc *td) {
+  assert(td != nullptr);
+
+  std::vector<dbcommon::TypeKind> &types = td->getColumnTypes();
+  std::vector<std::string> &colNames = td->getColumnNames();
+  std::vector<int64_t> &colTypeMod = td->getColumnTypeModifiers();
+
+  std::unique_ptr<orc::Type> ret(new orc::TypeImpl(orc::ORCTypeKind::STRUCT));
+
+  for (uint32_t i = 0; i < types.size(); i++) {
+    dbcommon::TypeKind t = types[i];
+    std::string &name = colNames[i];
+    int64_t typeMod = colTypeMod[i];
+    std::unique_ptr<orc::Type> child;
+
+    // Builds LIST<elemKind>; the element is registered under the column
+    // name, which is what every array case needs.
+    auto makeList =
+        [&name](orc::ORCTypeKind elemKind) -> std::unique_ptr<orc::Type> {
+      std::unique_ptr<orc::Type> list(
+          new orc::TypeImpl(orc::ORCTypeKind::LIST));
+      std::unique_ptr<orc::Type> elem(new orc::TypeImpl(elemKind));
+      list->addStructField(name, std::move(elem));
+      return list;
+    };
+
+    switch (t) {
+      case dbcommon::TypeKind::TINYINTID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::BYTE));
+        break;
+      case dbcommon::TypeKind::SMALLINTID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::SHORT));
+        break;
+      case dbcommon::TypeKind::INTID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::INT));
+        break;
+      case dbcommon::TypeKind::BIGINTID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::LONG));
+        break;
+      case dbcommon::TypeKind::FLOATID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::FLOAT));
+        break;
+      case dbcommon::TypeKind::DOUBLEID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::DOUBLE));
+        break;
+      case dbcommon::TypeKind::STRINGID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::STRING));
+        break;
+      case dbcommon::TypeKind::VARCHARID:
+        child.reset(
+            new orc::TypeImpl(orc::ORCTypeKind::VARCHAR,
+                              dbcommon::TypeModifierUtil::getMaxLen(typeMod)));
+        break;
+      case dbcommon::TypeKind::CHARID:
+        child.reset(
+            new orc::TypeImpl(orc::ORCTypeKind::CHAR,
+                              dbcommon::TypeModifierUtil::getMaxLen(typeMod)));
+        break;
+      case dbcommon::TypeKind::BOOLEANID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::BOOLEAN));
+        break;
+      case dbcommon::TypeKind::DATEID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::DATE));
+        break;
+      case dbcommon::TypeKind::TIMEID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::TIME));
+        break;
+      case dbcommon::TypeKind::BINARYID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::BINARY));
+        break;
+      case dbcommon::TypeKind::TIMESTAMPID:
+      case dbcommon::TypeKind::TIMESTAMPTZID:
+        child.reset(new orc::TypeImpl(orc::ORCTypeKind::TIMESTAMP));
+        break;
+      case dbcommon::TypeKind::DECIMALID:
+      case dbcommon::TypeKind::DECIMALNEWID:
+        child.reset(
+            new orc::TypeImpl(orc::ORCTypeKind::DECIMAL,
+                              dbcommon::TypeModifierUtil::getPrecision(typeMod),
+                              dbcommon::TypeModifierUtil::getScale(typeMod)));
+        break;
+      case dbcommon::TypeKind::SMALLINTARRAYID:
+        child = makeList(orc::ORCTypeKind::SHORT);
+        break;
+      case dbcommon::TypeKind::INTARRAYID:
+        child = makeList(orc::ORCTypeKind::INT);
+        break;
+      case dbcommon::TypeKind::BIGINTARRAYID:
+        child = makeList(orc::ORCTypeKind::LONG);
+        break;
+      case dbcommon::TypeKind::FLOATARRAYID:
+        child = makeList(orc::ORCTypeKind::FLOAT);
+        break;
+      case dbcommon::TypeKind::DOUBLEARRAYID:
+        child = makeList(orc::ORCTypeKind::DOUBLE);
+        break;
+      default:
+        // Cast explicitly: an enumeration must not be passed to a "%d"
+        // conversion as-is.
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+                  "type not supported for orc: %d", static_cast<int>(t));
+        continue;  // only reached if LOG_ERROR returns: add no field
+    }
+
+    ret->addStructField(name, std::move(child));
+  }
+
+  ret->assignIds(0);
+  // Returning the local by name lets the compiler move or elide it.
+  return ret;
+}
+
+} // namespace storage
diff --git a/depends/storage/src/storage/format/orc/orc-format-writer.h b/depends/storage/src/storage/format/orc/orc-format-writer.h
new file mode 100644
index 0000000..0cd23aa
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-format-writer.h
@@ -0,0 +1,64 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_WRITER_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_WRITER_H_
+
+#include <string>
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "storage/format/orc/writer.h"
+
+namespace storage {
+
+class Parameters;
+class TupleBatch;
+// Writes tuple batches into a single ORC file. Call order: construct, then
+// beginWrite(), write() for each batch, and finally endWrite().
+class ORCFormatWriter {
+ public:
+  // Resolves the file system from `fileName`, derives the ORC schema from
+  // `td`, reads the writer options from the "table.options" JSON held by `p`
+  // and creates the underlying orc::Writer.
+  ORCFormatWriter(dbcommon::FileSystemManagerInterface* fsManager,
+                  dbcommon::TupleDesc* td, const char* fileName,
+                  uint32_t blockAlignSize, dbcommon::Parameters* p);
+
+  virtual ~ORCFormatWriter() {}
+
+  // Delegates to orc::Writer::begin().
+  void beginWrite();
+
+  // Passes `tb` to orc::Writer::addTupleBatch(); the caller keeps ownership.
+  void write(dbcommon::TupleBatch* tb);
+
+  // Delegates to orc::Writer::end().
+  void endWrite();
+
+ private:
+  // Maps every column of `td` to an ORC type under one root STRUCT.
+  std::unique_ptr<orc::Type> buildSchema(dbcommon::TupleDesc* td);
+
+ private:
+  dbcommon::FileSystemManagerInterface* fsManager = nullptr;  // not owned
+  dbcommon::FileSystem* fileSystem = nullptr;  // obtained from fsManager
+  std::string fileName;
+  // NOTE(review): never assigned by the constructor in orc-format-writer.cc.
+  dbcommon::TupleDesc* desc = nullptr;
+
+  orc::WriterOptions opts;  // passed by address to orc::createWriter()
+  std::unique_ptr<orc::Writer> writer;
+};
+
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_WRITER_H_
diff --git a/depends/storage/src/storage/format/orc/orc-format.cc b/depends/storage/src/storage/format/orc/orc-format.cc
new file mode 100644
index 0000000..1a5d668
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-format.cc
@@ -0,0 +1,129 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/orc-format.h"
+
+#include <memory>
+#include <utility>
+
+#include "dbcommon/log/logger.h"
+
+namespace storage {
+
+// Creates the ORC writer for `targetName` and starts writing. Requires a
+// non-empty target name and that this format was built with parameters.
+void ORCFormat::beginInsert(const std::string &targetName,
+                            const dbcommon::TupleDesc &tupleDesc) {
+  assert(!targetName.empty());
+  assert(params != nullptr);
+
+  // ORCFormatWriter takes a mutable TupleDesc pointer, hence the const_cast.
+  dbcommon::TupleDesc *desc = const_cast<dbcommon::TupleDesc *>(&tupleDesc);
+  writer.reset(new ORCFormatWriter(this->fsManager, desc, targetName.c_str(),
+                                   this->blockAlignSize, params));
+  writer->beginWrite();
+}
+
+// Writes one tuple batch through the writer created by beginInsert(). `tb`
+// is taken by value, so the batch is destroyed when this function returns.
+void ORCFormat::doInsert(std::unique_ptr<dbcommon::TupleBatch> tb) {
+  writer->write(tb.get());
+}
+
+// Finishes the insert by ending the writer created by beginInsert().
+void ORCFormat::endInsert() { writer->endWrite(); }
+
+// UPDATE is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED. Both arguments are ignored.
+void ORCFormat::beginUpdate(const std::string &targetName,
+                            const dbcommon::TupleDesc &td) {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "UPDATE is not implemented yet");
+}
+
+// UPDATE is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED. `tb` is ignored.
+void ORCFormat::doUpdate(std::unique_ptr<dbcommon::TupleBatch> tb) {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "UPDATE is not implemented yet");
+}
+
+// UPDATE is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED.
+void ORCFormat::endUpdate() {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "UPDATE is not implemented yet");
+}
+
+// DELETE is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED. Both arguments are ignored.
+void ORCFormat::beginDelete(const std::string &targetName,
+                            const dbcommon::TupleDesc &td) {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "DELETE is not implemented yet");
+}
+
+// DELETE is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED. `tb` is ignored.
+void ORCFormat::doDelete(std::unique_ptr<dbcommon::TupleBatch> tb) {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "DELETE is not implemented yet");
+}
+
+// DELETE is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED.
+void ORCFormat::endDelete() {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "DELETE is not implemented yet");
+}
+
+// Prepares a scan over `splits`. With a null `splits` only the pointer is
+// recorded and no reader is created. Otherwise the "table.options" parameter
+// is checked to be parseable JSON and a fresh ORCFormatReader is set up with
+// the projection, filter expressions and tuple descriptor.
+// `formatContext` is not used by this implementation.
+void ORCFormat::beginScan(const univplan::UnivPlanScanFileSplitListList *splits,
+                          const dbcommon::TupleDesc *tupleDesc,
+                          const std::vector<bool> *projectionCols,
+                          const univplan::UnivPlanExprPolyList *filterExprs,
+                          const FormatContext *formatContext,
+                          bool readStatsOnly) {
+  this->splits = splits;
+  if (this->splits == nullptr) return;
+
+  assert(tupleDesc != nullptr);
+  assert(params != nullptr);
+
+  // The parsed document itself is not used; parsing only validates it.
+  std::string tableOptionStr = params->get("table.options", "");
+  assert(!tableOptionStr.empty());
+  Json::Reader jsonReader;
+  Json::Value tableOptions;
+  if (!jsonReader.parse(tableOptionStr, tableOptions)) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "jsoncpp failed to parse \'%s\'",
+              tableOptionStr.c_str());
+  }
+
+  reader.reset(new ORCFormatReader());
+
+  assert(fsManager != nullptr);
+
+  // beginRead() takes non-const pointers, hence the const_casts.
+  std::vector<bool> *projection =
+      const_cast<std::vector<bool> *>(projectionCols);
+  univplan::UnivPlanExprPolyList *filters =
+      const_cast<univplan::UnivPlanExprPolyList *>(filterExprs);
+  reader->beginRead(fsManager, splits, projection, this->nTuplesPerBatch,
+                    filters, tupleDesc, readStatsOnly);
+}
+
+// Returns the next tuple batch of the scan, or a null pointer when the scan
+// has no splits or the reader is exhausted.
+dbcommon::TupleBatch::uptr ORCFormat::next() {
+  if (splits == nullptr) return dbcommon::TupleBatch::uptr();
+
+  dbcommon::TupleBatch::uptr tb = reader->read();
+  assert(!tb || tb->isValid());
+  return tb;
+}
+
+// Ends the scan. beginScan() only sets up the reader when splits were
+// given, so there is nothing to do without them.
+void ORCFormat::endScan() {
+  if (splits == nullptr) return;
+  reader->endRead();
+}
+
+// Rewinds the scan to its first split. Does nothing for a scan that was
+// started without splits.
+void ORCFormat::reScan() {
+  if (this->splits == nullptr) return;
+
+  assert(reader != nullptr);
+  reader->reset();
+}
+
+// Stopping a scan is not supported by the ORC format: always reports
+// ERRCODE_FEATURE_NOT_SUPPORTED.
+void ORCFormat::stopScan() {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "stopScan is not implemented yet");
+}
+
+} // namespace storage
diff --git a/depends/storage/src/storage/format/orc/orc-format.h b/depends/storage/src/storage/format/orc/orc-format.h
new file mode 100644
index 0000000..6dd3632
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-format.h
@@ -0,0 +1,113 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_H_
+
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "json/json.h"
+
+#include "dbcommon/utils/int-util.h"
+#include "storage/format/format.h"
+#include "storage/format/orc/orc-format-reader.h"
+#include "storage/format/orc/orc-format-writer.h"
+
+namespace storage {
+
+//
+// ORCFormat
+//
+// Format implementation backed by ORC files. Scans are served by an
+// ORCFormatReader and inserts by an ORCFormatWriter; UPDATE, DELETE and
+// stopScan report ERRCODE_FEATURE_NOT_SUPPORTED.
+class ORCFormat : public Format {
+ public:
+  ORCFormat() {}
+
+  // constructor with input parameters map
+  //
+  // Reads "format.block.align.size" (must be a power of two) and
+  // "number.tuples.per.batch" (must be a multiple of 8) from `params`,
+  // falling back to Format::kBlockSize and Format::kTuplesPerBatch.
+  // An invalid value is reported via LOG_ERROR. A null `params` keeps the
+  // defaults.
+  explicit ORCFormat(dbcommon::Parameters *params) : params(params) {
+    if (params != nullptr) {
+      this->blockAlignSize =
+          params->getAsInt32("format.block.align.size", Format::kBlockSize);
+
+      // Both members are uint32_t, so they are printed with %u.
+      if (!dbcommon::isPowerOfTwo(this->blockAlignSize)) {
+        LOG_ERROR(ERRCODE_INVALID_PARAMETER_VALUE,
+                  "for ORCFormat, "
+                  "format.block.align.size can only be power of 2, input is %u",
+                  this->blockAlignSize);
+      }
+
+      this->nTuplesPerBatch = params->getAsInt32("number.tuples.per.batch",
+                                                 Format::kTuplesPerBatch);
+      if (this->nTuplesPerBatch % 8 != 0)
+        LOG_ERROR(
+            ERRCODE_INVALID_PARAMETER_VALUE,
+            "for ORCFormat, "
+            "number.tuples.per.batch can only be multiples of 8, input is %u",
+            this->nTuplesPerBatch);
+    }
+  }
+
+  virtual ~ORCFormat() {}
+
+  // Neither copyable nor movable.
+  ORCFormat(ORCFormat &&format) = delete;
+  ORCFormat(const ORCFormat &format) = delete;
+  ORCFormat &operator=(const ORCFormat &format) = delete;
+  ORCFormat &operator=(ORCFormat &&format) = delete;
+
+  // Scan interface: beginScan(), then next() until it returns a null
+  // pointer, then endScan(). reScan() rewinds to the first split.
+  void beginScan(const univplan::UnivPlanScanFileSplitListList *splits,
+                 const dbcommon::TupleDesc *tupleDesc,
+                 const std::vector<bool> *projectionCols,
+                 const univplan::UnivPlanExprPolyList *filterExpr,
+                 const FormatContext *formatContext,
+                 bool readStatsOnly) override;
+
+  dbcommon::TupleBatch::uptr next() override;
+  void endScan() override;
+  void reScan() override;
+  void stopScan() override;  // not supported
+
+  // Insert interface: beginInsert(), doInsert() per batch, endInsert().
+  void beginInsert(const std::string &targetName,
+                   const dbcommon::TupleDesc &td) override;
+  void doInsert(std::unique_ptr<dbcommon::TupleBatch> tb) override;
+  void endInsert() override;
+
+  // UPDATE is not supported.
+  void beginUpdate(const std::string &targetName,
+                   const dbcommon::TupleDesc &td) override;
+  void doUpdate(std::unique_ptr<dbcommon::TupleBatch> tb) override;
+  void endUpdate() override;
+
+  // DELETE is not supported.
+  void beginDelete(const std::string &targetName,
+                   const dbcommon::TupleDesc &td) override;
+  void doDelete(std::unique_ptr<dbcommon::TupleBatch> tb) override;
+  void endDelete() override;
+
+ private:
+  std::unique_ptr<ORCFormatWriter> writer;  // created by beginInsert()
+  std::unique_ptr<ORCFormatReader> reader;  // created by beginScan()
+
+  uint32_t blockAlignSize = Format::kBlockSize;
+  uint32_t nTuplesPerBatch = Format::kTuplesPerBatch;
+
+  dbcommon::Parameters *params = nullptr;  // not owned
+};
+
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_FORMAT_H_
diff --git a/depends/storage/src/storage/format/orc/orc-predicates.cc b/depends/storage/src/storage/format/orc/orc-predicates.cc
new file mode 100644
index 0000000..e5ef1b6
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-predicates.cc
@@ -0,0 +1,281 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/orc-predicates.h"
+
+#include <memory>
+#include <sstream>
+#include <utility>
+
+#include "dbcommon/log/logger.h"
+#include "dbcommon/type/decimal.h"
+#include "dbcommon/utils/string-util.h"
+#include "univplan/common/plannode-walker.h"
+
+#include "storage/common/bloom-filter.h"
+#include "storage/format/orc/reader.h"
+
+namespace orc {
+
+bool OrcPredicates::hasAllNull(int32_t colId) const {
+  // All-NULL means the statistics report zero values together with a null.
+  disableInvalidColId(colId);
+  const Type& column = *reader->getType().getSubtype(colId - 1);
+  const auto* cs = stripeStats->getColumnStatistics(column.getColumnId());
+  return cs->getNumberOfValues() == 0 && cs->hasNull();
+}
+
+bool OrcPredicates::hasNull(int32_t colId) const {
+  disableInvalidColId(colId);  // unsupported column ids are rejected first
+  const Type& column = *reader->getType().getSubtype(colId - 1);
+  return stripeStats->getColumnStatistics(column.getColumnId())->hasNull();
+}
+
+univplan::PredicateStats OrcPredicates::getMinMax(int32_t colId) const {
+  disableInvalidColId(colId);
+  dbcommon::Timestamp lo, hi;  // scratch storage for timestamp bounds
+  return getMinMax(colId, &lo, &hi);  // NOTE(review): lo/hi die on return
+}
+
+univplan::PredicateStats OrcPredicates::getMinMax(
+    int32_t colId, dbcommon::Timestamp* minTimestamp,
+    dbcommon::Timestamp* maxTimestamp) const {
+  // Min/max predicate statistics for column colId (used as index colId - 1);
+  // hasMinMax is cleared for unsupported types. Timestamp bounds are written
+  // to the caller-owned *minTimestamp / *maxTimestamp that the datums wrap.
+  disableInvalidColId(colId);
+  const Type& child = *reader->getType().getSubtype(colId - 1);
+  const auto* stats = stripeStats->getColumnStatistics(child.getColumnId());
+  dbcommon::TypeKind type = td->getColumnType(colId - 1);
+  univplan::PredicateStats ret;
+  ret.hasMinMax = true;
+  switch (type) {
+    case dbcommon::TypeKind::SMALLINTID:
+    case dbcommon::TypeKind::INTID:
+    case dbcommon::TypeKind::BIGINTID:
+    case dbcommon::TypeKind::TIMEID: {
+      const IntegerColumnStatisticsImpl* iStat =
+          dynamic_cast<const IntegerColumnStatisticsImpl*>(stats);
+      if (type == dbcommon::TypeKind::SMALLINTID) {
+        ret.minValue = dbcommon::Scalar(
+            dbcommon::CreateDatum(static_cast<int16_t>(iStat->getMinimum())),
+            false);
+        ret.maxValue = dbcommon::Scalar(
+            dbcommon::CreateDatum(static_cast<int16_t>(iStat->getMaximum())),
+            false);
+      } else if (type == dbcommon::TypeKind::INTID) {  // DATEID: own case below
+        ret.minValue = dbcommon::Scalar(
+            dbcommon::CreateDatum(static_cast<int32_t>(iStat->getMinimum())),
+            false);
+        ret.maxValue = dbcommon::Scalar(
+            dbcommon::CreateDatum(static_cast<int32_t>(iStat->getMaximum())),
+            false);
+      } else {
+        ret.minValue =
+            dbcommon::Scalar(dbcommon::CreateDatum(iStat->getMinimum()), false);
+        ret.maxValue =
+            dbcommon::Scalar(dbcommon::CreateDatum(iStat->getMaximum()), false);
+      }
+      break;
+    }
+    case dbcommon::TypeKind::FLOATID:
+    case dbcommon::TypeKind::DOUBLEID: {
+      const DoubleColumnStatisticsImpl* dStat =
+          dynamic_cast<const DoubleColumnStatisticsImpl*>(stats);
+      if (type == dbcommon::TypeKind::FLOATID) {
+        ret.minValue = dbcommon::Scalar(
+            dbcommon::CreateDatum(static_cast<float>(dStat->getMinimum())),
+            false);
+        ret.maxValue = dbcommon::Scalar(
+            dbcommon::CreateDatum(static_cast<float>(dStat->getMaximum())),
+            false);
+      } else {
+        ret.minValue =
+            dbcommon::Scalar(dbcommon::CreateDatum(dStat->getMinimum()), false);
+        ret.maxValue =
+            dbcommon::Scalar(dbcommon::CreateDatum(dStat->getMaximum()), false);
+      }
+      break;
+    }
+    case dbcommon::TypeKind::CHARID: {
+      // Trailing blanks are excluded from the reported length of both bounds.
+      const StringColumnStatisticsImpl* sStat =
+          dynamic_cast<const StringColumnStatisticsImpl*>(stats);
+      ret.minValue =
+          dbcommon::Scalar(dbcommon::CreateDatum(sStat->getMinimum()), false);
+      const char* s = sStat->getMinimum();
+      uint32_t len = strlen(s);
+      while (len != 0 && s[len - 1] == ' ') --len;
+      ret.minValue.length = len;
+      ret.maxValue =
+          dbcommon::Scalar(dbcommon::CreateDatum(sStat->getMaximum()), false);
+      s = sStat->getMaximum();
+      len = strlen(s);
+      while (len != 0 && s[len - 1] == ' ') --len;
+      ret.maxValue.length = len;
+      break;
+    }
+    case dbcommon::TypeKind::VARCHARID:
+    case dbcommon::TypeKind::STRINGID: {
+      const StringColumnStatisticsImpl* sStat =
+          dynamic_cast<const StringColumnStatisticsImpl*>(stats);
+      ret.minValue =
+          dbcommon::Scalar(dbcommon::CreateDatum(sStat->getMinimum()), false);
+      ret.minValue.length = strlen(sStat->getMinimum());
+      ret.maxValue =
+          dbcommon::Scalar(dbcommon::CreateDatum(sStat->getMaximum()), false);
+      ret.maxValue.length = strlen(sStat->getMaximum());
+      break;
+    }
+    case dbcommon::TypeKind::BOOLEANID: {  // min is true only without falses
+      const BooleanColumnStatisticsImpl* bStat =
+          dynamic_cast<const BooleanColumnStatisticsImpl*>(stats);
+      ret.minValue = dbcommon::Scalar(
+          dbcommon::CreateDatum(bStat->getFalseCount() == 0), false);
+      ret.maxValue = dbcommon::Scalar(
+          dbcommon::CreateDatum(bStat->getTrueCount() > 0), false);
+      break;
+    }
+    case dbcommon::TypeKind::DATEID: {
+      const DateColumnStatisticsImpl* dStat =
+          dynamic_cast<const DateColumnStatisticsImpl*>(stats);
+      ret.minValue = dbcommon::Scalar(
+          dbcommon::CreateDatum(static_cast<int32_t>(dStat->getMinimum())),
+          false);
+      ret.maxValue = dbcommon::Scalar(
+          dbcommon::CreateDatum(static_cast<int32_t>(dStat->getMaximum())),
+          false);
+      break;
+    }
+    case dbcommon::TypeKind::TIMESTAMPID:
+    case dbcommon::TypeKind::TIMESTAMPTZID: {  // stats are split by 1000 here
+      const TimestampColumnStatisticsImpl* tStat =
+          dynamic_cast<const TimestampColumnStatisticsImpl*>(stats);
+      minTimestamp->second = tStat->getMinimum() / 1000;
+      minTimestamp->nanosecond = (tStat->getMinimum() % 1000) * 1000000;
+      maxTimestamp->second = tStat->getMaximum() / 1000;
+      maxTimestamp->nanosecond =  // NOTE(review): % is negative before epoch
+          (tStat->getMaximum() % 1000) * 1000000 + 999999;
+      ret.minValue =
+          dbcommon::Scalar(dbcommon::CreateDatum(minTimestamp), false);
+      ret.minValue.length = sizeof(dbcommon::Timestamp);
+      ret.maxValue =
+          dbcommon::Scalar(dbcommon::CreateDatum(maxTimestamp), false);
+      ret.maxValue.length = sizeof(dbcommon::Timestamp);
+      break;
+    }
+    case dbcommon::TypeKind::DECIMALID: {
+      const DecimalColumnStatisticsImpl* dStat =
+          dynamic_cast<const DecimalColumnStatisticsImpl*>(stats);
+      ret.minValue = dbcommon::Scalar(
+          dbcommon::CreateDatum(dStat->getMinimumStr()), false);
+      ret.minValue.length = sizeof(dbcommon::DecimalVar);
+      ret.maxValue = dbcommon::Scalar(
+          dbcommon::CreateDatum(dStat->getMaximumStr()), false);
+      ret.maxValue.length = sizeof(dbcommon::DecimalVar);
+      break;
+    }
+    default:
+      ret.hasMinMax = false;
+  }
+  return ret;
+}
+
+bool OrcPredicates::canDropByBloomFilter(int32_t colId,
+                                         univplan::PredicateStats* stat,
+                                         dbcommon::TypeKind type) const {
+  // Tests stat->maxValue against every bloom filter rebuilt for the column.
+  // Returns true only when bloom filters exist and none of them may contain
+  // the value; any possible hit, or the absence of filters, yields false.
+  // Unsupported types raise ERRCODE_FEATURE_NOT_SUPPORTED.
+  disableInvalidColId(colId);
+  const Type& child = *reader->getType().getSubtype(colId - 1);
+  proto::BloomFilterIndex bloomFilterIndexProto =
+      reader->rebuildBloomFilter(child.getColumnId());
+  if (bloomFilterIndexProto.bloomfilter_size() == 0) return false;
+
+  for (int32_t i = 0; i < bloomFilterIndexProto.bloomfilter_size(); ++i) {
+    const proto::BloomFilter& bloomFilterProto =
+        bloomFilterIndexProto.bloomfilter(i);
+    std::vector<uint64_t> data;
+    data.reserve(bloomFilterProto.bitset_size());  // one allocation per filter
+    for (int32_t j = 0; j < bloomFilterProto.bitset_size(); ++j)
+      data.push_back(bloomFilterProto.bitset(j));
+    storage::BloomFilter::uptr bf(new storage::BloomFilter(
+        data.data(), data.size(), bloomFilterProto.numhashfunctions()));
+    switch (type) {
+      case dbcommon::TypeKind::SMALLINTID:
+        if (bf->testInt(dbcommon::DatumGetValue<int16_t>(stat->maxValue.value)))
+          return false;
+        break;
+      case dbcommon::TypeKind::INTID:
+      case dbcommon::TypeKind::DATEID:
+        if (bf->testInt(dbcommon::DatumGetValue<int32_t>(stat->maxValue.value)))
+          return false;
+        break;
+      case dbcommon::TypeKind::BIGINTID:
+      case dbcommon::TypeKind::TIMEID:
+        if (bf->testInt(dbcommon::DatumGetValue<int64_t>(stat->maxValue.value)))
+          return false;
+        break;
+      case dbcommon::TypeKind::FLOATID:
+        if (bf->testDouble(
+                dbcommon::DatumGetValue<float>(stat->maxValue.value)))
+          return false;
+        break;
+      case dbcommon::TypeKind::DOUBLEID:
+        if (bf->testDouble(
+                dbcommon::DatumGetValue<double>(stat->maxValue.value)))
+          return false;
+        break;
+      case dbcommon::TypeKind::CHARID:
+      case dbcommon::TypeKind::VARCHARID:
+      case dbcommon::TypeKind::STRINGID: {
+        // NOTE(review): length comes from strlen, not stat->maxValue.length.
+        const char* str =
+            dbcommon::DatumGetValue<const char*>(stat->maxValue.value);
+        if (bf->testString(str, strlen(str))) return false;
+        break;
+      }
+      case dbcommon::TypeKind::TIMESTAMPID:
+      case dbcommon::TypeKind::TIMESTAMPTZID: {
+        dbcommon::Timestamp* ts =
+            dbcommon::DatumGetValue<dbcommon::Timestamp*>(stat->maxValue.value);
+        if (bf->testInt(ts->second * 1000 + ts->nanosecond / 1000000))
+          return false;
+        break;
+      }
+      default:
+        LOG_ERROR(
+            ERRCODE_FEATURE_NOT_SUPPORTED,
+            "not supported type %d in OrcPredicates::canDropByBloomFilter",
+            static_cast<int>(type));  // %d needs an int, not the enum itself
+    }
+  }
+
+  return true;
+}
+
+void OrcPredicates::disableInvalidColId(int32_t colId) const {
+  // Reject ids that cannot be indexed as colId - 1 (hidden columns and 0).
+  if (colId <= 0) LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+                            "hidden column doesn't support predicate");
+}
+
+} // end of namespace orc
diff --git a/depends/storage/src/storage/format/orc/orc-predicates.h b/depends/storage/src/storage/format/orc/orc-predicates.h
new file mode 100644
index 0000000..0da9029
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-predicates.h
@@ -0,0 +1,71 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_PREDICATES_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_PREDICATES_H_
+
+#include <string>
+#include <vector>
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/nodes/datum.h"
+
+#include "storage/format/orc/orc-proto-definition.h"
+
+#include "univplan/common/expression.h"
+#include "univplan/common/univplan-type.h"
+#include "univplan/common/var-util.h"
+#include "univplan/minmax/minmax-predicates.h"
+
+namespace orc {
+
+class ReaderImpl;
+class OrcPredicates : public univplan::MinMaxPredicatesPage {
+ public:
+  using uptr = std::unique_ptr<OrcPredicates>;
+
+  // s, predicateExprs and tupleDesc are forwarded to MinMaxPredicatesPage;
+  // r is kept as a plain pointer and is not deleted by this class.
+  OrcPredicates(const univplan::Statistics* s, ReaderImpl* r,
+                const univplan::UnivPlanExprPolyList* predicateExprs,
+                const dbcommon::TupleDesc* tupleDesc)
+      : univplan::MinMaxPredicatesPage(s, predicateExprs, tupleDesc),
+        reader(r) {}
+  virtual ~OrcPredicates() {}
+
+  // Statistics queries; implementations index the column with colId - 1.
+  virtual bool hasNull(int32_t colId) const;
+  virtual bool hasAllNull(int32_t colId) const;
+  virtual bool canDropByBloomFilter(int32_t colId,
+                                    univplan::PredicateStats* stat,
+                                    dbcommon::TypeKind type) const;
+  virtual univplan::PredicateStats getMinMax(int32_t colId) const;
+  virtual univplan::PredicateStats getMinMax(
+      int32_t colId, dbcommon::Timestamp* minTimestamp,
+      dbcommon::Timestamp* maxTimestamp) const;
+
+ private:
+  void disableInvalidColId(int32_t colId) const;  // rejects unsupported ids
+  ReaderImpl* reader;
+};
+
+} // end of namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_PREDICATES_H_
diff --git a/depends/storage/src/storage/format/orc/orc-proto-definition.cc b/depends/storage/src/storage/format/orc/orc-proto-definition.cc
new file mode 100644
index 0000000..9c6caa1
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-proto-definition.cc
@@ -0,0 +1,221 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/orc-proto-definition.h"
+#include "storage/format/orc/seekable-input-stream.h"
+
+namespace orc {
+
+// Out-of-line destructor of the StreamInformation interface.
+// Nothing is released here, so the compiler-generated body is used.
+StreamInformation::~StreamInformation() = default;
+
+// Out-of-line destructor of the StripeInformation interface.
+// Nothing is released here, so the compiler-generated body is used.
+StripeInformation::~StripeInformation() = default;
+
+void StripeInformationImpl::ensureStripeFooterLoaded() const {
+  // Lazily reads and parses this stripe's footer; later calls are no-ops.
+  if (stripeFooter.get() != nullptr) return;
+  std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+      compression,
+      std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
+          stream, offset + indexLength + dataLength, footerLength, memoryPool)),
+      blockSize, memoryPool);
+  // Parse into a local so a failed parse never leaves a footer cached.
+  std::unique_ptr<proto::StripeFooter> parsed(new proto::StripeFooter());
+  if (!parsed->ParseFromZeroCopyStream(pbStream.get()))
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failed to parse the stripe footer");
+  stripeFooter.reset(parsed.release());
+}
+
+std::unique_ptr<StreamInformation> StripeInformationImpl::getStreamInformation(
+    uint64_t streamId) const {
+  ensureStripeFooterLoaded();
+  uint64_t startOffset = offset;  // stripe start; earlier streams added below
+  for (uint64_t idx = 0; idx != streamId; ++idx)
+    startOffset += stripeFooter->streams(static_cast<int>(idx)).length();
+  const int target = static_cast<int>(streamId);
+  return std::unique_ptr<StreamInformation>(
+      new StreamInformationImpl(startOffset, stripeFooter->streams(target)));
+}
+
+StatisticsImpl::~StatisticsImpl() {
+  // colStats holds raw pointers that this object deletes one by one.
+  for (univplan::ColumnStatistics* stat : colStats) {
+    delete stat;
+  }
+}
+
+univplan::ColumnStatistics* convertColumnStatistics(
+    const proto::ColumnStatistics& s, bool correctStats) {
+  // Returns a heap-allocated statistics object chosen by whichever typed
+  // sub-message is present in s; plain ColumnStatisticsImpl is the fallback.
+  // Integer and double statistics do not take the correctStats flag.
+  // The result is a raw pointer created with new; this function keeps no copy.
+  if (s.has_intstatistics()) return new IntegerColumnStatisticsImpl(s);
+  if (s.has_doublestatistics()) return new DoubleColumnStatisticsImpl(s);
+  if (s.has_stringstatistics())
+    return new StringColumnStatisticsImpl(s, correctStats);
+  if (s.has_bucketstatistics())
+    return new BooleanColumnStatisticsImpl(s, correctStats);
+  if (s.has_decimalstatistics())
+    return new DecimalColumnStatisticsImpl(s, correctStats);
+  if (s.has_timestampstatistics())
+    return new TimestampColumnStatisticsImpl(s, correctStats);
+  if (s.has_datestatistics())
+    return new DateColumnStatisticsImpl(s, correctStats);
+  if (s.has_binarystatistics())
+    return new BinaryColumnStatisticsImpl(s, correctStats);
+  return new ColumnStatisticsImpl(s);
+}
+
+// TODO(zhenglin): to complete other types
+std::unique_ptr<ColumnStatisticsImpl> createColumnStatistics(
+    const orc::Type* type) {
+  // Creates an empty statistics collector matching the ORC kind of `type`.
+  // Kinds without a dedicated implementation get the generic
+  // ColumnStatisticsImpl, which only tracks the base counters.
+  using StatsPtr = std::unique_ptr<ColumnStatisticsImpl>;
+  switch (type->getKind()) {
+    // Integer-backed kinds share one implementation.
+    case orc::ORCTypeKind::BYTE:
+    case orc::ORCTypeKind::SHORT:
+    case orc::ORCTypeKind::INT:
+    case orc::ORCTypeKind::LONG:
+    case orc::ORCTypeKind::TIME:
+      return StatsPtr(new IntegerColumnStatisticsImpl());
+    // Floating point kinds.
+    case orc::ORCTypeKind::FLOAT:
+    case orc::ORCTypeKind::DOUBLE:
+      return StatsPtr(new DoubleColumnStatisticsImpl());
+    // Character kinds.
+    case orc::ORCTypeKind::STRING:
+    case orc::ORCTypeKind::VARCHAR:
+    case orc::ORCTypeKind::CHAR:
+      return StatsPtr(new StringColumnStatisticsImpl());
+    // The remaining kinds map one-to-one.
+    case orc::ORCTypeKind::BOOLEAN:
+      return StatsPtr(new BooleanColumnStatisticsImpl());
+    case orc::ORCTypeKind::BINARY:
+      return StatsPtr(new BinaryColumnStatisticsImpl());
+    case orc::ORCTypeKind::DATE:
+      return StatsPtr(new DateColumnStatisticsImpl());
+    case orc::ORCTypeKind::TIMESTAMP:
+      return StatsPtr(new TimestampColumnStatisticsImpl());
+    case orc::ORCTypeKind::DECIMAL:
+      return StatsPtr(new DecimalColumnStatisticsImpl());
+    default:
+      return StatsPtr(new ColumnStatisticsImpl());
+  }
+}
+
+// DateColumnStatisticsImpl releases nothing explicitly, so its out-of-line
+// destructor is simply defaulted.
+DateColumnStatisticsImpl::~DateColumnStatisticsImpl() = default;
+
+// DecimalColumnStatisticsImpl releases nothing explicitly, so its out-of-line
+// destructor is simply defaulted.
+DecimalColumnStatisticsImpl::~DecimalColumnStatisticsImpl() = default;
+
+// TimestampColumnStatisticsImpl releases nothing explicitly, so its
+// out-of-line destructor is simply defaulted.
+TimestampColumnStatisticsImpl::~TimestampColumnStatisticsImpl() = default;
+
+DateColumnStatisticsImpl::DateColumnStatisticsImpl(
+    const proto::ColumnStatistics& pb, bool correctStats) {
+  // Bounds are taken from pb only when date statistics exist and are trusted.
+  valueCount = pb.numberofvalues();
+  // Absent hasNull flag => assume NULLs may exist, as ColumnStatisticsImpl does.
+  hasNullValue = !pb.has_hasnull() || pb.hasnull();
+  if (!pb.has_datestatistics() || !correctStats) {
+    _hasStats = false;
+    minimum = maximum = 0;
+  } else {
+    _hasStats = pb.datestatistics().has_minimum();
+    minimum = pb.datestatistics().minimum();
+    maximum = pb.datestatistics().maximum();
+  }
+}
+
+DecimalColumnStatisticsImpl::DecimalColumnStatisticsImpl(
+    const proto::ColumnStatistics& pb, bool correctStats) {
+  // Min/max/sum are taken from pb only when decimal statistics are trusted.
+  valueCount = pb.numberofvalues();
+  // A missing hasNull flag is read as "may contain NULLs", the same default
+  // the ColumnStatisticsImpl(proto) constructor applies.
+  hasNullValue = !pb.has_hasnull() || pb.hasnull();
+  if (!pb.has_decimalstatistics() || !correctStats) {
+    _hasMinimum = _hasMaximum = _hasSum = false;
+  } else {
+    const proto::DecimalStatistics& stats = pb.decimalstatistics();
+    _hasMinimum = stats.has_minimum();
+    _hasMaximum = stats.has_maximum();
+    _hasSum = stats.has_sum();
+    minimum = stats.minimum();
+    maximum = stats.maximum();
+    sum = stats.sum();
+  }
+}
+
+void DecimalColumnStatisticsImpl::updateSum(Decimal value) {  // folds value into the running sum, which is stored as text in `sum`
+ bool overflow = false;  // only the scale-up helper receives this; presumably it sets it by reference - confirm
+ Decimal currentSum = this->getSum();
+
+ if (currentSum.scale > value.scale) {  // bring both operands to the larger of the two scales
+ value.value = scaleUpInt128ByPowerOfTen(
+ value.value, currentSum.scale - value.scale, overflow);
+ } else if (currentSum.scale < value.scale) {
+ currentSum.value = scaleUpInt128ByPowerOfTen(
+ currentSum.value, value.scale - currentSum.scale, overflow);
+ currentSum.scale = value.scale;
+ }
+
+ if (!overflow) {
+ bool wasPositive = currentSum.value >= 0;
+ currentSum.value += value.value;
+ if ((value.value >= 0) == wasPositive) {  // same-sign operands: a sign change of the result marks an overflow
+ _hasSum = ((currentSum.value >= 0) == wasPositive);
+ }
+ } else {  // rescaling overflowed, so the sum is dropped
+ _hasSum = false;
+ }
+
+ if (_hasSum) {  // NOTE(review): no early exit when _hasSum is already false on entry - confirm getSum() is safe then
+ sum = currentSum.toString();
+ }
+}
+
+TimestampColumnStatisticsImpl::TimestampColumnStatisticsImpl(
+    const proto::ColumnStatistics& pb, bool correctStats) {
+  // Bounds are taken from pb only when timestamp statistics are trusted.
+  valueCount = pb.numberofvalues();
+  // Absent hasNull flag => assume NULLs may exist, as ColumnStatisticsImpl does.
+  hasNullValue = !pb.has_hasnull() || pb.hasnull();
+  if (!pb.has_timestampstatistics() || !correctStats) {
+    _hasStats = false;
+    minimum = maximum = 0;
+  } else {
+    const proto::TimestampStatistics& stats = pb.timestampstatistics();
+    _hasStats = stats.has_minimum();
+    minimum = stats.minimum();
+    maximum = stats.maximum();
+  }
+}
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/orc-proto-definition.h b/depends/storage/src/storage/format/orc/orc-proto-definition.h
new file mode 100644
index 0000000..47b67cd
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc-proto-definition.h
@@ -0,0 +1,1131 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_PROTO_DEFINITION_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_PROTO_DEFINITION_H_
+
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+
+#include "univplan/common/statistics.h"
+
+#include "storage/format/orc/input-stream.h"
+#include "storage/format/orc/type-impl.h"
+#include "storage/format/orc/vector.h"
+
+namespace orc {
+
+static const uint64_t ORC_COMPRESSION_BLOCK_SIZE = 256 * 1024; // 256K
+
+enum WriterId {  // NOTE(review): presumably identifies the software that wrote a file - confirm against the ORC spec
+ ORC_JAVA_WRITER = 0,
+ ORC_CPP_WRITER = 1,
+ PRESTO_WRITER = 2,
+ UNKNOWN_WRITER = INT32_MAX  // catch-all value: the largest 32-bit signed integer
+};
+
+enum CompressionKind {  // compression codecs by numeric id; NOTE(review): presumably mirrors the ORC protobuf enum - confirm
+ CompressionKind_NONE = 0,
+ CompressionKind_ZLIB = 1,
+ CompressionKind_SNAPPY = 2,
+ CompressionKind_LZO = 3,
+ CompressionKind_LZ4 = 4,
+ CompressionKind_ZSTD = 5,
+ CompressionKind_MAX = INT64_MAX  // upper sentinel; its value requires a 64-bit underlying type
+};
+
+enum WriterVersion {  // NOTE(review): names look like upstream issue ids (HIVE-xxxx / ORC-xxx) - confirm
+ WriterVersion_ORIGINAL = 0,
+ WriterVersion_HIVE_8732 = 1,
+ WriterVersion_HIVE_4243 = 2,
+ WriterVersion_HIVE_12055 = 3,
+ WriterVersion_HIVE_13083 = 4,
+ WriterVersion_ORC_101 = 5,
+ WriterVersion_ORC_135 = 6,
+ WriterVersion_MAX = INT64_MAX  // upper sentinel; its value requires a 64-bit underlying type
+};
+
+enum StreamKind {  // kinds of streams stored in a stripe; NOTE(review): values presumably match the ORC protobuf Stream.Kind - confirm
+ StreamKind_PRESENT = 0,
+ StreamKind_DATA = 1,
+ StreamKind_LENGTH = 2,
+ StreamKind_DICTIONARY_DATA = 3,
+ StreamKind_DICTIONARY_COUNT = 4,
+ StreamKind_SECONDARY = 5,
+ StreamKind_ROW_INDEX = 6,
+ StreamKind_BLOOM_FILTER = 7  // last value defined here
+};
+
+class ColumnStatisticsImpl : public univplan::ColumnStatistics {
+ public:
+  // Starts from the state produced by reset().
+  ColumnStatisticsImpl() { reset(); }
+
+  // Loads the generic counters from a protobuf message. A missing value
+  // count becomes 0; a missing hasNull flag is conservatively taken as true.
+  explicit ColumnStatisticsImpl(const proto::ColumnStatistics& stats) {
+    valueCount = stats.has_numberofvalues() ? stats.numberofvalues() : 0;
+    hasNullValue = stats.has_hasnull() ? stats.hasnull() : true;
+  }
+
+  virtual ~ColumnStatisticsImpl() {}
+
+  // Writes the value count and the hasNull flag into *pb (must be non-null).
+  virtual void serialize(proto::ColumnStatistics* pb) {
+    assert(pb != nullptr);
+    pb->set_numberofvalues(valueCount);
+    pb->set_hasnull(hasNullValue);
+  }
+};
+
+// Min/max/sum statistics for integer columns on top of ColumnStatisticsImpl.
+class IntegerColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasStats;  // true once minimum/maximum/sum carry real data
+  int64_t minimum;
+  int64_t maximum;
+  int64_t sum;
+
+ public:
+  IntegerColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+  explicit IntegerColumnStatisticsImpl(const proto::ColumnStatistics& stats)
+      : ColumnStatisticsImpl(stats) {
+    // Defined defaults first, so a message without bounds leaves no garbage.
+    resetInternal();
+    if (stats.has_intstatistics()) {
+      const proto::IntegerStatistics& s = stats.intstatistics();
+      _hasStats = s.has_minimum();
+      if (_hasStats) {
+        minimum = s.minimum();
+        maximum = s.maximum();
+        sum = s.sum();
+      }
+    }
+  }
+  virtual ~IntegerColumnStatisticsImpl() {}
+
+  // Minimum, maximum and sum are all governed by the single _hasStats flag.
+  bool hasMinimum() const { return _hasStats; }
+  bool hasMaximum() const { return _hasStats; }
+  bool hasSum() const { return _hasStats; }
+  // Getters report ERRCODE_INTERNAL_ERROR through LOG_ERROR when unset.
+  int64_t getMinimum() const {
+    if (_hasStats) {
+      return minimum;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+  }
+
+  int64_t getMaximum() const {
+    if (_hasStats) {
+      return maximum;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+  }
+
+  int64_t getSum() const {
+    if (_hasStats) {
+      return sum;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Sum is not defined.");
+    }
+  }
+  // Folds one value into the bounds and the sum (no overflow guard on sum).
+  void updateInteger(int64_t value) {
+    if (!_hasStats) {
+      _hasStats = true;
+      minimum = value;
+      maximum = value;
+    } else if (value < minimum) {
+      minimum = value;
+    } else if (value > maximum) {
+      maximum = value;
+    }
+    sum += value;
+  }
+  // Merges another IntegerColumnStatisticsImpl; the type is checked by assert.
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const IntegerColumnStatisticsImpl* other =
+        dynamic_cast<const IntegerColumnStatisticsImpl*>(&stats);
+    assert(other != nullptr);
+    if (other->hasMinimum()) {
+      if (!_hasStats) {
+        _hasStats = other->hasMinimum();
+        minimum = other->getMinimum();
+        maximum = other->getMaximum();
+        sum = other->getSum();
+      } else {
+        if (other->getMinimum() < minimum) {
+          minimum = other->getMinimum();
+        }
+        if (other->getMaximum() > maximum) {
+          maximum = other->getMaximum();
+        }
+        sum += other->getSum();
+      }
+    }
+  }
+  // Serializes the base counters, plus min/max/sum when present, into *pb.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::IntegerStatistics* stats = pb->mutable_intstatistics();
+    if (_hasStats) {
+      stats->set_minimum(minimum);
+      stats->set_maximum(maximum);
+      stats->set_sum(sum);
+    }
+  }
+  // Clears the base-class state and the integer-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  void resetInternal() {
+    _hasStats = false;
+    minimum = 0;
+    maximum = 0;
+    sum = 0;
+  }
+};
+
+// Min/max/sum statistics for floating point columns.
+class DoubleColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasStats;  // true once minimum/maximum/sum carry real data
+  double minimum;
+  double maximum;
+  double sum;
+
+ public:
+  DoubleColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+  explicit DoubleColumnStatisticsImpl(const proto::ColumnStatistics& stats)
+      : ColumnStatisticsImpl(stats) {
+    // Defined defaults first, so a message without bounds leaves no garbage.
+    resetInternal();
+    if (stats.has_doublestatistics()) {
+      const proto::DoubleStatistics& s = stats.doublestatistics();
+      _hasStats = s.has_minimum();
+      if (_hasStats) {
+        minimum = s.minimum();
+        maximum = s.maximum();
+        sum = s.sum();
+      }
+    }
+  }
+  virtual ~DoubleColumnStatisticsImpl() {}
+
+  // Minimum, maximum and sum are all governed by the single _hasStats flag.
+  bool hasMinimum() const { return _hasStats; }
+  bool hasMaximum() const { return _hasStats; }
+  bool hasSum() const { return _hasStats; }
+  // Getters report ERRCODE_INTERNAL_ERROR through LOG_ERROR when unset.
+  double getMinimum() const {
+    if (_hasStats) {
+      return minimum;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+  }
+
+  double getMaximum() const {
+    if (_hasStats) {
+      return maximum;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+  }
+
+  double getSum() const {
+    if (_hasStats) {
+      return sum;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Sum is not defined.");
+    }
+  }
+  // Folds one value into the bounds and the sum.
+  void updateDouble(double value) {
+    if (!_hasStats) {
+      _hasStats = true;
+      minimum = value;
+      maximum = value;
+    } else if (value < minimum) {
+      minimum = value;
+    } else if (value > maximum) {
+      maximum = value;
+    }
+    sum += value;
+  }
+  // Merges another DoubleColumnStatisticsImpl; the type is checked by assert.
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const DoubleColumnStatisticsImpl* other =
+        dynamic_cast<const DoubleColumnStatisticsImpl*>(&stats);
+    assert(other != nullptr);
+    if (other->hasMinimum()) {
+      if (!_hasStats) {
+        _hasStats = other->hasMinimum();
+        minimum = other->getMinimum();
+        maximum = other->getMaximum();
+        sum = other->getSum();
+      } else {
+        if (other->getMinimum() < minimum) {
+          minimum = other->getMinimum();
+        }
+        if (other->getMaximum() > maximum) {
+          maximum = other->getMaximum();
+        }
+        sum += other->getSum();
+      }
+    }
+  }
+  // Serializes the base counters, plus min/max/sum when present, into *pb.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::DoubleStatistics* stats = pb->mutable_doublestatistics();
+    if (_hasStats) {
+      stats->set_minimum(minimum);
+      stats->set_maximum(maximum);
+      stats->set_sum(sum);
+    }
+  }
+  // Clears the base-class state and the double-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  void resetInternal() {
+    _hasStats = false;
+    minimum = 0;
+    maximum = 0;
+    sum = 0;
+  }
+};
+
+// Min/max/total-length statistics for string, varchar and char columns.
+class StringColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasStats;  // true once minimum/maximum/totalLength carry real data
+  std::string minimum;
+  std::string maximum;
+  int64_t totalLength;
+
+ public:
+  StringColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+  StringColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+                             bool correctStats)
+      : ColumnStatisticsImpl(stats) {
+    // Defined defaults first, so totalLength is never left uninitialized.
+    resetInternal();
+    if (stats.has_stringstatistics() && correctStats) {
+      const proto::StringStatistics& s = stats.stringstatistics();
+      _hasStats = s.has_minimum();
+      if (_hasStats) {
+        minimum = s.minimum();
+        maximum = s.maximum();
+        totalLength = s.sum();
+      }
+    }
+  }
+  virtual ~StringColumnStatisticsImpl() {}
+
+  // All three values are governed by the single _hasStats flag.
+  bool hasMinimum() const { return _hasStats; }
+  bool hasMaximum() const { return _hasStats; }
+  bool hasTotalLength() const { return _hasStats; }
+  // Getters report ERRCODE_INTERNAL_ERROR through LOG_ERROR when unset.
+  const char* getMinimum() const {
+    if (_hasStats) {
+      return minimum.c_str();
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+  }
+
+  const char* getMaximum() const {
+    if (_hasStats) {
+      return maximum.c_str();
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+  }
+
+  int64_t getTotalLength() const {
+    if (_hasStats) {
+      return totalLength;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Total length is not defined.");
+    }
+  }
+  // Folds one value of len bytes into the bounds and the total length.
+  void updateString(const char* buffer, uint64_t len) {
+    std::string text(buffer, len);
+    if (!_hasStats) {
+      _hasStats = true;
+      maximum = minimum = text;
+    } else if (minimum > text) {
+      minimum = text;
+    } else if (maximum < text) {
+      maximum = text;
+    }
+    totalLength += len;
+  }
+  // Merges another StringColumnStatisticsImpl; bounds travel as C strings.
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const StringColumnStatisticsImpl* other =
+        dynamic_cast<const StringColumnStatisticsImpl*>(&stats);
+    assert(other != nullptr);
+    if (other->hasMinimum()) {
+      if (!_hasStats) {
+        _hasStats = other->hasMinimum();
+        minimum = other->getMinimum();
+        maximum = other->getMaximum();
+        totalLength = other->getTotalLength();
+      } else {
+        if (other->getMinimum() < minimum) {
+          minimum = other->getMinimum();
+        }
+        if (other->getMaximum() > maximum) {
+          maximum = other->getMaximum();
+        }
+        totalLength += other->getTotalLength();
+      }
+    }
+  }
+  // Serializes the base counters, plus the string values when present.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::StringStatistics* stats = pb->mutable_stringstatistics();
+    if (_hasStats) {
+      stats->set_minimum(minimum);
+      stats->set_maximum(maximum);
+      stats->set_sum(totalLength);
+    }
+  }
+  // Clears the base-class state and the string-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  void resetInternal() {
+    _hasStats = false;
+    minimum.clear();
+    maximum.clear();
+    totalLength = 0;
+  }
+};
+
+class StreamInformation {  // read-only description of one stream; every accessor is pure virtual
+ public:
+ virtual ~StreamInformation();  // defined out of line in orc-proto-definition.cc
+
+ virtual StreamKind getKind() const = 0;  // one of the StreamKind enumerators
+ virtual uint64_t getColumnId() const = 0;
+ virtual uint64_t getOffset() const = 0;  // NOTE(review): presumably an absolute file offset, see getStreamInformation - confirm
+ virtual uint64_t getLength() const = 0;
+};
+
+class StripeInformation {  // read-only description of one stripe; every accessor is pure virtual
+ public:
+ virtual ~StripeInformation();  // defined out of line in orc-proto-definition.cc
+
+ // Get the byte offset of the start of the stripe.
+ // @return the bytes from the start of the file
+ virtual uint64_t getOffset() const = 0;
+
+ // Get the total length of the stripe in bytes.
+ // @return the number of bytes in the stripe
+ virtual uint64_t getLength() const = 0;
+
+ // Get the length of the stripe's indexes.
+ // @return the number of bytes in the index
+ virtual uint64_t getIndexLength() const = 0;
+
+ // Get the length of the stripe's data.
+ // @return the number of bytes in the data section
+ virtual uint64_t getDataLength() const = 0;
+
+ // Get the length of the stripe's tail section (the stripe footer).
+ // @return the number of bytes in the tail
+ virtual uint64_t getFooterLength() const = 0;
+
+ // Get the number of rows in the stripe.
+ // @return a count of the number of rows
+ virtual uint64_t getNumberOfRows() const = 0;
+
+ // Get the number of streams in the stripe.
+ virtual uint64_t getNumberOfStreams() const = 0;
+
+ // Get the StreamInformation for the given stream.
+ virtual std::unique_ptr<StreamInformation> getStreamInformation(
+ uint64_t streamId) const = 0;
+
+ // Get the dictionary size.
+ // @param colId the columnId
+ // @return the size of the dictionary or 0 if there isn't one
+ virtual uint64_t getDictionarySize(uint64_t colId) const = 0;
+
+ // Get the writer timezone.
+ virtual const std::string& getWriterTimezone() const = 0;
+};
+
+// Column statistics for binary columns. In addition to the common
+// statistics kept by ColumnStatisticsImpl it tracks the total length of the
+// values seen.
+class BinaryColumnStatisticsImpl : public ColumnStatisticsImpl {
+ public:
+  BinaryColumnStatisticsImpl() { resetInternal(); }
+
+  // Builds the statistics from their serialized form.
+  // @param stats the serialized column statistics
+  // @param correctStats when false the binary-specific part of stats is
+  //        ignored
+  BinaryColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+                             bool correctStats)
+      : ColumnStatisticsImpl(stats) {
+    // Start from a defined state so totalLength is never left
+    // uninitialized. Only the binary-specific fields are cleared, so
+    // whatever the base-class constructor took from stats stays intact.
+    resetInternal();
+    if (stats.has_binarystatistics() && correctStats) {
+      const proto::BinaryStatistics& s = stats.binarystatistics();
+      _hasStats = s.has_sum();
+      if (_hasStats) {
+        totalLength = s.sum();
+      }
+    }
+  }
+  virtual ~BinaryColumnStatisticsImpl() {}
+
+  // @return true once a total length has been recorded or loaded
+  bool hasTotalLength() const { return _hasStats; }
+
+  // @return the accumulated length; an internal error is reported through
+  //         LOG_ERROR when no length is available
+  uint64_t getTotalLength() const {
+    if (_hasStats) {
+      return totalLength;
+    } else {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Total length is not defined.");
+    }
+  }
+
+  // Records one more value of the given length.
+  void update(size_t length) {
+    _hasStats = true;
+    totalLength += length;
+  }
+
+  // Merges the common statistics through the base class and adds the other
+  // object's total length, if it has one, to this one.
+  // @param stats must be a BinaryColumnStatisticsImpl (asserted)
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const BinaryColumnStatisticsImpl* other =
+        dynamic_cast<const BinaryColumnStatisticsImpl*>(&stats);
+    assert(other != nullptr);
+    if (other->hasTotalLength()) {
+      if (!_hasStats) {
+        _hasStats = other->hasTotalLength();
+        totalLength = other->getTotalLength();
+      } else {
+        totalLength += other->getTotalLength();
+      }
+    }
+  }
+
+  // Writes the common statistics through the base class, then the binary
+  // section; its sum is only set when a total length is available.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+
+    proto::BinaryStatistics* binStats = pb->mutable_binarystatistics();
+    if (_hasStats) {
+      binStats->set_sum(totalLength);
+    }
+  }
+
+  // Clears the base-class statistics and the binary-specific state, so a
+  // reset through the base interface no longer leaves totalLength stale.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  // Clears only the fields owned by this class. It must not reset the base
+  // class because it also runs inside the constructors.
+  void resetInternal() {
+    _hasStats = false;
+    totalLength = 0;
+  }
+
+  bool _hasStats;
+  int64_t totalLength;
+};
+
+// Column statistics for boolean columns. Only the number of true values is
+// stored; the number of false values is derived from the value count.
+class BooleanColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasCount;
+  uint64_t trueCount;
+
+ public:
+  BooleanColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+
+  // Builds the statistics from their serialized form. The true count is
+  // taken from the first bucket count, provided the bucket statistics are
+  // present, non-empty and correctStats is true.
+  BooleanColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+                              bool correctStats)
+      : ColumnStatisticsImpl(stats) {
+    const bool usable = stats.has_bucketstatistics() && correctStats &&
+                        stats.bucketstatistics().count_size() != 0;
+    if (usable) {
+      _hasCount = true;
+      trueCount = stats.bucketstatistics().count(0);
+    } else {
+      resetInternal();
+    }
+  }
+  virtual ~BooleanColumnStatisticsImpl() {}
+
+  // @return true once a count has been recorded or loaded
+  bool hasCount() const { return _hasCount; }
+
+  // @return value count minus true count; an internal error is reported
+  //         through LOG_ERROR when no count is available
+  uint64_t getFalseCount() const {
+    if (!_hasCount) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "False count is not defined.");
+    }
+    return valueCount - trueCount;
+  }
+
+  // @return the number of true values; an internal error is reported
+  //         through LOG_ERROR when no count is available
+  uint64_t getTrueCount() const {
+    if (!_hasCount) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "True count is not defined.");
+    }
+    return trueCount;
+  }
+
+  // Records one more boolean value.
+  void updateBoolean(bool value) {
+    _hasCount = true;
+    if (value) ++trueCount;
+  }
+
+  // Merges the common statistics through the base class and adds the other
+  // object's true count, if it has one.
+  // @param stats must be a BooleanColumnStatisticsImpl (asserted)
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const auto* rhs = dynamic_cast<const BooleanColumnStatisticsImpl*>(&stats);
+    assert(rhs != nullptr);
+    if (!rhs->hasCount()) return;
+    if (_hasCount) {
+      trueCount += rhs->trueCount;
+    } else {
+      _hasCount = true;
+      trueCount = rhs->trueCount;
+    }
+  }
+
+  // Writes the common statistics, then the bucket section; the true count
+  // is appended only when one is available.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::BucketStatistics* buckets = pb->mutable_bucketstatistics();
+    if (_hasCount) {
+      buckets->add_count(trueCount);
+    }
+  }
+
+  // Clears the base-class statistics, then the boolean-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  // Clears only the fields owned by this class.
+  void resetInternal() {
+    trueCount = 0;
+    _hasCount = false;
+  }
+};
+
+// Column statistics for date columns: tracks the smallest and largest
+// value seen as 32-bit integers.
+class DateColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasStats;
+  int32_t minimum;
+  int32_t maximum;
+
+ public:
+  DateColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+  // Builds the statistics from their serialized form (defined elsewhere).
+  DateColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+                           bool correctStats);
+  virtual ~DateColumnStatisticsImpl();
+
+  // Minimum and maximum are always available together.
+  bool hasMinimum() const { return _hasStats; }
+
+  bool hasMaximum() const { return _hasStats; }
+
+  // @return the smallest value; an internal error is reported through
+  //         LOG_ERROR when no value has been recorded
+  int32_t getMinimum() const {
+    if (!_hasStats) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+    return minimum;
+  }
+
+  // @return the largest value; an internal error is reported through
+  //         LOG_ERROR when no value has been recorded
+  int32_t getMaximum() const {
+    if (!_hasStats) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+    return maximum;
+  }
+
+  // Records one more value. The 64-bit argument is narrowed to the 32-bit
+  // storage on assignment.
+  void updateDate(int64_t value) {
+    if (!_hasStats) {
+      _hasStats = true;
+      minimum = maximum = value;
+      return;
+    }
+    if (value < minimum) {
+      minimum = value;
+    } else if (value > maximum) {
+      maximum = value;
+    }
+  }
+
+  // Merges the common statistics through the base class and widens the
+  // [minimum, maximum] range by the other object's range, if it has one.
+  // @param stats must be a DateColumnStatisticsImpl (asserted)
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const auto* rhs = dynamic_cast<const DateColumnStatisticsImpl*>(&stats);
+    assert(rhs != nullptr);
+    if (!rhs->hasMinimum()) return;
+    if (_hasStats) {
+      if (rhs->getMinimum() < minimum) minimum = rhs->getMinimum();
+      if (rhs->getMaximum() > maximum) maximum = rhs->getMaximum();
+    } else {
+      _hasStats = true;
+      minimum = rhs->getMinimum();
+      maximum = rhs->getMaximum();
+    }
+  }
+
+  // Writes the common statistics, then the date section; minimum and
+  // maximum are set only when values were recorded.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::DateStatistics* dateStats = pb->mutable_datestatistics();
+    if (!_hasStats) return;
+    dateStats->set_minimum(minimum);
+    dateStats->set_maximum(maximum);
+  }
+
+  // Clears the base-class statistics, then the date-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  // Clears only the fields owned by this class.
+  void resetInternal() {
+    maximum = 0;
+    minimum = 0;
+    _hasStats = false;
+  }
+};
+
+// Column statistics for decimal columns. Minimum, maximum and sum are
+// stored as strings and converted to Decimal on demand.
+class DecimalColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasMinimum;
+  bool _hasMaximum;
+  bool _hasSum;
+  std::string minimum;
+  std::string maximum;
+  std::string sum;
+
+ public:
+  DecimalColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+  // Builds the statistics from their serialized form (defined elsewhere).
+  DecimalColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+                              bool correctStats);
+  virtual ~DecimalColumnStatisticsImpl();
+
+  bool hasMinimum() const { return _hasMinimum; }
+
+  bool hasMaximum() const { return _hasMaximum; }
+
+  bool hasSum() const { return _hasSum; }
+
+  // The three Decimal getters below convert the stored string on every
+  // call and report an internal error through LOG_ERROR when the
+  // corresponding value is not available.
+  Decimal getMinimum() const {
+    if (!_hasMinimum) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+    return Decimal(minimum);
+  }
+
+  Decimal getMaximum() const {
+    if (!_hasMaximum) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+    return Decimal(maximum);
+  }
+
+  Decimal getSum() const {
+    if (!_hasSum) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Sum is not defined.");
+    }
+    return Decimal(sum);
+  }
+
+  // The *Str getters expose the stored strings directly. The returned
+  // pointer refers to this object's own storage.
+  const char* getMinimumStr() const {
+    if (!_hasMinimum) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+    return minimum.c_str();
+  }
+
+  const char* getMaximumStr() const {
+    if (!_hasMaximum) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+    return maximum.c_str();
+  }
+
+  const char* getSumStr() const {
+    if (!_hasSum) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Sum is not defined.");
+    }
+    return sum.c_str();
+  }
+
+  // Records one more value: widens the [minimum, maximum] range if needed
+  // and adds the value to the running sum.
+  void updateDecimal(const orc::Decimal& value) {
+    if (_hasMinimum) {
+      if (value < this->getMinimum()) {
+        minimum = value.toString();
+      } else if (value > this->getMaximum()) {
+        maximum = value.toString();
+      }
+    } else {
+      // First value: it is both the minimum and the maximum.
+      _hasMinimum = true;
+      _hasMaximum = true;
+      minimum = value.toString();
+      maximum = value.toString();
+    }
+    if (!_hasSum) {
+      _hasSum = true;
+      sum = value.toString();
+    } else {
+      updateSum(value);
+    }
+  }
+
+  // Merges the common statistics through the base class, then the range
+  // and the sum of the other object where it has them.
+  // @param stats must be a DecimalColumnStatisticsImpl (asserted)
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const auto* rhs = dynamic_cast<const DecimalColumnStatisticsImpl*>(&stats);
+    assert(rhs != nullptr);
+    if (rhs->hasMinimum()) {
+      if (_hasMinimum) {
+        if (rhs->getMinimum() < this->getMinimum()) {
+          minimum = rhs->getMinimum().toString();
+        }
+        if (rhs->getMaximum() > this->getMaximum()) {
+          maximum = rhs->getMaximum().toString();
+        }
+      } else {
+        _hasMinimum = true;
+        _hasMaximum = true;
+        minimum = rhs->getMinimum().toString();
+        maximum = rhs->getMaximum().toString();
+      }
+    }
+    if (rhs->hasSum()) {
+      if (!_hasSum) {
+        _hasSum = true;
+        sum = rhs->getSum().toString();
+      } else {
+        updateSum(rhs->getSum());
+      }
+    }
+  }
+
+  // Writes the common statistics, then the decimal section; each field is
+  // set only when the corresponding value is available.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::DecimalStatistics* decStats = pb->mutable_decimalstatistics();
+    if (_hasMinimum) {
+      decStats->set_minimum(minimum);
+      decStats->set_maximum(maximum);
+    }
+    if (_hasSum) {
+      decStats->set_sum(sum);
+    }
+  }
+
+  // Clears the base-class statistics, then the decimal-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  // Adds value to the running sum (defined elsewhere).
+  void updateSum(Decimal value);
+
+  // Clears only the fields owned by this class.
+  void resetInternal() {
+    _hasMinimum = false;
+    _hasMaximum = false;
+    _hasSum = false;
+    sum.clear();
+    maximum.clear();
+    minimum.clear();
+  }
+};
+
+// Column statistics for timestamp columns: tracks the smallest and largest
+// value seen as 64-bit integers.
+class TimestampColumnStatisticsImpl : public ColumnStatisticsImpl {
+ private:
+  bool _hasStats;
+  int64_t minimum;
+  int64_t maximum;
+
+ public:
+  TimestampColumnStatisticsImpl() : ColumnStatisticsImpl() { resetInternal(); }
+  // Builds the statistics from their serialized form (defined elsewhere).
+  TimestampColumnStatisticsImpl(const proto::ColumnStatistics& stats,
+                                bool correctStats);
+
+  virtual ~TimestampColumnStatisticsImpl();
+
+  // Minimum and maximum are always available together.
+  bool hasMinimum() const { return _hasStats; }
+
+  bool hasMaximum() const { return _hasStats; }
+
+  // @return the smallest value; an internal error is reported through
+  //         LOG_ERROR when no value has been recorded
+  int64_t getMinimum() const {
+    if (!_hasStats) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Minimum is not defined.");
+    }
+    return minimum;
+  }
+
+  // @return the largest value; an internal error is reported through
+  //         LOG_ERROR when no value has been recorded
+  int64_t getMaximum() const {
+    if (!_hasStats) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Maximum is not defined.");
+    }
+    return maximum;
+  }
+
+  // Records one more value, widening the [minimum, maximum] range.
+  void updateTimestamp(const int64_t val) {
+    if (!_hasStats) {
+      _hasStats = true;
+      minimum = val;
+      maximum = val;
+      return;
+    }
+    if (val < minimum) {
+      minimum = val;
+    } else if (val > maximum) {
+      maximum = val;
+    }
+  }
+
+  // Merges the common statistics through the base class and widens the
+  // range by the other object's range, if it has one.
+  // @param stats must be a TimestampColumnStatisticsImpl (asserted)
+  void merge(const ColumnStatistics& stats) override {
+    ColumnStatisticsImpl::merge(stats);
+    const auto* rhs =
+        dynamic_cast<const TimestampColumnStatisticsImpl*>(&stats);
+    assert(rhs != nullptr);
+    if (!rhs->hasMinimum()) return;
+    if (_hasStats) {
+      if (rhs->getMinimum() < minimum) minimum = rhs->getMinimum();
+      if (rhs->getMaximum() > maximum) maximum = rhs->getMaximum();
+    } else {
+      _hasStats = true;
+      minimum = rhs->getMinimum();
+      maximum = rhs->getMaximum();
+    }
+  }
+
+  // Writes the common statistics, then the timestamp section; minimum and
+  // maximum are set only when values were recorded.
+  void serialize(proto::ColumnStatistics* pb) override {
+    assert(pb != nullptr);
+    ColumnStatisticsImpl::serialize(pb);
+    proto::TimestampStatistics* tsStats = pb->mutable_timestampstatistics();
+    if (!_hasStats) return;
+    tsStats->set_minimum(minimum);
+    tsStats->set_maximum(maximum);
+  }
+
+  // Clears the base-class statistics, then the timestamp-specific state.
+  void reset() override {
+    ColumnStatisticsImpl::reset();
+    resetInternal();
+  }
+
+ private:
+  // Clears only the fields owned by this class.
+  void resetInternal() {
+    maximum = 0;
+    minimum = 0;
+    _hasStats = false;
+  }
+
+  // Three-way comparison of two timestamps: seconds first, nanoseconds as
+  // the tie-breaker.
+  // @return 1 if ts1 is later, -1 if it is earlier, 0 if both are equal
+  int8_t compare(dbcommon::Timestamp ts1, dbcommon::Timestamp ts2) {
+    const int64_t sec1 = ts1.second;
+    const int64_t nsec1 = ts1.nanosecond;
+    const int64_t sec2 = ts2.second;
+    const int64_t nsec2 = ts2.nanosecond;
+    if (sec1 != sec2) return sec1 > sec2 ? 1 : -1;
+    if (nsec1 != nsec2) return nsec1 > nsec2 ? 1 : -1;
+    return 0;
+  }
+};
+
+// StreamInformation backed by a proto::Stream entry. Kind, column and
+// length are copied from the entry; the offset is supplied by the caller.
+class StreamInformationImpl : public StreamInformation {
+ private:
+  StreamKind kind;
+  uint64_t column;
+  uint64_t offset;
+  uint64_t length;
+
+ public:
+  // @param _offset the offset to report for this stream
+  // @param stream the serialized stream entry to copy the fields from
+  StreamInformationImpl(uint64_t _offset, const proto::Stream& stream)
+      : kind(static_cast<StreamKind>(stream.kind())),
+        column(stream.column()),
+        offset(_offset),
+        length(stream.length()) {}
+
+  ~StreamInformationImpl() {}
+
+  StreamKind getKind() const override {
+    return kind;
+  }
+
+  uint64_t getColumnId() const override {
+    return column;
+  }
+
+  uint64_t getOffset() const override {
+    return offset;
+  }
+
+  uint64_t getLength() const override {
+    return length;
+  }
+};
+
+// StripeInformation implementation. The lengths and the row count are
+// given at construction; anything that needs the stripe footer calls
+// ensureStripeFooterLoaded() first.
+class StripeInformationImpl : public StripeInformation {
+ private:
+  uint64_t offset;
+  uint64_t indexLength;
+  uint64_t dataLength;
+  uint64_t footerLength;
+  uint64_t numRows;
+  InputStream* stream;
+  dbcommon::MemoryPool& memoryPool;
+  CompressionKind compression;
+  uint64_t blockSize;
+  // mutable so that the const accessors can fill it in on demand.
+  mutable std::unique_ptr<proto::StripeFooter> stripeFooter;
+
+  // Makes sure stripeFooter is populated (defined elsewhere).
+  void ensureStripeFooterLoaded() const;
+
+ public:
+  StripeInformationImpl(uint64_t _offset, uint64_t _indexLength,
+                        uint64_t _dataLength, uint64_t _footerLength,
+                        uint64_t _numRows, InputStream* _stream,
+                        dbcommon::MemoryPool& pool,  // NOLINT
+                        CompressionKind _compression, uint64_t _blockSize)
+      : offset(_offset),
+        indexLength(_indexLength),
+        dataLength(_dataLength),
+        footerLength(_footerLength),
+        numRows(_numRows),
+        stream(_stream),
+        memoryPool(pool),
+        compression(_compression),
+        blockSize(_blockSize) {}
+
+  virtual ~StripeInformationImpl() {}
+
+  uint64_t getOffset() const override { return offset; }
+
+  // The stripe length is the sum of its index, data and footer sections.
+  uint64_t getLength() const override {
+    const uint64_t total = indexLength + dataLength + footerLength;
+    return total;
+  }
+
+  uint64_t getIndexLength() const override { return indexLength; }
+
+  uint64_t getDataLength() const override { return dataLength; }
+
+  uint64_t getFooterLength() const override { return footerLength; }
+
+  uint64_t getNumberOfRows() const override { return numRows; }
+
+  // @return the number of stream entries in the stripe footer
+  uint64_t getNumberOfStreams() const override {
+    ensureStripeFooterLoaded();
+    const int streamCount = stripeFooter->streams_size();
+    return static_cast<uint64_t>(streamCount);
+  }
+
+  // Defined elsewhere.
+  std::unique_ptr<StreamInformation> getStreamInformation(
+      uint64_t streamId) const override;
+
+  // @param colId index into the stripe footer's column encodings; it is
+  //        not range-checked here
+  // @return the dictionary size recorded for that column
+  uint64_t getDictionarySize(uint64_t colId) const override {
+    ensureStripeFooterLoaded();
+    const int columnIndex = static_cast<int>(colId);
+    return static_cast<uint64_t>(
+        stripeFooter->columns(columnIndex).dictionarysize());
+  }
+
+  // @return the writer timezone stored in the stripe footer
+  const std::string& getWriterTimezone() const override {
+    ensureStripeFooterLoaded();
+    return stripeFooter->writertimezone();
+  }
+};
+
+// Converts one serialized column statistics message into a
+// univplan::ColumnStatistics object (defined elsewhere).
+// NOTE(review): a raw pointer is returned; presumably the caller takes
+// ownership - confirm against the definition.
+univplan::ColumnStatistics* convertColumnStatistics(
+ const proto::ColumnStatistics& s, bool correctStats);
+// Creates a statistics object for the given ORC type (defined elsewhere).
+std::unique_ptr<ColumnStatisticsImpl> createColumnStatistics(
+ const orc::Type* type);
+
+// Statistics for a set of columns, built either from the statistics of one
+// stripe or from the file footer. Each serialized entry is converted with
+// convertColumnStatistics() and kept in column order.
+class StatisticsImpl : public univplan::Statistics {
+ private:
+  // NOTE(review): raw pointers; presumably released by the out-of-line
+  // destructor - confirm against its definition.
+  std::list<univplan::ColumnStatistics*> colStats;
+
+  // Not copyable: copying would duplicate the raw pointers above.
+  StatisticsImpl(const StatisticsImpl&) = delete;
+  StatisticsImpl& operator=(const StatisticsImpl&) = delete;
+
+ public:
+  // Builds the statistics of a single stripe.
+  StatisticsImpl(const proto::StripeStatistics& stripeStats,
+                 bool correctStats) {
+    for (int i = 0; i < stripeStats.colstats_size(); i++) {
+      colStats.push_back(
+          convertColumnStatistics(stripeStats.colstats(i), correctStats));
+    }
+  }
+
+  // Builds the statistics recorded in the file footer.
+  StatisticsImpl(const proto::Footer& footer, bool correctStats) {
+    for (int i = 0; i < footer.statistics_size(); i++) {
+      colStats.push_back(
+          convertColumnStatistics(footer.statistics(i), correctStats));
+    }
+  }
+
+  // @param columnId zero-based position of the column
+  // @return the statistics of that column; an out-of-range id is reported
+  //         through LOG_ERROR instead of walking past the end of the list
+  const univplan::ColumnStatistics* getColumnStatistics(
+      uint32_t columnId) const override {
+    if (columnId >= colStats.size()) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Column id is out of range.");
+      // Only reached if LOG_ERROR returns.
+      return nullptr;
+    }
+    std::list<univplan::ColumnStatistics*>::const_iterator it =
+        colStats.begin();
+    std::advance(it, static_cast<int64_t>(columnId));
+    return *it;
+  }
+
+  virtual ~StatisticsImpl();
+
+  // @return the number of columns statistics are held for
+  uint32_t getNumberOfColumns() const override {
+    return static_cast<uint32_t>(colStats.size());
+  }
+};
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_ORC_PROTO_DEFINITION_H_
diff --git a/depends/storage/src/storage/format/orc/orc_proto.proto b/depends/storage/src/storage/format/orc/orc_proto.proto
new file mode 100644
index 0000000..0c8027f
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/orc_proto.proto
@@ -0,0 +1,277 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+syntax = "proto2";
+
+package orc.proto;
+
+option java_package = "org.apache.orc";
+
+// Integer statistics (ColumnStatistics.intStatistics): minimum, maximum and
+// sum, each optional.
+message IntegerStatistics {
+ optional sint64 minimum = 1;
+ optional sint64 maximum = 2;
+ optional sint64 sum = 3;
+}
+
+// Floating point statistics (ColumnStatistics.doubleStatistics): minimum,
+// maximum and sum, each optional.
+message DoubleStatistics {
+ optional double minimum = 1;
+ optional double maximum = 2;
+ optional double sum = 3;
+}
+
+// String statistics (ColumnStatistics.stringStatistics): minimum and
+// maximum string plus the total length.
+message StringStatistics {
+ optional string minimum = 1;
+ optional string maximum = 2;
+ // sum will store the total length of all strings in a stripe
+ optional sint64 sum = 3;
+}
+
+// Bucket statistics (ColumnStatistics.bucketStatistics): a packed list of
+// counts. The C++ boolean statistics read and write the number of true
+// values as the first count.
+message BucketStatistics {
+ repeated uint64 count = 1 [packed = true];
+}
+
+// Decimal statistics (ColumnStatistics.decimalStatistics): minimum, maximum
+// and sum, each carried as a string.
+message DecimalStatistics {
+ optional string minimum = 1;
+ optional string maximum = 2;
+ optional string sum = 3;
+}
+
+// Date statistics (ColumnStatistics.dateStatistics).
+message DateStatistics {
+ // min,max values saved as days since epoch
+ optional sint32 minimum = 1;
+ optional sint32 maximum = 2;
+}
+
+// Timestamp statistics (ColumnStatistics.timestampStatistics).
+message TimestampStatistics {
+ // min,max values saved as milliseconds since epoch
+ optional sint64 minimum = 1;
+ optional sint64 maximum = 2;
+ // NOTE(review): presumably the same values expressed in UTC, in the same
+ // unit - confirm against the ORC specification.
+ optional sint64 minimumUtc = 3;
+ optional sint64 maximumUtc = 4;
+}
+
+// Binary statistics (ColumnStatistics.binaryStatistics).
+message BinaryStatistics {
+ // sum will store the total binary blob length in a stripe
+ optional sint64 sum = 1;
+}
+
+// Statistics of one column: the number of values, a has-null flag and one
+// optional sub-message per supported kind of type-specific statistics.
+message ColumnStatistics {
+ optional uint64 numberOfValues = 1;
+ optional IntegerStatistics intStatistics = 2;
+ optional DoubleStatistics doubleStatistics = 3;
+ optional StringStatistics stringStatistics = 4;
+ optional BucketStatistics bucketStatistics = 5;
+ optional DecimalStatistics decimalStatistics = 6;
+ optional DateStatistics dateStatistics = 7;
+ optional BinaryStatistics binaryStatistics = 8;
+ optional TimestampStatistics timestampStatistics = 9;
+ optional bool hasNull = 10;
+}
+
+// One row index entry: a packed list of positions plus optional column
+// statistics.
+message RowIndexEntry {
+ repeated uint64 positions = 1 [packed = true];
+ optional ColumnStatistics statistics = 2;
+}
+
+// A row index: a sequence of RowIndexEntry records.
+message RowIndex {
+ repeated RowIndexEntry entry = 1;
+}
+
+// One bloom filter: the number of hash functions and the bit set.
+// NOTE(review): bitset and utf8bitset look like two alternative encodings
+// of the bit set (compare the BLOOM_FILTER and BLOOM_FILTER_UTF8 stream
+// kinds) - confirm against the ORC specification.
+message BloomFilter {
+ optional uint32 numHashFunctions = 1;
+ repeated fixed64 bitset = 2;
+ optional bytes utf8bitset = 3;
+}
+
+// A bloom filter index: a sequence of BloomFilter records.
+message BloomFilterIndex {
+ repeated BloomFilter bloomFilter = 1;
+}
+
+// Describes one stream: its kind, the column it belongs to and its length.
+message Stream {
+ // if you add new index stream kinds, you need to make sure to update
+ // StreamName to ensure it is added to the stripe in the right area
+ enum Kind {
+ PRESENT = 0;
+ DATA = 1;
+ LENGTH = 2;
+ DICTIONARY_DATA = 3;
+ DICTIONARY_COUNT = 4;
+ SECONDARY = 5;
+ ROW_INDEX = 6;
+ BLOOM_FILTER = 7;
+ BLOOM_FILTER_UTF8 = 8;
+ }
+ optional Kind kind = 1;
+ optional uint32 column = 2;
+ optional uint64 length = 3;
+}
+
+// Encoding of one column: the encoding kind, the dictionary size and the
+// encoding of the column's bloom filters.
+message ColumnEncoding {
+ enum Kind {
+ DIRECT = 0;
+ DICTIONARY = 1;
+ DIRECT_V2 = 2;
+ DICTIONARY_V2 = 3;
+ DIRECT_V0 = 4;
+ DICTIONARY_V0 = 5;
+ }
+ optional Kind kind = 1;
+ optional uint32 dictionarySize = 2;
+
+ // The encoding of the bloom filters for this column:
+ // 0 or missing = none or original
+ // 1 = ORC-135 (utc for timestamps)
+ optional uint32 bloomEncoding = 3;
+}
+
+// Footer of one stripe: its streams, its column encodings and the timezone
+// of the writer.
+message StripeFooter {
+ repeated Stream streams = 1;
+ repeated ColumnEncoding columns = 2;
+ optional string writerTimezone = 3;
+}
+
+// Describes one type: its kind, its subtypes and field names, and optional
+// length, precision and scale attributes.
+message Type {
+ enum Kind {
+ BOOLEAN = 0;
+ BYTE = 1;
+ SHORT = 2;
+ INT = 3;
+ LONG = 4;
+ FLOAT = 5;
+ DOUBLE = 6;
+ STRING = 7;
+ BINARY = 8;
+ TIMESTAMP = 9;
+ LIST = 10;
+ MAP = 11;
+ STRUCT = 12;
+ UNION = 13;
+ DECIMAL = 14;
+ DATE = 15;
+ VARCHAR = 16;
+ CHAR = 17;
+ TIME = 18;
+ }
+ optional Kind kind = 1;
+ // NOTE(review): presumably indexes into Footer.types - confirm.
+ repeated uint32 subtypes = 2 [packed = true];
+ repeated string fieldNames = 3;
+ // NOTE(review): presumably used by the CHAR/VARCHAR kinds - confirm.
+ optional uint32 maximumLength = 4;
+ // NOTE(review): presumably used by the DECIMAL kind - confirm.
+ optional uint32 precision = 5;
+ optional uint32 scale = 6;
+}
+
+// Location and size of one stripe: its offset, the lengths of its index,
+// data and footer sections, and its number of rows.
+message StripeInformation {
+ optional uint64 offset = 1;
+ optional uint64 indexLength = 2;
+ optional uint64 dataLength = 3;
+ optional uint64 footerLength = 4;
+ optional uint64 numberOfRows = 5;
+}
+
+// One user metadata entry: a name and a value given as raw bytes.
+message UserMetadataItem {
+ optional string name = 1;
+ optional bytes value = 2;
+}
+
+// Statistics of one stripe: a list of ColumnStatistics records.
+message StripeStatistics {
+ repeated ColumnStatistics colStats = 1;
+}
+
+// File metadata: a list of StripeStatistics records.
+message Metadata {
+ repeated StripeStatistics stripeStats = 1;
+}
+
+// File footer: header and content lengths, the stripes, the types, user
+// metadata, the row count, column statistics, the row index stride and the
+// code of the writer implementation.
+message Footer {
+ optional uint64 headerLength = 1;
+ optional uint64 contentLength = 2;
+ repeated StripeInformation stripes = 3;
+ repeated Type types = 4;
+ repeated UserMetadataItem metadata = 5;
+ optional uint64 numberOfRows = 6;
+ repeated ColumnStatistics statistics = 7;
+ optional uint32 rowIndexStride = 8;
+
+ // Each implementation that writes ORC files should register for a code
+ // 0 = ORC Java
+ // 1 = ORC C++
+ // 2 = Presto
+ // 3 = Scritchley Go from https://github.com/scritchley/orc
+ optional uint32 writer = 9;
+}
+
+// Compression codecs that PostScript.compression can name; NONE means no
+// compression.
+enum CompressionKind {
+ NONE = 0;
+ ZLIB = 1;
+ SNAPPY = 2;
+ LZO = 3;
+ LZ4 = 4;
+ ZSTD = 5;
+}
+
+// Serialized length must be less than 255 bytes
+// Postscript of the file: the footer and metadata lengths, the compression
+// kind and block size, the file format version, the writer version and the
+// magic string.
+message PostScript {
+ optional uint64 footerLength = 1;
+ optional CompressionKind compression = 2;
+ optional uint64 compressionBlockSize = 3;
+ // the version of the file format
+ // [0, 11] = Hive 0.11
+ // [0, 12] = Hive 0.12
+ repeated uint32 version = 4 [packed = true];
+ optional uint64 metadataLength = 5;
+
+ // The version of the writer that wrote the file. This number is
+ // updated when we make fixes or large changes to the writer so that
+ // readers can detect whether a given bug is present in the data.
+ //
+ // Only the Java ORC writer may use values under 6 (or missing) so that
+ // readers that predate ORC-202 treat the new writers correctly. Each
+ // writer should assign their own sequence of versions starting from 6.
+ //
+ // Version of the ORC Java writer:
+ // 0 = original
+ // 1 = HIVE-8732 fixed (fixed stripe/file maximum statistics &
+ // string statistics use utf8 for min/max)
+ // 2 = HIVE-4243 fixed (use real column names from Hive tables)
+ // 3 = HIVE-12055 fixed (vectorized writer implementation)
+ // 4 = HIVE-13083 fixed (decimals write present stream correctly)
+ // 5 = ORC-101 fixed (bloom filters use utf8 consistently)
+ // 6 = ORC-135 fixed (timestamp statistics use utc)
+ //
+ // Version of the ORC C++ writer:
+ // 6 = original
+ //
+ // Version of the Presto writer:
+ // 6 = original
+ //
+ // Version of the Scritchley Go writer:
+ // 6 = original
+ //
+ optional uint32 writerVersion = 6;
+
+ // Leave this last in the record
+ optional string magic = 8000;
+}
+
+// The contents of the file tail that must be serialized.
+// This gets serialized as part of OrcSplit, also used by footer cache.
+// Bundles the postscript and the footer with the file length and the
+// postscript length.
+message FileTail {
+ optional PostScript postscript = 1;
+ optional Footer footer = 2;
+ optional uint64 fileLength = 3;
+ optional uint64 postscriptLength = 4;
+}
diff --git a/depends/storage/src/storage/format/orc/output-stream.cc b/depends/storage/src/storage/format/orc/output-stream.cc
new file mode 100644
index 0000000..29026f9
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/output-stream.cc
@@ -0,0 +1,33 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <string>
+
+#include "storage/format/orc/output-stream.h"
+
+namespace orc {
+
+// Creates an output stream for the file at path on the given file system.
+// Only the stream object is created here; GeneralFileOutputStream opens the
+// file itself lazily.
+// @param fs the file system used for all later file operations
+// @param path the name of the file to write
+// @return the new stream, owned by the caller
+std::unique_ptr<OutputStream> writeFile(dbcommon::FileSystem *fs,
+                                        const std::string &path) {
+  // Returned directly: wrapping a local in std::move() on return would only
+  // prevent copy elision.
+  return std::unique_ptr<OutputStream>(new GeneralFileOutputStream(fs, path));
+}
+
+} // end of namespace orc
diff --git a/depends/storage/src/storage/format/orc/output-stream.h b/depends/storage/src/storage/format/orc/output-stream.h
new file mode 100644
index 0000000..5abc687
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/output-stream.h
@@ -0,0 +1,135 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_OUTPUT_STREAM_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_OUTPUT_STREAM_H_
+
+#include <cassert>
+#include <string>
+
+#include "dbcommon/filesystem/file-system.h"
+
+namespace orc {
+
+// Abstract sink for writing a file: sequential writes, zero padding and
+// position/length queries.
+class OutputStream {
+ public:
+ OutputStream() {}
+ virtual ~OutputStream() {}
+
+ // Write length bytes from the buffer to the file
+ // @param buf The output buffer
+ // @param length The number of bytes in the buffer to write
+ // @return Void
+ virtual void write(void* buf, uint64_t length) = 0;
+
+ // Get the name of the stream for error messages.
+ // @return The stream name
+ virtual const std::string& getName() const = 0;
+
+ // Get the natural size for writes.
+ // @return the number of bytes that should be written at once
+ virtual uint64_t getNaturalWriteSize() const = 0;
+
+ // Get the total length of the file in bytes.
+ // @return The length
+ virtual uint64_t getLength() const = 0;
+
+ // Get current file position
+ // @return Current file position
+ virtual uint64_t getPosition() const = 0;
+
+ // Pad the file with the given number of bytes
+ // @param size The number of bytes to pad
+ // @return Void
+ virtual void padding(uint64_t size) = 0;
+
+ // Close the stream
+ // @return Void
+ virtual void close() = 0;
+};
+
+// OutputStream that writes through a dbcommon::FileSystem. The file is not
+// opened by the constructor; it is opened (O_WRONLY) the first time data is
+// written or padded.
+class GeneralFileOutputStream : public OutputStream {
+ public:
+  // @param fs the file system used for every file operation; not owned
+  // @param fileName the name of the file to write
+  GeneralFileOutputStream(dbcommon::FileSystem* fs, std::string fileName)
+      : fileName(fileName), fs(fs) {
+    // Initializers are listed in member declaration order. The file handle
+    // starts out empty.
+    // NOTE(review): totalLength is unsigned, so -1 wraps to the largest
+    // uint64_t value and getLength() reports that until the file has been
+    // opened - confirm this sentinel is intended.
+    totalLength = -1;
+  }
+
+  virtual ~GeneralFileOutputStream() {}
+
+  // @return the file length sampled when the file was opened; it is not
+  //         updated by later writes
+  uint64_t getLength() const override { return totalLength; }
+
+  uint64_t getNaturalWriteSize() const override { return 128 * 1024; }
+
+  // Writes length bytes from buf, opening the file first if needed.
+  void write(void* buf, uint64_t length) override {
+    assert(buf != nullptr);
+
+    ensureOpen();
+    fs->write(file.get(), buf, length);
+  }
+
+  const std::string& getName() const override { return fileName; }
+
+  // @return the current position as reported by the file system
+  // NOTE(review): the file handle is still empty before the first write or
+  // padding call - callers must not ask for the position before then.
+  uint64_t getPosition() const override { return fs->tell(file.get()); }
+
+  // Appends size zero bytes, opening the file first if needed.
+  void padding(uint64_t size) override {
+    static char buffer[1024] = {0};
+
+    if (size == 0) return;
+
+    ensureOpen();
+    // we use the first byte of the padding area as FASTBlock type.
+    // so we set the padding area to 0
+    // The remaining byte count stays 64-bit so large sizes are not
+    // truncated.
+    uint64_t remaining = size;
+    while (remaining > 0) {
+      const uint64_t chunk =
+          remaining < sizeof(buffer) ? remaining : sizeof(buffer);
+      fs->write(file.get(), buffer, chunk);
+      remaining -= chunk;
+    }
+  }
+
+  // Closes the file if it has been opened.
+  void close() override {
+    if (file) {
+      file->close();
+    }
+  }
+
+  // @return true once the file has been opened
+  bool fileopen() { return static_cast<bool>(file); }
+
+ private:
+  // Opens the file on first use and samples its length at that moment.
+  void ensureOpen() {
+    if (file) return;
+    file = fs->open(fileName.c_str(), O_WRONLY);
+    totalLength = fs->getFileLength(fileName.c_str());
+  }
+
+  std::string fileName;
+  std::unique_ptr<dbcommon::File> file;
+  uint64_t totalLength = 0;
+  dbcommon::FileSystem* fs = nullptr;
+};
+
+// Creates an OutputStream for the file at path on the given file system.
+// @param fs the file system to write through
+// @param path the name of the file to write
+// @return the new stream, owned by the caller
+std::unique_ptr<OutputStream> writeFile(dbcommon::FileSystem* fs,
+ const std::string& path);
+
+} // end of namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_OUTPUT_STREAM_H_
diff --git a/depends/storage/src/storage/format/orc/reader.cc b/depends/storage/src/storage/format/orc/reader.cc
new file mode 100644
index 0000000..e88e62a
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/reader.cc
@@ -0,0 +1,2424 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <google/protobuf/io/coded_stream.h>
+
+#include <math.h>
+#include <algorithm>
+#include <iostream>
+#include <iterator>
+#include <limits>
+#include <list>
+#include <map>
+#include <memory>
+#include <set>
+#include <sstream>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "storage/format/orc/byte-rle.h"
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/input-stream.h"
+#include "storage/format/orc/int128.h"
+#include "storage/format/orc/orc-predicates.h"
+#include "storage/format/orc/reader.h"
+#include "storage/format/orc/rle.h"
+#include "storage/format/orc/type-impl.h"
+
+namespace orc {
+
+// How the caller chose the columns to read; stored in
+// ReaderOptionsPrivate::selection.
+enum ColumnSelection {
+ // Default set by the ReaderOptionsPrivate constructor.
+ ColumnSelection_NONE = 0,
+ // Set by ReaderOptions::include(const std::list<std::string>&).
+ ColumnSelection_NAMES = 1,
+ // Set by ReaderOptions::include(const std::list<uint64_t>&).
+ ColumnSelection_FIELD_IDS = 2,
+ // Set by ReaderOptions::includeTypes().
+ ColumnSelection_TYPE_IDS = 3
+};
+
+// Plain state holder behind ReaderOptions (held through privateBits).
+struct ReaderOptionsPrivate {
+  ColumnSelection selection;
+  std::list<uint64_t> includedColumnIndexes;
+  std::list<std::string> includedColumnNames;
+  uint64_t dataStart;
+  uint64_t dataLength;
+  uint64_t tailLocation;
+  bool throwOnHive11DecimalOverflow;
+  int32_t forcedScaleOnHive11Decimal;
+  std::ostream* errorStream;
+  dbcommon::MemoryPool* memoryPool;
+  std::string serializedTail;
+  const univplan::UnivPlanExprPolyList* predicateExprs;
+  const dbcommon::TupleDesc* td;
+  bool readStatisticsOnly;
+
+  // Defaults: no explicit column selection, the whole byte range, errors to
+  // std::cerr, the default memory pool, no predicates and no tuple desc.
+  ReaderOptionsPrivate()
+      : selection(ColumnSelection_NONE),
+        dataStart(0),
+        dataLength(std::numeric_limits<uint64_t>::max()),
+        tailLocation(std::numeric_limits<uint64_t>::max()),
+        throwOnHive11DecimalOverflow(true),
+        forcedScaleOnHive11Decimal(6),
+        errorStream(&std::cerr),
+        memoryPool(dbcommon::getDefaultPool()),
+        predicateExprs(nullptr),
+        td(nullptr),
+        readStatisticsOnly(false) {}
+};
+
+// Default options: a freshly constructed ReaderOptionsPrivate.
+ReaderOptions::ReaderOptions() : privateBits(new ReaderOptionsPrivate()) {}
+
+// Copy constructor: clones the private state of rhs.
+ReaderOptions::ReaderOptions(const ReaderOptions& rhs)
+    : privateBits(new ReaderOptionsPrivate(*rhs.privateBits)) {}
+
+// Takes over the private state of rhs by swapping pointers. privateBits is
+// not initialized explicitly here, so rhs receives whatever this object held
+// by default.
+// NOTE(review): that is presumably an empty pointer, leaving rhs unusable
+// afterwards - confirm against the declaration of privateBits.
+ReaderOptions::ReaderOptions(ReaderOptions& rhs) {
+  privateBits.swap(rhs.privateBits);
+}
+
+// Copy assignment: replaces the private state with a clone of rhs's state.
+// Self-assignment is a no-op.
+ReaderOptions& ReaderOptions::operator=(const ReaderOptions& rhs) {
+  if (this == &rhs) {
+    return *this;
+  }
+  privateBits.reset(new ReaderOptionsPrivate(*rhs.privateBits));
+  return *this;
+}
+
+// Nothing to do explicitly; privateBits cleans up after itself.
+ReaderOptions::~ReaderOptions() {}
+
+// Select columns by field id. Any previously set name selection is cleared.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::include(const std::list<uint64_t>& include) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.includedColumnNames.clear();
+  state.includedColumnIndexes.assign(include.begin(), include.end());
+  state.selection = ColumnSelection_FIELD_IDS;
+  return *this;
+}
+
+// Select columns by name. Any previously set index selection is cleared.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::include(const std::list<std::string>& include) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.includedColumnIndexes.clear();
+  state.includedColumnNames.assign(include.begin(), include.end());
+  state.selection = ColumnSelection_NAMES;
+  return *this;
+}
+
+// Select columns by type id. The ids share includedColumnIndexes with the
+// field-id selection; any name selection is cleared.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::includeTypes(const std::list<uint64_t>& types) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.includedColumnNames.clear();
+  state.includedColumnIndexes.assign(types.begin(), types.end());
+  state.selection = ColumnSelection_TYPE_IDS;
+  return *this;
+}
+
+// Restrict reading to the byte range [offset, offset + length).
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::range(uint64_t offset, uint64_t length) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.dataLength = length;
+  state.dataStart = offset;
+  return *this;
+}
+
+// Record the tail location.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::setTailLocation(uint64_t offset) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.tailLocation = offset;
+  return *this;
+}
+
+// Store a serialized file tail.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::setSerializedFileTail(const std::string& value) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.serializedTail = value;
+  return *this;
+}
+
+// @return The memory pool recorded in the options
+dbcommon::MemoryPool* ReaderOptions::getMemoryPool() const {
+  dbcommon::MemoryPool* pool = privateBits->memoryPool;
+  return pool;
+}
+
+// @return True when the columns were selected by field id
+bool ReaderOptions::getIndexesSet() const {
+  return ColumnSelection_FIELD_IDS == privateBits->selection;
+}
+
+// @return True when the columns were selected by type id
+bool ReaderOptions::getTypeIdsSet() const {
+  return ColumnSelection_TYPE_IDS == privateBits->selection;
+}
+
+// @return The selected ids (field ids or type ids, depending on which
+//         selection call was made last)
+const std::list<uint64_t>& ReaderOptions::getInclude() const {
+  const ReaderOptionsPrivate& state = *privateBits;
+  return state.includedColumnIndexes;
+}
+
+// @return True when the columns were selected by name
+bool ReaderOptions::getNamesSet() const {
+  return ColumnSelection_NAMES == privateBits->selection;
+}
+
+// @return The selected column names
+const std::list<std::string>& ReaderOptions::getIncludeNames() const {
+  const ReaderOptionsPrivate& state = *privateBits;
+  return state.includedColumnNames;
+}
+
+// @return Start of the byte range to read (dataStart)
+uint64_t ReaderOptions::getOffset() const {
+  return privateBits->dataStart;
+}
+
+// @return Length of the byte range to read (dataLength)
+uint64_t ReaderOptions::getLength() const {
+  return privateBits->dataLength;
+}
+
+// @return The recorded tail location
+uint64_t ReaderOptions::getTailLocation() const {
+  const uint64_t location = privateBits->tailLocation;
+  return location;
+}
+
+// Set the throwOnHive11DecimalOverflow flag.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::throwOnHive11DecimalOverflow(bool shouldThrow) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.throwOnHive11DecimalOverflow = shouldThrow;
+  return *this;
+}
+
+// @return The throwOnHive11DecimalOverflow flag
+bool ReaderOptions::getThrowOnHive11DecimalOverflow() const {
+  const bool shouldThrow = privateBits->throwOnHive11DecimalOverflow;
+  return shouldThrow;
+}
+
+// Set the forcedScaleOnHive11Decimal value.
+// @return This object, for chaining
+ReaderOptions& ReaderOptions::forcedScaleOnHive11Decimal(int32_t forcedScale) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.forcedScaleOnHive11Decimal = forcedScale;
+  return *this;
+}
+
+// @return The forcedScaleOnHive11Decimal value
+int32_t ReaderOptions::getForcedScaleOnHive11Decimal() const {
+  const int32_t scale = privateBits->forcedScaleOnHive11Decimal;
+  return scale;
+}
+
+ReaderOptions& ReaderOptions::setErrorStream(std::ostream& stream) {
+ privateBits->errorStream = &stream;
+ return *this;
+}
+
+// @return The stream that receives warnings
+std::ostream* ReaderOptions::getErrorStream() const {
+  std::ostream* target = privateBits->errorStream;
+  return target;
+}
+
+// @return A copy of the stored serialized file tail
+std::string ReaderOptions::getSerializedFileTail() const {
+  const ReaderOptionsPrivate& state = *privateBits;
+  return state.serializedTail;
+}
+
+// Store the predicate expression list. Only the pointer is kept.
+void ReaderOptions::setPredicateExprs(
+    const univplan::UnivPlanExprPolyList* predicateExprs) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.predicateExprs = predicateExprs;
+}
+
+// @return The stored predicate expression list; nullptr unless one was set
+const univplan::UnivPlanExprPolyList* ReaderOptions::getPredicateExprs() const {
+  const ReaderOptionsPrivate& state = *privateBits;
+  return state.predicateExprs;
+}
+
+// Store the tuple descriptor. Only the pointer is kept.
+void ReaderOptions::setTupleDesc(const dbcommon::TupleDesc* td) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.td = td;
+}
+
+// @return The stored tuple descriptor; nullptr unless one was set
+const dbcommon::TupleDesc* ReaderOptions::getTupleDesc() const {
+  const ReaderOptionsPrivate& state = *privateBits;
+  return state.td;
+}
+
+// Set the statistics-only flag read back by readStatsOnly().
+void ReaderOptions::setReadStatsOnlyFlag(bool readStatsOnly) {
+  ReaderOptionsPrivate& state = *privateBits;
+  state.readStatisticsOnly = readStatsOnly;
+}
+
+// @return The statistics-only flag
+bool ReaderOptions::readStatsOnly() const {
+  const bool statsOnly = privateBits->readStatisticsOnly;
+  return statsOnly;
+}
+
+// Out-of-line destructor with nothing to release.
+Reader::~Reader() {}
+
+// 16 KiB. ReaderImpl::getMemoryUse() adds it to the footer length when
+// estimating the memory needed to read the footer.
+static const uint64_t DIRECTORY_SIZE_GUESS = 16 * 1024;
+
+// @param ps The file's postscript
+// @return The compression block size from the postscript, or 256 KiB when
+//         the postscript does not carry one
+uint64_t getCompressionBlockSize(const proto::PostScript& ps) {
+  if (!ps.has_compressionblocksize()) {
+    return 256 * 1024;
+  }
+  return ps.compressionblocksize();
+}
+
+// Map the compression field of the postscript to CompressionKind.
+// @param ps The file's postscript
+// @return The compression kind stored in the postscript
+// Reports ERRCODE_INTERNAL_ERROR when the postscript has no compression
+// field.
+CompressionKind convertCompressionKind(const proto::PostScript& ps) {
+  if (!ps.has_compression()) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown compression type");
+  }
+  // Placing the return last gives every control path a return value; the
+  // original ended on the error branch, which only works if LOG_ERROR never
+  // returns (presumably it throws - confirm).
+  return static_cast<CompressionKind>(ps.compression());
+}
+
+// Fill buffer with exactly bufferSize bytes taken from stream.
+// @param buffer Destination, at least bufferSize bytes long
+// @param bufferSize Number of bytes to read
+// @param stream Source stream
+// Reports ERRCODE_INTERNAL_ERROR when the stream ends early.
+void readFully(char* buffer, int64_t bufferSize, SeekableInputStream* stream) {
+  int64_t posn = 0;
+  while (posn < bufferSize) {
+    const void* chunk = nullptr;
+    int length = 0;
+    if (!stream->Next(&chunk, &length)) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "bad read in readFully");
+    }
+    // Next() may hand back more than is still wanted; copying the whole
+    // chunk would write past the end of buffer.
+    const int64_t wanted =
+        std::min<int64_t>(static_cast<int64_t>(length), bufferSize - posn);
+    memcpy(buffer + posn, chunk, static_cast<size_t>(wanted));
+    if (wanted < length) {
+      // Return the unread tail of the chunk to the stream.
+      stream->BackUp(static_cast<int>(length - wanted));
+    }
+    posn += wanted;
+  }
+}
+
+// Build a reader from an already parsed postscript and footer.
+// Walks the stripes listed in the footer, records the first row number of
+// each, determines which stripes start inside the byte range requested by
+// opts, and (unless only statistics are read) loads the footers of those
+// stripes. When the file has at least one stripe it also builds the schema
+// and the column selection.
+// @param input The file stream; ownership is taken
+// @param opts Reader options (copied)
+// @param _postscript Parsed postscript; ownership is taken
+// @param _footer Parsed footer; ownership is taken
+// @param _fileLength Length of the file in bytes
+// @param _postscriptLength Length of the postscript in bytes
+ReaderImpl::ReaderImpl(std::unique_ptr<InputStream> input,
+                       const ReaderOptions& opts,
+                       std::unique_ptr<proto::PostScript> _postscript,
+                       std::unique_ptr<proto::Footer> _footer,
+                       uint64_t _fileLength, uint64_t _postscriptLength)
+    : localTimezone(getLocalTimezone()),
+      stream(std::move(input)),
+      options(opts),
+      fileLength(_fileLength),
+      postscriptLength(_postscriptLength),
+      postscript(std::move(_postscript)),
+      memoryPool(*opts.getMemoryPool()),
+      blockSize(getCompressionBlockSize(*postscript)),
+      compression(convertCompressionKind(*postscript)),
+      footer(std::move(_footer)),
+      firstRowOfStripe(memoryPool, 0) {
+  isMetadataLoaded = false;
+  checkOrcVersion();
+  numberOfStripes = static_cast<uint64_t>(footer->stripes_size());
+  // "One past the last stripe" until an in-range stripe is found.
+  currentStripe = static_cast<uint64_t>(footer->stripes_size());
+  lastStripe = 0;
+  currentRowInStripe = 0;
+  uint64_t rowTotal = 0;
+
+  firstRowOfStripe.resize(static_cast<uint64_t>(footer->stripes_size()));
+  for (size_t i = 0; i < static_cast<size_t>(footer->stripes_size()); ++i) {
+    firstRowOfStripe[i] = rowTotal;
+    // Reference instead of a per-iteration copy of the message.
+    const proto::StripeInformation& stripeInfo =
+        footer->stripes(static_cast<int>(i));
+    rowTotal += stripeInfo.numberofrows();
+    // Expressed as a subtraction so that the test cannot wrap around when
+    // the length is the "unbounded" default (uint64 max) and the offset is
+    // non-zero, which made offset + length overflow.
+    const uint64_t stripeOffset = stripeInfo.offset();
+    const bool isStripeInRange =
+        stripeOffset >= opts.getOffset() &&
+        stripeOffset - opts.getOffset() < opts.getLength();
+    if (isStripeInRange) {
+      if (i < currentStripe) {
+        currentStripe = i;
+      }
+      if (i >= lastStripe) {
+        lastStripe = i + 1;
+      }
+      // read all stripe footer in the range
+      if (!options.readStatsOnly())
+        stripeFooters.push_back(getStripeFooter(stripeInfo));
+    }
+  }
+  firstStripe = currentStripe;
+
+  if (currentStripe == 0) {
+    previousRow = (std::numeric_limits<uint64_t>::max)();
+  } else if (currentStripe == static_cast<uint64_t>(footer->stripes_size())) {
+    // No stripe falls inside the requested range.
+    previousRow = footer->numberofrows();
+  } else {
+    previousRow = firstRowOfStripe[firstStripe] - 1;
+  }
+  if (numberOfStripes) {
+    schema = convertType(footer->types(0), *footer);
+    std::vector<std::string> columns;
+    buildTypeNameIdMap(schema.get(), columns);
+    updateSelected();
+  }
+}
+
+// Recompute selectedColumns from the reader options.
+// Field-id and name selections apply only when the root type is a STRUCT;
+// a type-id selection applies to any root; with no applicable selection
+// every column is selected. Parents of selected columns and column 0 are
+// always selected.
+void ReaderImpl::updateSelected() {
+  selectedColumns.assign(static_cast<size_t>(footer->types_size()), false);
+  const bool rootIsStruct = schema->getKind() == STRUCT;
+  if (rootIsStruct && options.getIndexesSet()) {
+    for (const uint64_t& fieldId : options.getInclude()) {
+      updateSelectedByFieldId(fieldId);
+    }
+  } else if (rootIsStruct && options.getNamesSet()) {
+    for (const std::string& fieldName : options.getIncludeNames()) {
+      updateSelectedByName(fieldName);
+    }
+  } else if (options.getTypeIdsSet()) {
+    for (const uint64_t& typeId : options.getInclude()) {
+      updateSelectedByTypeId(typeId);
+    }
+  } else {
+    // default is to select all columns
+    std::fill(selectedColumns.begin(), selectedColumns.end(), true);
+  }
+  selectParents(*schema);
+  selectedColumns[0] = true;  // column 0 is selected by default
+}
+
+// Pack the postscript, footer, file length and postscript length into a
+// proto::FileTail and serialize it.
+// @return The serialized FileTail
+std::string ReaderImpl::getSerializedFileTail() const {
+  proto::FileTail tail;
+  tail.mutable_postscript()->CopyFrom(*postscript);
+  tail.mutable_footer()->CopyFrom(*footer);
+  tail.set_filelength(fileLength);
+  tail.set_postscriptlength(postscriptLength);
+
+  std::string serialized;
+  const bool ok = tail.SerializeToString(&serialized);
+  if (!ok) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failed to serialize file tail");
+  }
+  return serialized;
+}
+
+// Walk the type tree rooted at type and fill two maps:
+//   idTypeMap: column id -> Type*
+//   nameIdMap: dotted field path -> column id (children of STRUCTs only)
+// @param type Root of the (sub)tree to visit
+// @param columns Field names on the path from the root to type; used as a
+//                stack and left unchanged on return
+void ReaderImpl::buildTypeNameIdMap(const Type* type,
+                                    std::vector<std::string>& columns) {
+  idTypeMap[type->getColumnId()] = type;
+
+  const bool isStruct = orc::STRUCT == type->getKind();
+  for (size_t i = 0; i < type->getSubtypeCount(); ++i) {
+    if (!isStruct) {
+      // other non-primitive type: children carry no field name
+      buildTypeNameIdMap(type->getSubtype(i), columns);
+      continue;
+    }
+    columns.push_back(type->getFieldName(i));
+    nameIdMap[toDotColumnPath(columns)] = type->getSubtype(i)->getColumnId();
+    buildTypeNameIdMap(type->getSubtype(i), columns);
+    columns.pop_back();
+  }
+}
+
+// Join the given names with '.' (e.g. {"a", "b"} -> "a.b").
+// @param columns Path components
+// @return The dotted path; empty when columns is empty
+std::string ReaderImpl::toDotColumnPath(
+    const std::vector<std::string>& columns) {
+  std::string columnPath;
+  for (size_t i = 0; i < columns.size(); ++i) {
+    if (i != 0) {
+      columnPath += ".";
+    }
+    columnPath += columns[i];
+  }
+  return columnPath;
+}
+
+// @return The options this reader was created with
+const ReaderOptions& ReaderImpl::getReaderOptions() const {
+  return options;
+}
+
+// @return The compression kind taken from the postscript
+CompressionKind ReaderImpl::getCompression() const {
+  return compression;
+}
+
+// @return The compression block size in bytes
+uint64_t ReaderImpl::getCompressionSize() const {
+  return blockSize;
+}
+
+// @return The number of stripes listed in the footer
+uint64_t ReaderImpl::getNumberOfStripes() const {
+  return numberOfStripes;
+}
+
+// Loads the metadata section on first use.
+// @return The number of stripe statistics entries, or 0 when the file has
+//         no metadata section
+uint64_t ReaderImpl::getNumberOfStripeStatistics() const {
+  if (!isMetadataLoaded) {
+    readMetadata();
+  }
+  if (metadata.get() == nullptr) {
+    return 0;
+  }
+  return static_cast<uint64_t>(metadata->stripestats_size());
+}
+
+// Describe one stripe of the file.
+// @param stripeIndex Zero-based stripe index; must be less than
+//                    getNumberOfStripes()
+// @return A StripeInformation built from the footer entry of that stripe
+// Reports ERRCODE_INTERNAL_ERROR when stripeIndex is out of range.
+std::unique_ptr<StripeInformation> ReaderImpl::getStripe(
+    uint64_t stripeIndex) const {
+  // ">=": valid indexes are 0 .. getNumberOfStripes() - 1. The previous ">"
+  // let stripeIndex == getNumberOfStripes() through to footer->stripes().
+  if (stripeIndex >= getNumberOfStripes()) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "stripe index out of range");
+  }
+  const proto::StripeInformation& stripeInfo =
+      footer->stripes(static_cast<int>(stripeIndex));
+
+  return std::unique_ptr<StripeInformation>(new StripeInformationImpl(
+      stripeInfo.offset(), stripeInfo.indexlength(), stripeInfo.datalength(),
+      stripeInfo.footerlength(), stripeInfo.numberofrows(), stream.get(),
+      memoryPool, compression, blockSize));
+}
+
+// @return The version numbers of the postscript joined with '.'
+//         (e.g. "0.12"); empty when the postscript lists none
+std::string ReaderImpl::getFormatVersion() const {
+  std::stringstream text;
+  const int parts = postscript->version_size();
+  for (int part = 0; part < parts; ++part) {
+    if (part > 0) {
+      text << ".";
+    }
+    text << postscript->version(part);
+  }
+  return text.str();
+}
+
+// @return The row count recorded in the footer
+uint64_t ReaderImpl::getNumberOfRows() const {
+  return footer->numberofrows();
+}
+
+// @return The writer version from the postscript, or WriterVersion_ORIGINAL
+//         when the postscript does not carry one
+WriterVersion ReaderImpl::getWriterVersion() const {
+  return postscript->has_writerversion()
+             ? static_cast<WriterVersion>(postscript->writerversion())
+             : WriterVersion_ORIGINAL;
+}
+
+// @return The content length recorded in the footer
+uint64_t ReaderImpl::getContentLength() const {
+  const uint64_t length = footer->contentlength();
+  return length;
+}
+
+// @return The metadata length recorded in the postscript
+uint64_t ReaderImpl::getStripeStatisticsLength() const {
+  const uint64_t length = postscript->metadatalength();
+  return length;
+}
+
+// @return The footer length recorded in the postscript
+uint64_t ReaderImpl::getFileFooterLength() const {
+  const uint64_t length = postscript->footerlength();
+  return length;
+}
+
+// @return The postscript length given at construction
+uint64_t ReaderImpl::getFilePostscriptLength() const { return postscriptLength; }
+
+// @return The file length given at construction
+uint64_t ReaderImpl::getFileLength() const {
+  return fileLength;
+}
+
+// @return The row index stride recorded in the footer
+uint64_t ReaderImpl::getRowIndexStride() const {
+  const uint64_t stride = footer->rowindexstride();
+  return stride;
+}
+
+// @return The name reported by the underlying input stream
+const std::string& ReaderImpl::getStreamName() const {
+  const std::string& name = stream->getName();
+  return name;
+}
+
+// @return The names of all user metadata entries of the footer, in footer
+//         order
+std::list<std::string> ReaderImpl::getMetadataKeys() const {
+  std::list<std::string> keys;
+  const int entryCount = footer->metadata_size();
+  for (int entry = 0; entry < entryCount; ++entry) {
+    keys.push_back(footer->metadata(entry).name());
+  }
+  return keys;
+}
+
+// Look up a user metadata entry of the footer by name.
+// @param key Entry name
+// @return The value of the first entry whose name equals key
+// Reports ERRCODE_INTERNAL_ERROR when no entry matches.
+std::string ReaderImpl::getMetadataValue(const std::string& key) const {
+  for (int entry = 0; entry < footer->metadata_size(); ++entry) {
+    if (footer->metadata(entry).name() != key) {
+      continue;
+    }
+    return footer->metadata(entry).value();
+  }
+  LOG_ERROR(ERRCODE_INTERNAL_ERROR, "key not found");
+}
+
+// @param key Entry name
+// @return True when the footer has a user metadata entry named key
+bool ReaderImpl::hasMetadataValue(const std::string& key) const {
+  bool found = false;
+  for (int entry = 0; !found && entry < footer->metadata_size(); ++entry) {
+    found = footer->metadata(entry).name() == key;
+  }
+  return found;
+}
+
+// @return A copy of the per-column selection flags, indexed by column id
+const std::vector<bool> ReaderImpl::getSelectedColumns() const {
+  std::vector<bool> flags(selectedColumns);
+  return flags;
+}
+
+// @return The file schema. The constructor only builds it when the file has
+//         at least one stripe.
+const Type& ReaderImpl::getType() const {
+  return *schema;
+}
+
+// @return The schema restricted to the selected columns; built on first
+//         call and cached in selectedSchema
+const Type& ReaderImpl::getSelectedType() const {
+  if (!selectedSchema) {
+    selectedSchema = buildSelectedType(schema.get(), selectedColumns);
+  }
+  return *selectedSchema;
+}
+
+// @return The value of previousRow, which next() sets to the number of the
+//         first row of the batch it returned
+uint64_t ReaderImpl::getRowNumber() const {
+  return previousRow;
+}
+
+// @return File-level statistics built from the footer
+std::unique_ptr<univplan::Statistics> ReaderImpl::getStatistics() const {
+  std::unique_ptr<univplan::Statistics> fileStats(
+      new StatisticsImpl(*footer, hasCorrectStatistics()));
+  return fileStats;
+}
+
+// @param index Column index into the footer's statistics list
+// @return File-level statistics of that column
+// Reports ERRCODE_INTERNAL_ERROR when index is out of range.
+std::unique_ptr<univplan::ColumnStatistics> ReaderImpl::getColumnStatistics(
+    uint32_t index) const {
+  const uint64_t available = static_cast<uint64_t>(footer->statistics_size());
+  if (!(index < available)) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "column index out of range");
+  }
+  proto::ColumnStatistics col = footer->statistics(static_cast<int32_t>(index));
+  std::unique_ptr<univplan::ColumnStatistics> converted(
+      convertColumnStatistics(col, hasCorrectStatistics()));
+  return converted;
+}
+
+// Load the metadata section (stripe statistics) of the file.
+// The section is located right before the footer: its start is computed
+// from the file length minus the metadata, footer and postscript lengths
+// and one more byte. When the metadata length is 0 nothing is read and
+// metadata stays unset. Either way isMetadataLoaded becomes true.
+void ReaderImpl::readMetadata() const {
+  const uint64_t metadataSize = postscript->metadatalength();
+  if (metadataSize != 0) {
+    const uint64_t metadataStart = fileLength - metadataSize -
+                                   postscript->footerlength() -
+                                   postscriptLength - 1;
+    std::unique_ptr<SeekableInputStream> rawStream(new SeekableFileInputStream(
+        stream.get(), metadataStart, metadataSize, memoryPool));
+    std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+        compression, std::move(rawStream), blockSize, memoryPool);
+    metadata.reset(new proto::Metadata());
+    const bool parsed = metadata->ParseFromZeroCopyStream(pbStream.get());
+    if (!parsed) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failed to parse the metadata");
+    }
+  }
+  isMetadataLoaded = true;
+}
+
+// Statistics of one stripe, taken from the metadata section (loaded on
+// first use).
+// @param stripeIndex Zero-based index into the stripe statistics list
+// @return The statistics of that stripe, or nullptr when the file has no
+//         metadata section
+// Reports ERRCODE_INTERNAL_ERROR when stripeIndex is out of range.
+std::unique_ptr<univplan::Statistics> ReaderImpl::getStripeStatistics(
+    uint64_t stripeIndex) const {
+  if (!isMetadataLoaded) {
+    readMetadata();
+  }
+  if (metadata.get() == nullptr) {
+    return nullptr;
+  }
+  // Indexing a repeated protobuf field out of range is not checked in
+  // release builds, so validate here, as getStripe() and
+  // getColumnStatistics() do for their indexes.
+  if (stripeIndex >= static_cast<uint64_t>(metadata->stripestats_size())) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "stripe index out of range");
+  }
+  return std::unique_ptr<univplan::Statistics>(
+      new StatisticsImpl(metadata->stripestats(static_cast<int>(stripeIndex)),
+                         hasCorrectStatistics()));
+}
+
+// Not implemented: always reports ERRCODE_FEATURE_NOT_SUPPORTED.
+// @param rowNumber Ignored
+void ReaderImpl::seekToRow(uint64_t rowNumber) {
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "ReaderImpl::seekToRow not implemented");
+}
+
+// @return False for files whose writer version is WriterVersion_ORIGINAL,
+//         true otherwise
+bool ReaderImpl::hasCorrectStatistics() const {
+  const WriterVersion writer = getWriterVersion();
+  return !(writer == WriterVersion_ORIGINAL);
+}
+
+// Read and parse the footer of one stripe. The footer is located after the
+// index and data sections of the stripe.
+// @param info Footer entry describing the stripe
+// @return The parsed stripe footer
+// Reports ERRCODE_INTERNAL_ERROR when parsing fails.
+proto::StripeFooter ReaderImpl::getStripeFooter(
+    const proto::StripeInformation& info) const {
+  const uint64_t footerLength = info.footerlength();
+  const uint64_t footerStart =
+      info.offset() + info.indexlength() + info.datalength();
+
+  std::unique_ptr<SeekableInputStream> rawStream(new SeekableFileInputStream(
+      stream.get(), footerStart, footerLength, memoryPool));
+  std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+      compression, std::move(rawStream), blockSize, memoryPool);
+
+  proto::StripeFooter stripeFooter;
+  const bool parsed = stripeFooter.ParseFromZeroCopyStream(pbStream.get());
+  if (!parsed) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "bad StripeFooter from %s",
+              pbStream->getName().c_str());
+  }
+  return stripeFooter;
+}
+
+// Upper bound on the number of streams a column of the given type uses.
+// getMemoryUse() feeds it into its buffer estimates.
+// @param type Column type
+// @return 1 to 4 depending on the kind; 0 for kinds not listed here
+uint64_t maxStreamsForType(const proto::Type& type) {
+  uint64_t streams = 0;
+  switch (static_cast<int64_t>(type.kind())) {
+    case proto::Type_Kind_STRUCT:
+      streams = 1;
+      break;
+    case proto::Type_Kind_INT:
+    case proto::Type_Kind_LONG:
+    case proto::Type_Kind_SHORT:
+    case proto::Type_Kind_FLOAT:
+    case proto::Type_Kind_DOUBLE:
+    case proto::Type_Kind_BOOLEAN:
+    case proto::Type_Kind_BYTE:
+    case proto::Type_Kind_DATE:
+    case proto::Type_Kind_TIME:
+    case proto::Type_Kind_LIST:
+    case proto::Type_Kind_MAP:
+    case proto::Type_Kind_UNION:
+      streams = 2;
+      break;
+    case proto::Type_Kind_BINARY:
+    case proto::Type_Kind_DECIMAL:
+    case proto::Type_Kind_TIMESTAMP:
+      streams = 3;
+      break;
+    case proto::Type_Kind_CHAR:
+    case proto::Type_Kind_STRING:
+    case proto::Type_Kind_VARCHAR:
+      streams = 4;
+      break;
+    default:
+      break;
+  }
+  return streams;
+}
+
+// Estimate the memory needed to read the file.
+// @param stripeIx Stripe to estimate for; any value outside
+//                 [0, number of stripes) means "the largest stripe"
+// @return Estimated number of bytes
+uint64_t ReaderImpl::getMemoryUse(int stripeIx) {
+  // Data length of the requested stripe, or of the largest one.
+  uint64_t maxDataLength = 0;
+  const bool singleStripe = stripeIx >= 0 && stripeIx < footer->stripes_size();
+  if (singleStripe) {
+    const uint64_t length = footer->stripes(stripeIx).datalength();
+    if (length > maxDataLength) {
+      maxDataLength = length;
+    }
+  } else {
+    for (int s = 0; s < footer->stripes_size(); s++) {
+      const uint64_t length = footer->stripes(s).datalength();
+      if (length > maxDataLength) {
+        maxDataLength = length;
+      }
+    }
+  }
+
+  // Count the streams of the selected columns, stopping at the first
+  // string-like column: from then on the count is no longer needed.
+  bool hasStringColumn = false;
+  uint64_t nSelectedStreams = 0;
+  for (int t = 0; !hasStringColumn && t < footer->types_size(); t++) {
+    if (!selectedColumns[static_cast<size_t>(t)]) {
+      continue;
+    }
+    const proto::Type& type = footer->types(t);
+    nSelectedStreams += maxStreamsForType(type);
+    const int64_t kind = static_cast<int64_t>(type.kind());
+    if (kind == proto::Type_Kind_CHAR || kind == proto::Type_Kind_STRING ||
+        kind == proto::Type_Kind_VARCHAR || kind == proto::Type_Kind_BINARY) {
+      hasStringColumn = true;
+    }
+  }
+
+  /* If a string column is read, use stripe datalength as a memory estimate
+   * because we don't know the dictionary size. Multiply by 2 because
+   * a string column requires two buffers:
+   * in the input stream and in the seekable input stream.
+   * If no string column is read, estimate from the number of streams.
+   */
+  uint64_t memory;
+  if (hasStringColumn) {
+    memory = 2 * maxDataLength;
+  } else {
+    memory = std::min(uint64_t(maxDataLength),
+                      nSelectedStreams * stream->getNaturalReadSize());
+  }
+
+  // Do we need even more memory to read the footer or the metadata?
+  const uint64_t footerNeed = postscript->footerlength() + DIRECTORY_SIZE_GUESS;
+  if (memory < footerNeed) {
+    memory = footerNeed;
+  }
+  const uint64_t metadataNeed = postscript->metadatalength();
+  if (memory < metadataNeed) {
+    memory = metadataNeed;
+  }
+
+  // Account for firstRowOfStripe.
+  memory += firstRowOfStripe.capacity() * sizeof(uint64_t);
+
+  // Decompressors need buffers for each stream
+  uint64_t decompressorMemory = 0;
+  if (compression != CompressionKind_NONE) {
+    for (int t = 0; t < footer->types_size(); t++) {
+      if (!selectedColumns[static_cast<size_t>(t)]) {
+        continue;
+      }
+      decompressorMemory += maxStreamsForType(footer->types(t)) * blockSize;
+    }
+    if (compression == CompressionKind_SNAPPY) {
+      decompressorMemory *= 2;  // Snappy decompressor uses a second buffer
+    }
+  }
+
+  return memory + decompressorMemory;
+}
+
+// Add this reader's stripe counters to the caller's totals.
+// @param scanned In/out: incremented by the number of scanned stripes
+// @param skipped In/out: incremented by the number of skipped stripes
+void ReaderImpl::collectPredicateStats(uint32_t* scanned, uint32_t* skipped) {
+  *skipped += skippedStripe;
+  *scanned += scannedStripe;
+}
+
+// Hand the underlying input stream over to the caller; the reader no longer
+// owns it afterwards.
+// @return The input stream
+std::unique_ptr<InputStream> ReaderImpl::ownInputStream() {
+  std::unique_ptr<InputStream> released(std::move(stream));
+  return released;
+}
+
+// Read the bloom filter index of a column from the current stripe.
+// @param colId Column id
+// @return The parsed index; a default-constructed message when the stripe
+//         has no bloom filter stream for the column. The result of the
+//         parse itself is not checked.
+proto::BloomFilterIndex ReaderImpl::rebuildBloomFilter(uint32_t colId) {
+  proto::BloomFilterIndex bloomFilterIndexProto;
+  std::unique_ptr<SeekableInputStream> bloomStream =
+      currentStripeStream->getStreamForBloomFilter(
+          colId, proto::Stream_Kind_BLOOM_FILTER, false);
+  if (!bloomStream) {
+    return bloomFilterIndexProto;
+  }
+  google::protobuf::io::CodedInputStream input(bloomStream.get());
+  bloomFilterIndexProto.ParseFromCodedStream(&input);
+  return bloomFilterIndexProto;
+}
+
+// Statistics-only read: instead of decoding rows, copy the statistics of
+// the current stripe into the stats of each selected top-level column
+// batch, then advance to the next stripe.
+// @param data Row batch; must be a StructVectorBatch whose fields line up
+//             with the selected top-level columns
+// @return Always true
+// Reports ERRCODE_INTERNAL_ERROR when the stripe has no statistics and
+// ERRCODE_FEATURE_NOT_SUPPORTED for column types without a mapping.
+bool ReaderImpl::doReadStatsOnly(ColumnVectorBatch* data) {
+  currentStripeStats = getStripeStatistics(currentStripe);
+  // getStripeStatistics() returns nullptr for files without a metadata
+  // section; fail explicitly instead of dereferencing it below.
+  if (!currentStripeStats) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "stripe statistics not available");
+  }
+  orc::StructVectorBatch* structBatch =
+      dynamic_cast<orc::StructVectorBatch*>(data);
+  assert(structBatch != nullptr);
+  std::vector<orc::ColumnVectorBatch*>::iterator it =
+      structBatch->fields.begin();
+  for (uint64_t i = 0; i < schema->getSubtypeCount(); ++i) {
+    const Type& child = *schema->getSubtype(i);
+    if (!selectedColumns[child.getColumnId()]) continue;
+
+    orc::ColumnVectorBatch* b = *it++;
+    const univplan::ColumnStatistics* s =
+        currentStripeStats->getColumnStatistics(child.getColumnId());
+    b->hasStats = true;
+    // Optimistic default; cleared below when the column has no minimum.
+    b->stats.hasMinMaxStats = true;
+    b->stats.valueCount = s->getNumberOfValues();
+    switch (b->getType()) {
+      case orc::ORCTypeKind::BYTE:
+      case orc::ORCTypeKind::SHORT:
+      case orc::ORCTypeKind::INT:
+      case orc::ORCTypeKind::LONG:
+      case orc::ORCTypeKind::TIME: {
+        const IntegerColumnStatisticsImpl* iStat =
+            dynamic_cast<const IntegerColumnStatisticsImpl*>(s);
+        assert(iStat != nullptr);
+        if (iStat->hasMinimum()) {
+          // Narrow min/max to the width of the column type.
+          if (b->getType() == orc::ORCTypeKind::BYTE) {
+            b->stats.minimum =
+                dbcommon::CreateDatum(static_cast<int8_t>(iStat->getMinimum()));
+            b->stats.maximum =
+                dbcommon::CreateDatum(static_cast<int8_t>(iStat->getMaximum()));
+          } else if (b->getType() == orc::ORCTypeKind::SHORT) {
+            b->stats.minimum = dbcommon::CreateDatum(
+                static_cast<int16_t>(iStat->getMinimum()));
+            b->stats.maximum = dbcommon::CreateDatum(
+                static_cast<int16_t>(iStat->getMaximum()));
+          } else if (b->getType() == orc::ORCTypeKind::INT) {
+            b->stats.minimum = dbcommon::CreateDatum(
+                static_cast<int32_t>(iStat->getMinimum()));
+            b->stats.maximum = dbcommon::CreateDatum(
+                static_cast<int32_t>(iStat->getMaximum()));
+          } else {
+            b->stats.minimum = dbcommon::CreateDatum(iStat->getMinimum());
+            b->stats.maximum = dbcommon::CreateDatum(iStat->getMaximum());
+          }
+          b->stats.sum = dbcommon::CreateDatum(iStat->getSum());
+        } else {
+          b->stats.hasMinMaxStats = false;
+        }
+        break;
+      }
+      case orc::ORCTypeKind::FLOAT:
+      case orc::ORCTypeKind::DOUBLE: {
+        const DoubleColumnStatisticsImpl* dStat =
+            dynamic_cast<const DoubleColumnStatisticsImpl*>(s);
+        assert(dStat != nullptr);
+        if (dStat->hasMinimum()) {
+          if (b->getType() == orc::ORCTypeKind::FLOAT) {
+            b->stats.minimum =
+                dbcommon::CreateDatum(static_cast<float>(dStat->getMinimum()));
+            b->stats.maximum =
+                dbcommon::CreateDatum(static_cast<float>(dStat->getMaximum()));
+          } else {
+            b->stats.minimum = dbcommon::CreateDatum(dStat->getMinimum());
+            b->stats.maximum = dbcommon::CreateDatum(dStat->getMaximum());
+          }
+          b->stats.sum = dbcommon::CreateDatum(dStat->getSum());
+        } else {
+          b->stats.hasMinMaxStats = false;
+        }
+        break;
+      }
+      case orc::ORCTypeKind::CHAR:
+      case orc::ORCTypeKind::VARCHAR:
+      case orc::ORCTypeKind::STRING: {
+        const StringColumnStatisticsImpl* sStat =
+            dynamic_cast<const StringColumnStatisticsImpl*>(s);
+        assert(sStat != nullptr);
+        if (sStat->hasMinimum()) {
+          b->stats.minimum = dbcommon::CreateDatum(sStat->getMinimum());
+          b->stats.maximum = dbcommon::CreateDatum(sStat->getMaximum());
+        } else {
+          b->stats.hasMinMaxStats = false;
+        }
+        break;
+      }
+      case orc::ORCTypeKind::BINARY: {
+        // Binary columns never report min/max here.
+        b->stats.hasMinMaxStats = false;
+        break;
+      }
+      case orc::ORCTypeKind::DATE: {
+        const DateColumnStatisticsImpl* dStat =
+            dynamic_cast<const DateColumnStatisticsImpl*>(s);
+        assert(dStat != nullptr);
+        if (dStat->hasMinimum()) {
+          b->stats.minimum = dbcommon::CreateDatum(dStat->getMinimum());
+          b->stats.maximum = dbcommon::CreateDatum(dStat->getMaximum());
+        } else {
+          b->stats.hasMinMaxStats = false;
+        }
+        break;
+      }
+      case orc::ORCTypeKind::TIMESTAMP: {
+        const TimestampColumnStatisticsImpl* tStat =
+            dynamic_cast<const TimestampColumnStatisticsImpl*>(s);
+        assert(tStat != nullptr);
+        if (tStat->hasMinimum()) {
+          b->stats.minimum = dbcommon::CreateDatum(tStat->getMinimum());
+          b->stats.maximum = dbcommon::CreateDatum(tStat->getMaximum());
+        } else {
+          b->stats.hasMinMaxStats = false;
+        }
+        break;
+      }
+      case orc::ORCTypeKind::DECIMAL: {
+        const DecimalColumnStatisticsImpl* dStat =
+            dynamic_cast<const DecimalColumnStatisticsImpl*>(s);
+        assert(dStat != nullptr);
+        if (dStat->hasMinimum()) {
+          b->stats.minimum = dbcommon::CreateDatum(dStat->getMinimumStr());
+          b->stats.maximum = dbcommon::CreateDatum(dStat->getMaximumStr());
+          b->stats.sum = dbcommon::CreateDatum(dStat->getSumStr());
+        } else {
+          b->stats.hasMinMaxStats = false;
+        }
+        break;
+      }
+      default: {
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "type %d not supported yet",
+                  b->getType());
+      }
+    }
+  }
+  // The whole stripe is consumed in one call.
+  currentStripe += 1;
+  currentRowInStripe = 0;
+  return true;
+}
+
+// Prepare reading of the stripe described by currentStripeInfo and
+// currentStripeStream: record its row count and build a new column reader.
+// The previous reader is released before the new one is built.
+void ReaderImpl::startNextStripe() {
+  rowsInCurrentStripe = currentStripeInfo.numberofrows();
+  curReader.reset();
+  curReader = buildReader(*schema, *currentStripeStream);
+}
+
+void ReaderImpl::checkOrcVersion() {
+ std::string version = getFormatVersion();
+ if (version != "0.11" && version != "0.12") {
+ *(options.getErrorStream())
+ << "Warning: ORC file " << stream->getName()
+ << " was written in an unknown format version " << version << "\n";
+ }
+}
+
+// Check that none of the selected top-level column batches has the given
+// type.
+// @param data Row batch; must be a StructVectorBatch whose fields line up
+//             with the selected top-level columns
+// @param typekind Type to look for
+// @return False as soon as a selected column batch of that type is found,
+//         true otherwise
+// Side effect: refreshes currentStripeStats for the current stripe, as the
+// original did.
+bool ReaderImpl::notIncludeType(ColumnVectorBatch* data,
+                                orc::ORCTypeKind typekind) {
+  currentStripeStats = getStripeStatistics(currentStripe);
+  orc::StructVectorBatch* structBatch =
+      dynamic_cast<orc::StructVectorBatch*>(data);
+  assert(structBatch != nullptr);
+  std::vector<orc::ColumnVectorBatch*>::iterator it =
+      structBatch->fields.begin();
+  for (uint64_t i = 0; i < schema->getSubtypeCount(); ++i) {
+    const Type& child = *schema->getSubtype(i);
+    if (!selectedColumns[child.getColumnId()]) continue;
+
+    // The per-column statistics lookup that used to sit here was never
+    // used, and it dereferenced currentStripeStats, which is null for files
+    // without a metadata section.
+    orc::ColumnVectorBatch* b = *it++;
+    if (b->getType() == typekind) return false;
+  }
+  return true;
+}
+
+// Read the next batch of rows.
+// At the start of a stripe this either serves statistics only (when the
+// options ask for it) or sets up the stripe streams; stripes that the
+// predicates can rule out from their statistics are skipped.
+// @param data Batch to fill; numElements is set to the number of rows read
+// @return False when the stripe range is exhausted; true when rows (or, in
+//         statistics-only mode, stripe statistics) were produced
+bool ReaderImpl::next(ColumnVectorBatch& data) {
+  // Loop until positioned inside a stripe that has to be read.
+  for (;;) {
+    if (currentStripe >= lastStripe) {
+      data.numElements = 0;
+      if (lastStripe > 0) {
+        previousRow =
+            firstRowOfStripe[lastStripe - 1] +
+            footer->stripes(static_cast<int>(lastStripe - 1)).numberofrows();
+      } else {
+        previousRow = 0;
+      }
+      return false;
+    }
+    if (currentRowInStripe != 0) {
+      // In the middle of a stripe that is already set up.
+      break;
+    }
+
+    // check if only read stripe statistics, and then return
+    if (options.readStatsOnly()) {
+      return doReadStatsOnly(&data);
+    }
+
+    const proto::StripeFooter* currentStripeFooter =
+        &stripeFooters[currentStripe - firstStripe];
+    const Timezone& writerTimezone =
+        currentStripeFooter->has_writertimezone()
+            ? getTimezoneByName(currentStripeFooter->writertimezone())
+            : localTimezone;
+    currentStripeInfo = footer->stripes(static_cast<int>(currentStripe));
+    currentStripeStream.reset(new StripeStreamsImpl(
+        *this, *currentStripeFooter, currentStripeInfo.offset(),
+        *(stream.get()), memoryPool, writerTimezone));
+
+    // filter push down
+    bool canDrop = false;
+    if (options.getPredicateExprs()) {
+      std::unique_ptr<univplan::Statistics> stats =
+          getStripeStatistics(currentStripe);
+      if (stats) {
+        OrcPredicates::uptr predicate(
+            new OrcPredicates(stats.get(), this, options.getPredicateExprs(),
+                              options.getTupleDesc()));
+        canDrop = predicate->canDrop();
+      }
+    }
+    if (!canDrop) {
+      ++scannedStripe;
+      startNextStripe();
+      break;
+    }
+    // The predicates rule this stripe out: try the next one.
+    ++skippedStripe;
+    currentStripe += 1;
+  }
+
+  const uint64_t remainingRows = rowsInCurrentStripe - currentRowInStripe;
+  uint64_t rowsToRead =
+      std::min(static_cast<uint64_t>(data.capacity), remainingRows);
+  data.numElements = rowsToRead;
+  curReader->next(data, rowsToRead, 0);
+
+  // update row number
+  previousRow = firstRowOfStripe[currentStripe] + currentRowInStripe;
+  currentRowInStripe += rowsToRead;
+  if (currentRowInStripe >= rowsInCurrentStripe) {
+    currentStripe += 1;
+    currentRowInStripe = 0;
+  }
+  return rowsToRead != 0;
+}
+
+// Create a row batch matching the selected schema.
+// @param capacity Capacity passed on to the batch
+// @return The batch, or nullptr when the file has no stripes
+std::unique_ptr<ColumnVectorBatch> ReaderImpl::createRowBatch(
+    uint64_t capacity) const {
+  if (!numberOfStripes) {
+    return nullptr;
+  }
+  return getSelectedType().createRowBatch(capacity, memoryPool);
+}
+
+// Verify that the file carries the "ORC" magic, either at the end of the
+// postscript (just before the last byte of the buffer) or, failing that, at
+// the very beginning of the file.
+// @param stream The file stream, used to read the head of the file
+// @param buffer Buffer holding the tail of the file
+// @param postscriptLength Length of the postscript in bytes
+// Reports ERRCODE_INTERNAL_ERROR when the lengths are too small to hold
+// the magic or when the magic is found in neither place.
+void ensureOrcFooter(InputStream* stream, DataBuffer<char>* buffer,
+                     uint64_t postscriptLength) {
+  const std::string MAGIC("ORC");
+  const uint64_t magicLength = MAGIC.length();
+  const char* const bufferStart = buffer->data();
+  const uint64_t bufferLength = buffer->size();
+
+  // The magic is read from the magicLength bytes that precede the final
+  // byte of the buffer, so the buffer needs magicLength + 1 bytes; with
+  // only magicLength bytes the pointer below would start before the buffer.
+  if (postscriptLength < magicLength || bufferLength < magicLength + 1) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Invalid ORC postscript length");
+  }
+  const char* magicStart = bufferStart + bufferLength - 1 - magicLength;
+
+  // Look for the magic string at the end of the postscript.
+  if (memcmp(magicStart, MAGIC.c_str(), magicLength) == 0) {
+    return;
+  }
+
+  // If there is no magic string at the end, check the beginning.
+  // Only files written by Hive 0.11.0 don't have the tail ORC string.
+  // std::string owns the scratch space, so nothing leaks if read() throws.
+  std::string frontBuffer(magicLength, '\0');
+  stream->read(&frontBuffer[0], magicLength, 0);
+  if (frontBuffer != MAGIC) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not an ORC file");
+  }
+}
+
+/**
+ * Read the file's postscript from the given buffer.
+ * The postscript occupies the postscriptSize bytes that precede the final
+ * byte of the buffer.
+ * @param stream the file stream
+ * @param buffer the buffer with the tail of the file.
+ * @param postscriptSize the length of postscript in bytes
+ * @return the parsed postscript
+ * Reports ERRCODE_INTERNAL_ERROR when the buffer cannot hold the postscript
+ * or when parsing fails.
+ */
+std::unique_ptr<proto::PostScript> readPostscript(InputStream* stream,
+                                                  DataBuffer<char>* buffer,
+                                                  uint64_t postscriptSize) {
+  char* ptr = buffer->data();
+  uint64_t readSize = buffer->size();
+
+  ensureOrcFooter(stream, buffer, postscriptSize);
+
+  // The postscript plus the trailing byte must fit in the buffer, otherwise
+  // the parse below would start before the beginning of the buffer.
+  if (postscriptSize >= readSize) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Invalid ORC postscript length");
+  }
+
+  std::unique_ptr<proto::PostScript> postscript(new proto::PostScript());
+  if (!postscript->ParseFromArray(ptr + readSize - 1 - postscriptSize,
+                                  static_cast<int>(postscriptSize))) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failed to parse the postscript from %s",
+              stream->getName().c_str());
+  }
+  // Returning the local by name already moves (or elides the copy);
+  // std::move here only blocked copy elision.
+  return postscript;
+}
+
+/**
+ * Parse the footer from the given buffer.
+ *
+ * The footer bytes are wrapped in a decompressor chosen from the postscript
+ * before being handed to the protobuf parser.
+ *
+ * @param stream the file's stream (used for its name in the error message)
+ * @param buffer the buffer to parse the footer from
+ * @param footerOffset the offset within the buffer that contains the footer
+ * @param ps the file's postscript
+ * @param pool the memory pool passed to the decompressor
+ * @return the parsed footer; failures are reported through LOG_ERROR
+ */
+std::unique_ptr<proto::Footer> readFooter(
+    InputStream* stream, DataBuffer<char>* buffer, uint64_t footerOffset,
+    const proto::PostScript& ps,
+    dbcommon::MemoryPool& pool) {  // NOLINT
+  char* footerPtr = buffer->data() + footerOffset;
+
+  std::unique_ptr<SeekableInputStream> pbStream = createDecompressor(
+      convertCompressionKind(ps),
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(footerPtr, ps.footerlength())),
+      getCompressionBlockSize(ps), pool);
+
+  std::unique_ptr<proto::Footer> footer(new proto::Footer());
+  if (!footer->ParseFromZeroCopyStream(pbStream.get())) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failed to parse the footer from %s",
+              stream->getName().c_str());
+  }
+  // Returning the local by name lets the compiler elide or move it.
+  return footer;
+}
+
+// Creates a Reader for the given stream.
+//
+// When the options carry a serialized file tail, the postscript, footer, file
+// length and postscript length are taken from it. Otherwise the tail of the
+// file is read: the last byte gives the postscript length, the postscript
+// gives the footer length, and the footer is parsed from the same buffer or,
+// if the first read was too short, from a second read.
+//
+// @param stream the input stream; ownership moves into the returned reader
+// @param options the reader options (memory pool, tail location, ...)
+// @return the new reader; malformed input is reported through LOG_ERROR
+std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream,
+                                     const ReaderOptions& options) {
+  dbcommon::MemoryPool* memoryPool = options.getMemoryPool();
+  std::unique_ptr<proto::PostScript> ps;
+  std::unique_ptr<proto::Footer> footer;
+  std::string serializedFooter = options.getSerializedFileTail();
+  uint64_t fileLength;
+  uint64_t postscriptLength;
+  if (serializedFooter.length() != 0) {
+    // Parse the file tail from the serialized one.
+    proto::FileTail tail;
+    if (!tail.ParseFromString(serializedFooter)) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Failed to parse the file tail from string");
+    }
+    ps.reset(new proto::PostScript(tail.postscript()));
+    footer.reset(new proto::Footer(tail.footer()));
+    fileLength = tail.filelength();
+    postscriptLength = tail.postscriptlength();
+  } else {
+    // figure out the size of the file using the option or filesystem
+    fileLength = std::min(options.getTailLocation(),
+                          static_cast<uint64_t>(stream->getLength()));
+
+    // read last bytes into buffer to get PostScript
+    uint64_t readSize = std::min(fileLength, DIRECTORY_SIZE_GUESS);
+    if (readSize < 4) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "File size too small");
+    }
+
+    std::unique_ptr<DataBuffer<char>> buffer(
+        new DataBuffer<char>(*memoryPool, readSize));
+    stream->read(buffer->data(), readSize, fileLength - readSize);
+
+    // The very last byte of the file is the postscript length.
+    postscriptLength = buffer->data()[readSize - 1] & 0xff;
+    ps = readPostscript(stream.get(), buffer.get(), postscriptLength);
+    uint64_t footerSize = ps->footerlength();
+    uint64_t tailSize = 1 + postscriptLength + footerSize;
+    uint64_t footerOffset;
+
+    // A footer length taken from a corrupt file could exceed the file itself;
+    // reject it before "fileLength - tailSize" below can wrap around.
+    if (footerSize > fileLength || tailSize > fileLength) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Invalid ORC tail size");
+    }
+
+    if (tailSize > readSize) {
+      // The first read did not cover the footer: fetch exactly the footer.
+      buffer->resize(footerSize);
+      stream->read(buffer->data(), footerSize, fileLength - tailSize);
+      footerOffset = 0;
+    } else {
+      footerOffset = readSize - tailSize;
+    }
+
+    footer =
+        readFooter(stream.get(), buffer.get(), footerOffset, *ps, *memoryPool);
+  }
+  return std::unique_ptr<Reader>(
+      new ReaderImpl(std::move(stream), options, std::move(ps),
+                     std::move(footer), fileLength, postscriptLength));
+}
+
+// Marks the top-level field with index `fieldId`, together with all of its
+// descendants, as selected. An index outside the schema's subtype range is
+// reported through LOG_ERROR.
+void ReaderImpl::updateSelectedByFieldId(uint64_t fieldId) {
+  if (fieldId < schema->getSubtypeCount()) {
+    selectChildren(*schema->getSubtype(fieldId));
+  } else {
+    // %llu expects unsigned long long, which is not necessarily the type
+    // behind uint64_t, so convert explicitly before formatting.
+    const unsigned long long requested = fieldId;  // NOLINT
+    const unsigned long long available =           // NOLINT
+        schema->getSubtypeCount();
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "Invalid column selected %llu out of %llu", requested,
+              available);
+  }
+}
+
+// Marks the type with id `typeId`, together with all of its descendants, as
+// selected. The id is validated against selectedColumns.size(); an id outside
+// that range is reported through LOG_ERROR.
+void ReaderImpl::updateSelectedByTypeId(uint64_t typeId) {
+  if (typeId < selectedColumns.size()) {
+    const Type& type = *idTypeMap[typeId];
+    selectChildren(type);
+  } else {
+    // %llu expects unsigned long long, which is not necessarily the type
+    // behind uint64_t, so convert explicitly before formatting.
+    const unsigned long long requested = typeId;  // NOLINT
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "Invalid type id selected %llu out of %zu", requested,
+              selectedColumns.size());
+  }
+}
+
+// Looks `fieldName` up in nameIdMap and selects the matching type id.
+// A name that is not in the map is reported through LOG_ERROR.
+void ReaderImpl::updateSelectedByName(const std::string& fieldName) {
+  const std::map<std::string, uint64_t>::const_iterator match =
+      nameIdMap.find(fieldName);
+  if (match == nameIdMap.end()) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Invalid column selected %s",
+              fieldName.c_str());
+  } else {
+    updateSelectedByTypeId(match->second);
+  }
+}
+
+// Marks `type` and every column id up to its maximum column id as selected.
+// Does nothing when the type's own column is already selected.
+void ReaderImpl::selectChildren(const Type& type) {
+  const size_t first = static_cast<size_t>(type.getColumnId());
+  if (selectedColumns[first]) {
+    return;
+  }
+  selectedColumns[first] = true;
+  for (size_t col = first + 1; col <= type.getMaximumColumnId(); ++col) {
+    selectedColumns[col] = true;
+  }
+}
+
+// Walks the type tree below `type` and marks a node as selected whenever the
+// node itself or any node beneath it is selected.
+// @return true if `type` or any of its descendants is selected.
+bool ReaderImpl::selectParents(const Type& type) {
+  const size_t id = static_cast<size_t>(type.getColumnId());
+  bool anySelected = selectedColumns[id];
+  for (uint64_t child = 0; child < type.getSubtypeCount(); ++child) {
+    // Always recurse, even if a selection was already found, so that every
+    // subtree gets its own flags updated.
+    if (selectParents(*type.getSubtype(child))) {
+      anySelected = true;
+    }
+  }
+  selectedColumns[id] = anySelected;
+  return anySelected;
+}
+
+// Out-of-line destructor for StripeStreams; there is nothing to release.
+StripeStreams::~StripeStreams() {}
+
+// Stores the reader, stripe footer, stripe start offset, input stream, memory
+// pool and writer timezone that the stream accessors below work with.
+// No other work is done here.
+StripeStreamsImpl::StripeStreamsImpl(const ReaderImpl& _reader,
+                                     const proto::StripeFooter& _footer,
+                                     uint64_t _stripeStart, InputStream& _input,
+                                     dbcommon::MemoryPool& pool,
+                                     const Timezone& _writerTimezone)
+    : reader(_reader),
+      footer(_footer),
+      stripeStart(_stripeStart),
+      input(_input),
+      memoryPool(pool),
+      writerTimezone(_writerTimezone) {}
+
+// Nothing to release explicitly.
+StripeStreamsImpl::~StripeStreamsImpl() {}
+
+// Forwards to the owning reader's options.
+const ReaderOptions& StripeStreamsImpl::getReaderOptions() const {
+  const ReaderOptions& readerOptions = reader.getReaderOptions();
+  return readerOptions;
+}
+
+// Returns, by value, the owning reader's per-column selection flags.
+const std::vector<bool> StripeStreamsImpl::getSelectedColumns() const {
+  return reader.getSelectedColumns();
+}
+
+// Returns the encoding entry the stripe footer holds for `columnId`.
+proto::ColumnEncoding StripeStreamsImpl::getEncoding(uint64_t columnId) const {
+  const int index = static_cast<int>(columnId);
+  return footer.columns(index);
+}
+
+// Returns the writer timezone given at construction.
+const Timezone& StripeStreamsImpl::getWriterTimezone() const {
+  return writerTimezone;
+}
+
+// Finds the stream of the given kind for the given column in the stripe
+// footer and returns it wrapped in the reader's decompressor.
+//
+// Streams are laid out back to back from stripeStart, so the byte offset of a
+// stream is stripeStart plus the lengths of all streams listed before it.
+//
+// @param columnId the column whose stream is wanted
+// @param kind the stream kind to look for
+// @param shouldStream if true the block size is the input's natural read
+//        size, otherwise it is the whole stream length
+// @return the stream, or an empty pointer when the footer lists no match
+std::unique_ptr<SeekableInputStream> StripeStreamsImpl::getStream(
+    uint64_t columnId, proto::Stream_Kind kind, bool shouldStream) const {
+  uint64_t streamOffset = stripeStart;
+  for (int idx = 0; idx < footer.streams_size(); ++idx) {
+    const proto::Stream& candidate = footer.streams(idx);
+    const bool isMatch =
+        candidate.has_kind() && candidate.kind() == kind &&
+        candidate.column() == static_cast<uint64_t>(columnId);
+    if (!isMatch) {
+      streamOffset += candidate.length();
+      continue;
+    }
+    uint64_t blockSize =
+        shouldStream ? input.getNaturalReadSize() : candidate.length();
+    return createDecompressor(
+        reader.getCompression(),
+        std::unique_ptr<SeekableInputStream>(new SeekableFileInputStream(
+            &input, streamOffset, candidate.length(), memoryPool, blockSize)),
+        reader.getCompressionSize(), memoryPool);
+  }
+  return std::unique_ptr<SeekableInputStream>();
+}
+
+// Same lookup as getStream(), but the raw file stream is a
+// SeekableFileBloomFilterInputStream.
+//
+// @param columnId the column whose stream is wanted
+// @param kind the stream kind to look for
+// @param shouldStream if true the block size is the input's natural read
+//        size, otherwise it is the whole stream length
+// @return the stream, or an empty pointer when the footer lists no match
+std::unique_ptr<SeekableInputStream> StripeStreamsImpl::getStreamForBloomFilter(
+    uint64_t columnId, proto::Stream_Kind kind, bool shouldStream) const {
+  uint64_t streamOffset = stripeStart;
+  for (int idx = 0; idx < footer.streams_size(); ++idx) {
+    const proto::Stream& candidate = footer.streams(idx);
+    const bool isMatch =
+        candidate.has_kind() && candidate.kind() == kind &&
+        candidate.column() == static_cast<uint64_t>(columnId);
+    if (!isMatch) {
+      streamOffset += candidate.length();
+      continue;
+    }
+    uint64_t blockSize =
+        shouldStream ? input.getNaturalReadSize() : candidate.length();
+    return createDecompressor(
+        reader.getCompression(),
+        std::unique_ptr<SeekableInputStream>(
+            new SeekableFileBloomFilterInputStream(
+                &input, streamOffset, candidate.length(), memoryPool,
+                blockSize)),
+        reader.getCompressionSize(), memoryPool);
+  }
+  return std::unique_ptr<SeekableInputStream>();
+}
+
+// Returns the memory pool given at construction.
+dbcommon::MemoryPool& StripeStreamsImpl::getMemoryPool() const {
+  return memoryPool;
+}
+
+// Maps a column encoding kind to the RLE version used by its integer streams:
+// DIRECT_V0/DICTIONARY_V0 -> version 0, DIRECT/DICTIONARY -> version 1,
+// DIRECT_V2/DICTIONARY_V2 -> version 2. Any other kind is reported through
+// LOG_ERROR.
+RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind) {
+  switch (static_cast<int64_t>(kind)) {
+    case proto::ColumnEncoding_Kind_DIRECT_V0:
+    case proto::ColumnEncoding_Kind_DICTIONARY_V0:
+      return RleVersion_0;
+    case proto::ColumnEncoding_Kind_DIRECT:
+    case proto::ColumnEncoding_Kind_DICTIONARY:
+      return RleVersion_1;
+    case proto::ColumnEncoding_Kind_DIRECT_V2:
+    case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+      return RleVersion_2;
+    default:
+      break;
+  }
+  LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown encoding in convertRleVersion");
+}
+
+// Records the column id and memory pool, and sets up the not-null decoder if
+// the stripe has a PRESENT stream for this column. Without a PRESENT stream
+// notNullDecoder stays empty.
+ColumnReader::ColumnReader(const Type& type, StripeStreams& stripe)
+    : columnId(type.getColumnId()), memoryPool(stripe.getMemoryPool()) {
+  std::unique_ptr<SeekableInputStream> presentStream =
+      stripe.getStream(columnId, proto::Stream_Kind_PRESENT, true);
+  if (presentStream.get() != nullptr) {
+    notNullDecoder = createBooleanRleDecoder(std::move(presentStream));
+  }
+}
+
+// Nothing to release explicitly.
+ColumnReader::~ColumnReader() {}
+
+// Advances the not-null decoder by `numValues` rows.
+//
+// @param numValues the number of rows to skip
+// @return the number of skipped rows that are not null, i.e. how many values
+//         the data streams hold for them. Without a not-null decoder this
+//         is `numValues` itself.
+uint64_t ColumnReader::skip(uint64_t numValues) {
+  ByteRleDecoder* presentDecoder = notNullDecoder.get();
+  if (!presentDecoder) {
+    return numValues;
+  }
+  // Decode the flags in bounded chunks and subtract one for every null row.
+  const size_t MAX_BUFFER_SIZE = 32768;
+  char flags[MAX_BUFFER_SIZE];
+  const uint64_t maxChunk =
+      std::min(MAX_BUFFER_SIZE, static_cast<size_t>(numValues));
+  uint64_t nonNullCount = numValues;
+  for (uint64_t left = numValues; left > 0;) {
+    const uint64_t chunk = std::min(left, maxChunk);
+    presentDecoder->next(flags, chunk, 0);
+    left -= chunk;
+    for (uint64_t i = 0; i < chunk; ++i) {
+      if (!flags[i]) {
+        --nonNullCount;
+      }
+    }
+  }
+  return nonNullCount;
+}
+
+// Prepares `rowBatch` for `numValues` rows and fills in its null information.
+//
+// The batch is grown if its capacity is too small and numElements is set.
+// With a not-null decoder, the flags are decoded into rowBatch.notNull and
+// hasNulls is set if any of them is zero. Without a decoder, a non-null
+// `incomingMask` is copied into rowBatch.notNull and hasNulls is set to true.
+// Otherwise hasNulls is set to false.
+//
+// @param rowBatch the batch to fill
+// @param numValues the number of rows to read
+// @param incomingMask optional mask passed on to the not-null decoder, or
+//        copied as described above; may be null
+void ColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                        char* incomingMask) {
+  if (numValues > rowBatch.capacity) {
+    rowBatch.resize(numValues);
+  }
+  rowBatch.numElements = numValues;
+  ByteRleDecoder* decoder = notNullDecoder.get();
+  if (decoder) {
+    char* notNullArray = rowBatch.notNull.data();
+    decoder->next(notNullArray, numValues, incomingMask);
+    // check to see if there are nulls in this batch
+    // performance: reduce branch to enable vectorize
+    char tmp = 0x0;  // false
+    // Clang's loop hints are spelled "#pragma clang loop <option>"; without
+    // the "loop" keyword the pragma is not recognised.
+#pragma clang loop vectorize(enable)
+    for (uint64_t i = 0; i < numValues; ++i) {
+      tmp |= notNullArray[i] ^ 0x1;  // !notNull
+    }
+    rowBatch.hasNulls = (tmp);
+    if (rowBatch.hasNulls) return;
+  } else if (incomingMask) {
+    // If we don't have a notNull stream, copy the incomingMask
+    rowBatch.hasNulls = true;
+    memcpy(rowBatch.notNull.data(), incomingMask, numValues);
+    return;
+  }
+  rowBatch.hasNulls = false;
+}
+
+// Widens, in place, the first `numValues` bytes stored at the start of
+// `buffer` into `numValues` 64-bit entries of the same buffer.
+// The conversion runs from the last element down to the first so that a
+// source byte is never overwritten before it has been read.
+// @param buffer holds the bytes on entry and the widened values on return
+// @param numValues the number of bytes to convert
+void expandBytesToLongs(int64_t* buffer, uint64_t numValues) {
+  const char* bytes = reinterpret_cast<char*>(buffer);
+  for (size_t remaining = numValues; remaining > 0; --remaining) {
+    const size_t idx = remaining - 1;
+    buffer[idx] = bytes[idx];
+  }
+}
+
+// Sets up the base reader and a boolean RLE decoder over the column's DATA
+// stream.
+BooleanColumnReader::BooleanColumnReader(const Type& type,
+                                         StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  std::unique_ptr<SeekableInputStream> dataStream =
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  rle = createBooleanRleDecoder(std::move(dataStream));
+}
+
+// Nothing to release explicitly.
+BooleanColumnReader::~BooleanColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+uint64_t BooleanColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  rle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a BooleanVectorBatch: the base class fills the
+// null information, then the values are decoded straight into the batch.
+void BooleanColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                               char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  bool* values = dynamic_cast<BooleanVectorBatch&>(rowBatch).data.data();
+  char* mask = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  rle->next(reinterpret_cast<char*>(values), numValues, mask);
+}
+
+// Sets up the base reader and a byte RLE decoder over the column's DATA
+// stream.
+ByteColumnReader::ByteColumnReader(const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  std::unique_ptr<SeekableInputStream> dataStream =
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  rle = createByteRleDecoder(std::move(dataStream));
+}
+
+// Nothing to release explicitly.
+ByteColumnReader::~ByteColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+uint64_t ByteColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  rle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a ByteVectorBatch: the base class fills the
+// null information, then the byte RLE decoder writes directly into the
+// batch's int8_t storage (no widening pass is performed).
+void ByteColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                            char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  int8_t* values = dynamic_cast<ByteVectorBatch&>(rowBatch).data.data();
+  char* mask = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  rle->next(reinterpret_cast<char*>(values), numValues, mask);
+}
+
+// Only initialises the base reader; `rle` is not set up here.
+// NOTE(review): presumably a derived reader creates the decoder - confirm.
+template <class IntType>
+IntegerColumnReader<IntType>::IntegerColumnReader(
+    const Type& type,
+    StripeStreams& stripe)  // NOLINT
+    : ColumnReader(type, stripe) {
+}
+
+// Skips `numValues` rows and returns how many of them were not null.
+template <class IntType>
+uint64_t IntegerColumnReader<IntType>::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  rle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a FixedSizeVectorBatch<IntType>: the base class
+// fills the null information, then the values are decoded into the batch.
+template <class IntType>
+void IntegerColumnReader<IntType>::next(ColumnVectorBatch& rowBatch,  // NOLINT
+                                        uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  FixedSizeVectorBatch<IntType>& intBatch =
+      dynamic_cast<FixedSizeVectorBatch<IntType>&>(rowBatch);
+  rle->next(intBatch.data.data(), numValues,
+            rowBatch.hasNulls ? rowBatch.notNull.data() : 0);
+}
+
+// Sets up the base reader and a signed RLE decoder (INT flavour) over the
+// column's DATA stream, using the RLE version implied by the encoding.
+DateColumnReader::DateColumnReader(const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  std::unique_ptr<SeekableInputStream> dataStream =
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  rle = createRleDecoder(std::move(dataStream), true, rleVersion, memoryPool,
+                         INT);
+}
+
+// Skips `numValues` rows and returns how many of them were not null.
+uint64_t DateColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  rle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a DateVectorBatch: the base class fills the
+// null information, then the 32-bit values are decoded into the batch.
+void DateColumnReader::next(ColumnVectorBatch& rowBatch,  // NOLINT
+                            uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  int32_t* values = dynamic_cast<DateVectorBatch&>(rowBatch).data.data();
+  char* mask = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  rle->next(values, numValues, mask);
+}
+
+// Sets up the base reader and a signed RLE decoder (LONG flavour) over the
+// column's DATA stream, using the RLE version implied by the encoding.
+TimeColumnReader::TimeColumnReader(const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  std::unique_ptr<SeekableInputStream> dataStream =
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  rle = createRleDecoder(std::move(dataStream), true, rleVersion, memoryPool,
+                         LONG);
+}
+
+// Skips `numValues` rows and returns how many of them were not null.
+uint64_t TimeColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  rle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a TimeVectorBatch: the base class fills the
+// null information, then the 64-bit values are decoded into the batch.
+void TimeColumnReader::next(ColumnVectorBatch& rowBatch,  // NOLINT
+                            uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  int64_t* values = dynamic_cast<TimeVectorBatch&>(rowBatch).data.data();
+  char* mask = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  rle->next(values, numValues, mask);
+}
+
+// Sets up the base reader, remembers the writer timezone and its epoch, and
+// creates two RLE decoders: a signed one over the DATA stream (seconds) and
+// an unsigned one over the SECONDARY stream (encoded nanoseconds).
+TimestampColumnReader::TimestampColumnReader(const Type& type,
+                                             StripeStreams& stripe)
+    : ColumnReader(type, stripe),
+      writerTimezone(stripe.getWriterTimezone()),
+      epochOffset(writerTimezone.getEpoch()) {
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  std::unique_ptr<SeekableInputStream> secondsStream =
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  secondsRle = createRleDecoder(std::move(secondsStream), true, rleVersion,
+                                memoryPool, LONG);
+  std::unique_ptr<SeekableInputStream> nanoStream =
+      stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true);
+  nanoRle = createRleDecoder(std::move(nanoStream), false, rleVersion,
+                             memoryPool, LONG);
+}
+
+// Nothing to release explicitly.
+TimestampColumnReader::~TimestampColumnReader() {}
+
+// Skips `numValues` rows in both the seconds and the nanoseconds decoder and
+// returns how many of the rows were not null.
+uint64_t TimestampColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  secondsRle->skip(nonNull);
+  nanoRle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a TimestampVectorBatch.
+//
+// The seconds go into batch.data and the nanoseconds into batch.nanoseconds.
+// For each non-null row:
+//   - the low three bits of the stored nanosecond value give a scale; when
+//     the scale is non-zero the remaining bits are multiplied by 10
+//     (scale + 1) times;
+//   - the seconds are shifted by the distance between ORC_TIMESTAMP_EPOCH_JDATE
+//     and UNIX_EPOCH_JDATE, expressed in seconds;
+//   - a negative second count with a non-zero nanosecond part is decremented
+//     by one.
+// No timezone adjustment is applied (timestamp without timezone only).
+//
+// @param rowBatch the batch to fill; must be a TimestampVectorBatch
+// @param numValues the number of rows to read
+// @param notNull optional incoming mask, forwarded to the base class
+void TimestampColumnReader::next(ColumnVectorBatch& rowBatch,
+                                 uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : nullptr;
+  TimestampVectorBatch& timestampBatch =
+      dynamic_cast<TimestampVectorBatch&>(rowBatch);
+  int64_t* secsBuffer = timestampBatch.data.data();
+  secondsRle->next(secsBuffer, numValues, notNull);
+  int64_t* nanoBuffer = timestampBatch.nanoseconds.data();
+  nanoRle->next(nanoBuffer, numValues, notNull);
+
+  // Construct the values
+  for (uint64_t i = 0; i < numValues; i++) {
+    if (notNull == nullptr || notNull[i]) {
+      uint64_t zeros = nanoBuffer[i] & 0x7;
+      nanoBuffer[i] >>= 3;
+      if (zeros != 0) {
+        for (uint64_t j = 0; j <= zeros; ++j) {
+          nanoBuffer[i] *= 10;
+        }
+      }
+      // For now only support timestamp without timezone, so the writer's
+      // timezone offset is deliberately not applied here.
+      secsBuffer[i] +=
+          (ORC_TIMESTAMP_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECONDS_PER_DAY;
+      if (secsBuffer[i] < 0 && nanoBuffer[i] != 0) {
+        secsBuffer[i] -= 1;
+      }
+    }
+  }
+}
+
+// Sets up the base reader over the column's DATA stream. Values are 4 bytes
+// wide for FLOAT columns and 8 bytes wide otherwise. The read window
+// (bufferPointer/bufferEnd) starts out empty.
+DoubleColumnReader::DoubleColumnReader(const Type& type, StripeStreams& stripe)
+    :  // NOLINT
+      ColumnReader(type, stripe),
+      inputStream(stripe.getStream(columnId, proto::Stream_Kind_DATA, true)),
+      columnKind(type.getKind()),
+      bytesPerValue((type.getKind() == FLOAT) ? 4 : 8),
+      bufferPointer(NULL),
+      bufferEnd(NULL) {
+}
+
+// Nothing to release explicitly.
+DoubleColumnReader::~DoubleColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+//
+// The non-null values occupy bytesPerValue bytes each. If the current read
+// window holds that many bytes the window pointer is simply advanced;
+// otherwise the shortfall is skipped on the input stream and the window is
+// reset to empty.
+uint64_t DoubleColumnReader::skip(uint64_t numValues) {
+  numValues = ColumnReader::skip(numValues);
+
+  const size_t available = static_cast<size_t>(bufferEnd - bufferPointer);
+  const auto wanted = bytesPerValue * numValues;
+  if (available >= wanted) {
+    bufferPointer += wanted;
+    return numValues;
+  }
+
+  inputStream->Skip(static_cast<int>(wanted - available));
+  bufferEnd = NULL;
+  bufferPointer = NULL;
+  return numValues;
+}
+
+// Reads `numValues` rows into a FloatVectorBatch (FLOAT columns) or a
+// DoubleVectorBatch (all other kinds).
+//
+// Without nulls the values are read straight into the batch. With nulls only
+// the non-null values are read, packed at the front of the batch, and then
+// spread out to their row positions. The spreading runs from the last row to
+// the first so that a packed value is never overwritten before it has been
+// moved. Slots of null rows are left untouched.
+void DoubleColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                              char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  // update the notNull from the parent class
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+
+  // Number of rows that actually carry a value (only needed with nulls).
+  uint64_t presentCount = 0;
+  if (notNull) {
+    for (uint64_t row = 0; row < numValues; ++row) {
+      if (notNull[row]) {
+        ++presentCount;
+      }
+    }
+  }
+
+  if (columnKind == FLOAT) {
+    void* data = dynamic_cast<FloatVectorBatch&>(rowBatch).data.data();
+    if (!notNull) {
+      readData(reinterpret_cast<char*>(data), numValues);
+      return;
+    }
+    float* values = reinterpret_cast<float*>(data);
+    readData(reinterpret_cast<char*>(values), presentCount);
+    uint64_t packed = presentCount;  // one past the next packed value
+    for (uint64_t row = numValues; row-- > 0;) {
+      if (notNull[row]) {
+        values[row] = values[--packed];
+      }
+    }
+  } else {
+    void* data = dynamic_cast<DoubleVectorBatch&>(rowBatch).data.data();
+    if (!notNull) {
+      readData(reinterpret_cast<char*>(data), numValues);
+      return;
+    }
+    double* values = reinterpret_cast<double*>(data);
+    readData(reinterpret_cast<char*>(values), presentCount);
+    uint64_t packed = presentCount;  // one past the next packed value
+    for (uint64_t row = numValues; row-- > 0;) {
+      if (notNull[row]) {
+        values[row] = values[--packed];
+      }
+    }
+  }
+}
+
+// Sets up the base reader and loads the whole dictionary for this stripe.
+//
+// The LENGTH stream supplies one length per dictionary entry; these are
+// turned into running offsets so that entry i spans
+// [dictionaryOffset[i], dictionaryOffset[i + 1]) inside dictionaryBlob, which
+// is filled from the DICTIONARY_DATA stream. The DATA stream (dictionary
+// indexes per row) is decoded later through `rle`.
+StringDictionaryColumnReader::StringDictionaryColumnReader(
+    const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe),
+      dictionaryBlob(stripe.getMemoryPool()),
+      dictionaryOffset(stripe.getMemoryPool()) {
+  const proto::ColumnEncoding encoding = stripe.getEncoding(columnId);
+  RleVersion rleVersion = convertRleVersion(encoding.kind());
+  dictionaryCount = encoding.dictionarysize();
+  rle = createRleDecoder(
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true), false,
+      rleVersion, memoryPool, LONG);
+
+  // Decode the entry lengths into slots 1..dictionaryCount and accumulate.
+  std::unique_ptr<RleDecoder> lengthDecoder = createRleDecoder(
+      stripe.getStream(columnId, proto::Stream_Kind_LENGTH, false), false,
+      rleVersion, memoryPool, LONG);
+  dictionaryOffset.resize(dictionaryCount + 1);
+  int64_t* offsets = dictionaryOffset.data();
+  lengthDecoder->next(offsets + 1, dictionaryCount, 0);
+  offsets[0] = 0;
+  for (uint64_t entry = 1; entry < dictionaryCount + 1; ++entry) {
+    offsets[entry] += offsets[entry - 1];
+  }
+
+  // The last offset is the total size of the dictionary bytes.
+  int64_t blobSize = offsets[dictionaryCount];
+  dictionaryBlob.resize(static_cast<uint64_t>(blobSize));
+  std::unique_ptr<SeekableInputStream> blobStream =
+      stripe.getStream(columnId, proto::Stream_Kind_DICTIONARY_DATA, false);
+  readFully(dictionaryBlob.data(), blobSize, blobStream.get());
+}
+
+// Nothing to release explicitly.
+StringDictionaryColumnReader::~StringDictionaryColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+uint64_t StringDictionaryColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  rle->skip(nonNull);
+  return nonNull;
+}
+
+// Reads `numValues` rows into a BytesVectorBatch using the dictionary.
+//
+// The dictionary indexes are first decoded into the batch's length array.
+// Each non-null row's index is then replaced by the entry's length, and the
+// row's start pointer is set to the entry's bytes inside dictionaryBlob.
+// Rows that are null are left untouched. The indexes are not range-checked.
+void StringDictionaryColumnReader::next(ColumnVectorBatch& rowBatch,
+                                        uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  // update the notNull from the parent class
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  BytesVectorBatch& byteBatch = dynamic_cast<BytesVectorBatch&>(rowBatch);
+  byteBatch.isDirectEncoding = false;
+
+  char* const blob = dictionaryBlob.data();
+  const int64_t* const entryOffsets = dictionaryOffset.data();
+  char** const starts = byteBatch.data.data();
+  int64_t* const lengths = byteBatch.length.data();
+  rle->next(lengths, numValues, notNull);
+
+  if (!notNull) {
+    for (uint64_t row = 0; row < numValues; ++row) {
+      const int64_t entry = lengths[row];
+      starts[row] = blob + entryOffsets[entry];
+      lengths[row] = entryOffsets[entry + 1] - entryOffsets[entry];
+    }
+    return;
+  }
+  for (uint64_t row = 0; row < numValues; ++row) {
+    if (!notNull[row]) {
+      continue;
+    }
+    const int64_t entry = lengths[row];
+    starts[row] = blob + entryOffsets[entry];
+    lengths[row] = entryOffsets[entry + 1] - entryOffsets[entry];
+  }
+}
+
+// Sets up the base reader, an unsigned RLE decoder over the LENGTH stream and
+// the raw DATA stream holding the string bytes. The leftover window into the
+// DATA stream (lastBuffer/lastBufferLength) starts out empty.
+StringDirectColumnReader::StringDirectColumnReader(const Type& type,
+                                                   StripeStreams& stripe)
+    : ColumnReader(type, stripe), blobBuffer(stripe.getMemoryPool()) {
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  std::unique_ptr<SeekableInputStream> lengthStream =
+      stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
+  lengthRle = createRleDecoder(std::move(lengthStream), false, rleVersion,
+                               memoryPool, LONG);
+  blobStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  lastBuffer = 0;
+  lastBufferLength = 0;
+}
+
+// Nothing to release explicitly.
+StringDirectColumnReader::~StringDirectColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+//
+// The lengths of the skipped non-null values are decoded and summed to learn
+// how many string bytes to pass over. Those bytes are taken from the leftover
+// window first; whatever the window cannot cover is skipped on the blob
+// stream and the window is reset to empty.
+uint64_t StringDirectColumnReader::skip(uint64_t numValues) {
+  const size_t BUFFER_SIZE = 1024;
+  numValues = ColumnReader::skip(numValues);
+  int64_t lengths[BUFFER_SIZE];
+  size_t bytesToSkip = 0;
+  // read the lengths, so we know how many bytes to skip
+  for (uint64_t consumed = 0; consumed < numValues;) {
+    const uint64_t batch =
+        std::min(BUFFER_SIZE, static_cast<size_t>(numValues - consumed));
+    lengthRle->next(lengths, batch, 0);
+    bytesToSkip += computeSize(lengths, 0, batch);
+    consumed += batch;
+  }
+  if (bytesToSkip > lastBufferLength) {
+    // move the stream forward after accounting for the buffered bytes
+    bytesToSkip -= lastBufferLength;
+    blobStream->Skip(static_cast<int>(bytesToSkip));
+    lastBufferLength = 0;
+    lastBuffer = 0;
+  } else {
+    // subtract the needed bytes from the ones left over
+    lastBufferLength -= bytesToSkip;
+    lastBuffer += bytesToSkip;
+  }
+  return numValues;
+}
+
+// Sums the first `numValues` entries of `lengths`.
+// @param lengths the per-row lengths
+// @param notNull optional per-row flags; when given, only rows whose flag is
+//        non-zero are counted
+// @param numValues the number of rows to look at
+// @return the total length of the counted rows
+size_t StringDirectColumnReader::computeSize(const int64_t* lengths,
+                                             const char* notNull,
+                                             uint64_t numValues) {
+  size_t sum = 0;
+  if (!notNull) {
+    for (size_t row = 0; row < numValues; ++row) {
+      sum += static_cast<size_t>(lengths[row]);
+    }
+    return sum;
+  }
+  for (size_t row = 0; row < numValues; ++row) {
+    if (notNull[row]) {
+      sum += static_cast<size_t>(lengths[row]);
+    }
+  }
+  return sum;
+}
+
+// Reads numValues rows into a BytesVectorBatch for a directly encoded string
+// column.
+//
+// The per-row lengths are decoded first. The start pointers are then set up
+// in three stages:
+// 1. values that lie completely inside the bytes copied into blobBuffer,
+// 2. at most one value that straddles blobBuffer and the current stream
+// chunk (its tail is appended to blobBuffer),
+// 3. all remaining values, which point directly into the current stream
+// chunk (lastBuffer).
+// NOTE(review): pointers from stage 3 refer to memory handed out by
+// blobStream->Next(); presumably they stay valid only until the stream is
+// advanced again - confirm against the stream implementation.
+//
+// @param rowBatch the batch to fill; must be a BytesVectorBatch
+// @param numValues the number of rows to read
+// @param notNull optional incoming mask, forwarded to the base class
+void StringDirectColumnReader::next(ColumnVectorBatch& rowBatch,
+ uint64_t numValues, char* notNull) {
+ ColumnReader::next(rowBatch, numValues, notNull);
+ // update the notNull from the parent class
+ notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+ BytesVectorBatch& byteBatch = dynamic_cast<BytesVectorBatch&>(rowBatch);
+ byteBatch.isDirectEncoding = true;
+ char** __restrict__ startPtr = byteBatch.data.data();
+ int64_t* __restrict__ lengthPtr = byteBatch.length.data();
+
+ // read the length vector
+ lengthRle->next(lengthPtr, numValues, notNull);
+
+ // figure out the total length of data we need from the blob stream
+ const size_t totalLength = computeSize(lengthPtr, notNull, numValues);
+
+ // Load data from the blob stream into our buffer until we have enough
+ // to get the rest directly out of the stream's buffer.
+ size_t bytesBuffered = 0;
+ // Size the buffer for the whole batch up front; the resize calls inside the
+ // loop below only ever ask for a smaller size than this.
+ blobBuffer.resize(totalLength);
+ char* ptr = blobBuffer.data();
+ while (bytesBuffered + lastBufferLength < totalLength) {
+ // The current chunk is not enough: copy all of it into blobBuffer and
+ // fetch the next chunk from the stream.
+ blobBuffer.resize(bytesBuffered + lastBufferLength);
+ memcpy(ptr + bytesBuffered, lastBuffer, lastBufferLength);
+ bytesBuffered += lastBufferLength;
+ const void* readBuffer;
+ int readLength;
+ if (!blobStream->Next(&readBuffer, &readLength)) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "failed to read in StringDirectColumnReader.next");
+ }
+ lastBuffer = static_cast<const char*>(readBuffer);
+ lastBufferLength = static_cast<size_t>(readLength);
+ }
+
+ // Set up the start pointers for the ones that will come out of the buffer.
+ size_t filledSlots = 0;
+ size_t usedBytes = 0;
+ ptr = blobBuffer.data();
+ if (notNull) {
+ // Null rows are stepped over; the loop stops at the first non-null value
+ // that does not fit completely into the buffered bytes.
+ while (filledSlots < numValues &&
+ (!notNull[filledSlots] ||
+ usedBytes + static_cast<size_t>(lengthPtr[filledSlots]) <=
+ bytesBuffered)) {
+ if (notNull[filledSlots]) {
+ startPtr[filledSlots] = ptr + usedBytes;
+ usedBytes += static_cast<size_t>(lengthPtr[filledSlots]);
+ }
+ filledSlots += 1;
+ }
+ } else {
+ while (filledSlots < numValues &&
+ (usedBytes + static_cast<size_t>(lengthPtr[filledSlots]) <=
+ bytesBuffered)) {
+ startPtr[filledSlots] = ptr + usedBytes;
+ usedBytes += static_cast<size_t>(lengthPtr[filledSlots]);
+ filledSlots += 1;
+ }
+ }
+
+ // do we need to complete the last value in the blob buffer?
+ if (usedBytes < bytesBuffered) {
+ // The value at filledSlots starts in blobBuffer but ends in the current
+ // chunk: append its missing tail so that it is contiguous.
+ size_t moreBytes = static_cast<size_t>(lengthPtr[filledSlots]) -
+ (bytesBuffered - usedBytes);
+ blobBuffer.resize(bytesBuffered + moreBytes);
+ ptr = blobBuffer.data();
+ memcpy(ptr + bytesBuffered, lastBuffer, moreBytes);
+ lastBuffer += moreBytes;
+ lastBufferLength -= moreBytes;
+ startPtr[filledSlots++] = ptr + usedBytes;
+ }
+
+ // Finally, set up any remaining entries into the stream buffer
+ if (notNull) {
+ while (filledSlots < numValues) {
+ if (notNull[filledSlots]) {
+ startPtr[filledSlots] = const_cast<char*>(lastBuffer);
+ lastBuffer += lengthPtr[filledSlots];
+ lastBufferLength -= static_cast<size_t>(lengthPtr[filledSlots]);
+ }
+ filledSlots += 1;
+ }
+ } else {
+ // performance: use tmp variable to avoid needless memory update for the
+ // member variable
+ const char* tmpLastBuffer = lastBuffer;
+#pragma clang loop unroll(full)
+ while (filledSlots < numValues) {
+ startPtr[filledSlots] = const_cast<char*>(tmpLastBuffer);
+ tmpLastBuffer += lengthPtr[filledSlots];
+ filledSlots += 1;
+ }
+ // Write the advanced position back to the members in one go.
+ lastBufferLength = lastBufferLength - (tmpLastBuffer - lastBuffer);
+ lastBuffer = tmpLastBuffer;
+ }
+}
+
+// Sets up the base reader and builds one child reader per selected subtype.
+//
+// Only the DIRECT encoding is accepted for a struct column; every other
+// encoding kind is reported through LOG_ERROR.
+//
+// The child readers are held as raw pointers that the destructor deletes.
+// A destructor does not run for an object whose constructor exits with an
+// exception, so if building a later child fails, the children built so far
+// are deleted here before the exception is passed on.
+StructColumnReader::StructColumnReader(const Type& type, StripeStreams& stripe)
+    :  // NOLINT
+      ColumnReader(type, stripe) {
+  const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+  switch (static_cast<int64_t>(stripe.getEncoding(columnId).kind())) {
+    case proto::ColumnEncoding_Kind_DIRECT:
+      try {
+        for (unsigned int i = 0; i < type.getSubtypeCount(); ++i) {
+          const Type& child = *type.getSubtype(i);
+          if (selectedColumns[static_cast<uint64_t>(child.getColumnId())]) {
+            std::unique_ptr<ColumnReader> reader = buildReader(child, stripe);
+            // Reserve the slot first: if push_back fails, `reader` still owns
+            // the object and releases it.
+            children.push_back(nullptr);
+            children.back() = reader.release();
+          }
+        }
+      } catch (...) {
+        for (size_t i = 0; i < children.size(); i++) {
+          delete children[i];
+        }
+        children.clear();
+        throw;
+      }
+      break;
+    case proto::ColumnEncoding_Kind_DIRECT_V2:
+    case proto::ColumnEncoding_Kind_DICTIONARY:
+    case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Unknown encoding for StructColumnReader");
+  }
+}
+
+// Deletes the child readers, which are held as raw pointers.
+StructColumnReader::~StructColumnReader() {
+  for (std::vector<ColumnReader*>::iterator it = children.begin();
+       it != children.end(); ++it) {
+    delete *it;
+  }
+}
+
+// Skips `numValues` rows in this reader and the corresponding non-null rows
+// in every child reader. Returns how many of the rows were not null.
+uint64_t StructColumnReader::skip(uint64_t numValues) {
+  const uint64_t nonNull = ColumnReader::skip(numValues);
+  for (size_t idx = 0; idx < children.size(); ++idx) {
+    children[idx]->skip(nonNull);
+  }
+  return nonNull;
+}
+
+// Reads `numValues` rows into a StructVectorBatch: the base class fills the
+// struct's own null information, then the i-th child reader fills the i-th
+// field batch, receiving the struct's not-null flags as its incoming mask.
+void StructColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                              char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  for (uint64_t field = 0; field < children.size(); ++field) {
+    ColumnVectorBatch& fieldBatch =
+        *(dynamic_cast<StructVectorBatch&>(rowBatch).fields[field]);
+    children[field]->next(fieldBatch, numValues, notNull);
+  }
+}
+
+// Sets up the base reader and an unsigned RLE decoder over the LENGTH stream
+// (one length per list). A reader for the element type is built only if that
+// column is selected; otherwise `child` stays empty.
+ListColumnReader::ListColumnReader(const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  std::unique_ptr<SeekableInputStream> lengthStream =
+      stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
+  rle = createRleDecoder(std::move(lengthStream), false, rleVersion,
+                         memoryPool, LONG);
+  const Type& elementType = *type.getSubtype(0);
+  const uint64_t elementColumn =
+      static_cast<uint64_t>(elementType.getColumnId());
+  if (selectedColumns[elementColumn]) {
+    child = buildReader(elementType, stripe);
+  }
+}
+
+// Nothing to release explicitly.
+ListColumnReader::~ListColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+//
+// With an element reader, the lengths of the skipped lists are decoded and
+// summed so that the same number of elements can be skipped in the child.
+// Without one, the lengths are simply skipped.
+uint64_t ListColumnReader::skip(uint64_t numValues) {
+  numValues = ColumnReader::skip(numValues);
+  ColumnReader* const elementReader = child.get();
+  if (!elementReader) {
+    rle->skip(numValues);
+    return numValues;
+  }
+  const uint64_t BUFFER_SIZE = 1024;
+  int64_t lengths[BUFFER_SIZE];
+  uint64_t elementCount = 0;
+  for (uint64_t consumed = 0; consumed < numValues;) {
+    const uint64_t batch = std::min(numValues - consumed, BUFFER_SIZE);
+    rle->next(lengths, batch, 0);
+    for (size_t i = 0; i < batch; ++i) {
+      elementCount += static_cast<size_t>(lengths[i]);
+    }
+    consumed += batch;
+  }
+  elementReader->skip(elementCount);
+  return numValues;
+}
+
+// Reads `numValues` rows into a ListVectorBatch.
+//
+// The list lengths are decoded into the offsets array and converted in place
+// into start offsets: offsets[i] becomes the number of elements that precede
+// row i, a null row contributes no elements, and offsets[numValues] receives
+// the total. If an element reader exists, that many elements are then read
+// into the batch's element batch.
+void ListColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                            char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  ListVectorBatch& listBatch = dynamic_cast<ListVectorBatch&>(rowBatch);
+  int64_t* const offsets = listBatch.offsets.data();
+  notNull = listBatch.hasNulls ? listBatch.notNull.data() : 0;
+  rle->next(offsets, numValues, notNull);
+
+  uint64_t runningTotal = 0;
+  if (notNull) {
+    for (size_t row = 0; row < numValues; ++row) {
+      const uint64_t length =
+          notNull[row] ? static_cast<uint64_t>(offsets[row]) : 0;
+      offsets[row] = static_cast<int64_t>(runningTotal);
+      runningTotal += length;
+    }
+  } else {
+    for (size_t row = 0; row < numValues; ++row) {
+      const uint64_t length = static_cast<uint64_t>(offsets[row]);
+      offsets[row] = static_cast<int64_t>(runningTotal);
+      runningTotal += length;
+    }
+  }
+  offsets[numValues] = static_cast<int64_t>(runningTotal);
+
+  if (ColumnReader* elementReader = child.get()) {
+    elementReader->next(*(listBatch.elements.get()), runningTotal, 0);
+  }
+}
+
+// Sets up the base reader and an unsigned RLE decoder over the LENGTH stream
+// (one entry count per map). Readers for the key type (subtype 0) and the
+// value type (subtype 1) are each built only if that column is selected.
+MapColumnReader::MapColumnReader(const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  std::unique_ptr<SeekableInputStream> lengthStream =
+      stripe.getStream(columnId, proto::Stream_Kind_LENGTH, true);
+  rle = createRleDecoder(std::move(lengthStream), false, rleVersion,
+                         memoryPool, LONG);
+
+  const Type& keyType = *type.getSubtype(0);
+  const uint64_t keyColumn = static_cast<uint64_t>(keyType.getColumnId());
+  if (selectedColumns[keyColumn]) {
+    keyReader = buildReader(keyType, stripe);
+  }
+
+  const Type& valueType = *type.getSubtype(1);
+  const uint64_t valueColumn = static_cast<uint64_t>(valueType.getColumnId());
+  if (selectedColumns[valueColumn]) {
+    elementReader = buildReader(valueType, stripe);
+  }
+}
+
+// Nothing to release explicitly.
+MapColumnReader::~MapColumnReader() {}
+
+// Skips `numValues` rows and returns how many of them were not null.
+//
+// With a key and/or value reader, the entry counts of the skipped maps are
+// decoded and summed so that the same number of entries can be skipped in
+// each existing child. Without any child, the counts are simply skipped.
+uint64_t MapColumnReader::skip(uint64_t numValues) {
+  numValues = ColumnReader::skip(numValues);
+  ColumnReader* const keys = keyReader.get();
+  ColumnReader* const values = elementReader.get();
+  if (!keys && !values) {
+    rle->skip(numValues);
+    return numValues;
+  }
+  const uint64_t BUFFER_SIZE = 1024;
+  int64_t lengths[BUFFER_SIZE];
+  uint64_t entryCount = 0;
+  for (uint64_t consumed = 0; consumed < numValues;) {
+    const uint64_t batch = std::min(numValues - consumed, BUFFER_SIZE);
+    rle->next(lengths, batch, 0);
+    for (size_t i = 0; i < batch; ++i) {
+      entryCount += static_cast<size_t>(lengths[i]);
+    }
+    consumed += batch;
+  }
+  if (keys) {
+    keys->skip(entryCount);
+  }
+  if (values) {
+    values->skip(entryCount);
+  }
+  return numValues;
+}
+
+// Read numValues maps into rowBatch (which must be a MapVectorBatch).
+// The decoded per-row lengths are converted in place into starting offsets
+// (an exclusive prefix sum); offsets[numValues] receives the total number of
+// entries, which is then read from the key and value readers if present.
+void MapColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                           char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  MapVectorBatch& mapBatch = dynamic_cast<MapVectorBatch&>(rowBatch);
+  int64_t* offsets = mapBatch.offsets.data();
+  notNull = mapBatch.hasNulls ? mapBatch.notNull.data() : 0;
+  rle->next(offsets, numValues, notNull);
+
+  // Turn lengths into offsets. Null rows contribute no entries and simply
+  // take the current running total as their offset.
+  uint64_t entryCount = 0;
+  for (size_t row = 0; row < numValues; ++row) {
+    const bool present = !notNull || notNull[row];
+    const uint64_t length =
+        present ? static_cast<uint64_t>(offsets[row]) : 0;
+    offsets[row] = static_cast<int64_t>(entryCount);
+    entryCount += length;
+  }
+  offsets[numValues] = static_cast<int64_t>(entryCount);
+
+  if (ColumnReader* keys = keyReader.get()) {
+    keys->next(*(mapBatch.keys.get()), entryCount, 0);
+  }
+  if (ColumnReader* values = elementReader.get()) {
+    values->next(*(mapBatch.elements.get()), entryCount, 0);
+  }
+}
+
+// Build a reader for a UNION column: a byte-RLE decoder for the tag stream
+// (DATA) plus one child reader per selected subtype. Unselected subtypes
+// keep a null entry in childrenReader.
+UnionColumnReader::UnionColumnReader(const Type& type, StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  numChildren = type.getSubtypeCount();
+  // resize() value-initialises the raw pointers to null.
+  childrenReader.resize(numChildren);
+  childrenCounts.resize(numChildren);
+
+  rle = createByteRleDecoder(
+      stripe.getStream(columnId, proto::Stream_Kind_DATA, true));
+  // figure out which types are selected
+  const std::vector<bool> selectedColumns = stripe.getSelectedColumns();
+  try {
+    for (unsigned int i = 0; i < numChildren; ++i) {
+      const Type& child = *type.getSubtype(i);
+      if (selectedColumns[static_cast<size_t>(child.getColumnId())]) {
+        childrenReader[i] = buildReader(child, stripe).release();
+      }
+    }
+  } catch (...) {
+    // The destructor does not run when the constructor throws, so the
+    // children built so far must be freed here before propagating.
+    for (ColumnReader* reader : childrenReader) {
+      delete reader;
+    }
+    throw;
+  }
+}
+
+// Frees the child readers, which are owned through raw pointers.
+UnionColumnReader::~UnionColumnReader() {
+  for (ColumnReader* reader : childrenReader) {
+    delete reader;  // deleting a null entry is a no-op
+  }
+}
+
+// Skip numValues rows of the union column.
+// The tag stream is decoded to count how many values belong to each child,
+// and every existing child reader then skips its own count.
+// Tags come from the file, so each one is validated against numChildren
+// before it is used as an index; an out-of-range tag raises an error
+// instead of writing outside childrenCounts.
+// @return the value returned by ColumnReader::skip for numValues
+uint64_t UnionColumnReader::skip(uint64_t numValues) {
+  numValues = ColumnReader::skip(numValues);
+  const uint64_t BUFFER_SIZE = 1024;
+  char buffer[BUFFER_SIZE];
+  uint64_t lengthsRead = 0;
+  std::fill(childrenCounts.begin(), childrenCounts.end(), 0);
+  int64_t* counts = childrenCounts.data();
+  while (lengthsRead < numValues) {
+    uint64_t chunk = std::min(numValues - lengthsRead, BUFFER_SIZE);
+    rle->next(buffer, chunk, 0);
+    for (size_t i = 0; i < chunk; ++i) {
+      // Go through unsigned char so tags >= 128 do not sign-extend into a
+      // huge index on platforms where char is signed.
+      const size_t tag = static_cast<unsigned char>(buffer[i]);
+      if (tag >= numChildren) {
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                  "Union tag %u is out of range for %llu children",
+                  static_cast<unsigned int>(tag),
+                  static_cast<unsigned long long>(numChildren));  // NOLINT
+      } else {
+        counts[tag] += 1;
+      }
+    }
+    lengthsRead += chunk;
+  }
+  for (size_t i = 0; i < numChildren; ++i) {
+    if (counts[i] != 0 && childrenReader[i] != NULL) {
+      childrenReader[i]->skip(static_cast<uint64_t>(counts[i]));
+    }
+  }
+  return numValues;
+}
+
+// Read numValues union values into rowBatch (which must be a
+// UnionVectorBatch).
+// For each non-null row the tag selects a child and offsets[row] is set to
+// that row's position within the child's batch. Afterwards every existing
+// child reader reads as many values as were assigned to it.
+// Tags come from the file, so each one is validated against numChildren
+// before it is used as an index; an out-of-range tag raises an error
+// instead of touching memory outside childrenCounts.
+void UnionColumnReader::next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+                             char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  UnionVectorBatch& unionBatch = dynamic_cast<UnionVectorBatch&>(rowBatch);
+  uint64_t* offsets = unionBatch.offsets.data();
+  std::fill(childrenCounts.begin(), childrenCounts.end(), 0);
+  int64_t* counts = childrenCounts.data();
+  unsigned char* tags = unionBatch.tags.data();
+  notNull = unionBatch.hasNulls ? unionBatch.notNull.data() : 0;
+  rle->next(reinterpret_cast<char*>(tags), numValues, notNull);
+  // set the offsets for each row; null rows keep whatever offset they had
+  for (size_t i = 0; i < numValues; ++i) {
+    if (notNull && !notNull[i]) {
+      continue;
+    }
+    const size_t tag = tags[i];
+    if (tag >= numChildren) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Union tag %u is out of range for %llu children",
+                static_cast<unsigned int>(tag),
+                static_cast<unsigned long long>(numChildren));  // NOLINT
+    } else {
+      offsets[i] = static_cast<uint64_t>(counts[tag]++);
+    }
+  }
+  // read the right number of each child column
+  for (size_t i = 0; i < numChildren; ++i) {
+    if (childrenReader[i] != nullptr) {
+      childrenReader[i]->next(*(unionBatch.children[i]),
+                              static_cast<uint64_t>(counts[i]), nullptr);
+    }
+  }
+}
+
+// In-place zigzag decode of a 128-bit value: the lowest bit carries the
+// sign, the remaining bits the magnitude. After shifting the sign bit out,
+// a value that was flagged negative becomes -(shifted) - 1.
+void unZigZagInt128(Int128& value) {  // NOLINT
+  const bool signBitSet = (value.getLowBits() & 1) != 0;
+  value >>= 1;
+  if (!signBitSet) {
+    return;
+  }
+  value.negate();
+  value -= 1;
+}
+
+// Out-of-class definitions of the static constants declared in
+// Decimal64ColumnReader. They give the constants storage, which is needed
+// wherever they are bound to a reference (std::min in scaleInt128 takes
+// MAX_PRECISION_64 by const reference).
+const uint32_t Decimal64ColumnReader::MAX_PRECISION_64;
+const uint32_t Decimal64ColumnReader::MAX_PRECISION_128;
+// POWERS_OF_TEN[i] == 10^i for i in [0, MAX_PRECISION_64]; used to rescale
+// decimal values. 10^18 is the last entry of the table.
+const int64_t Decimal64ColumnReader::POWERS_OF_TEN[MAX_PRECISION_64 + 1] = {
+ 1,
+ 10,
+ 100,
+ 1000,
+ 10000,
+ 100000,
+ 1000000,
+ 10000000,
+ 100000000,
+ 1000000000,
+ 10000000000,
+ 100000000000,
+ 1000000000000,
+ 10000000000000,
+ 100000000000000,
+ 1000000000000000,
+ 10000000000000000,
+ 100000000000000000,
+ 1000000000000000000};
+
+// Build a reader for a DECIMAL column.
+// Values come from the DATA stream as raw bytes (decoded by hand through
+// buffer/bufferEnd); the per-value scales come from the SECONDARY stream
+// through an RLE decoder.
+Decimal64ColumnReader::Decimal64ColumnReader(const Type& type,
+                                             StripeStreams& stripe)
+    : ColumnReader(type, stripe) {
+  // Target scale and precision are taken from the column's type.
+  scale = static_cast<int32_t>(type.getScale());
+  precision = static_cast<int32_t>(type.getPrecision());
+
+  // Start with an empty window so the first readBuffer() fetches data.
+  valueStream = stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+  buffer = nullptr;
+  bufferEnd = nullptr;
+
+  const RleVersion rleVersion =
+      convertRleVersion(stripe.getEncoding(columnId).kind());
+  scaleDecoder = createRleDecoder(
+      stripe.getStream(columnId, proto::Stream_Kind_SECONDARY, true), true,
+      rleVersion, memoryPool, LONG);
+}
+
+Decimal64ColumnReader::~Decimal64ColumnReader() {
+  // Nothing to do: valueStream and scaleDecoder are std::unique_ptr members.
+}
+
+// Skip numValues decimals.
+// Each value is a variable-length byte sequence whose final byte has the
+// high bit (0x80) clear, so values are skipped by consuming bytes until the
+// required number of terminating bytes has been seen. The matching scales
+// are skipped afterwards.
+uint64_t Decimal64ColumnReader::skip(uint64_t numValues) {
+  numValues = ColumnReader::skip(numValues);
+  uint64_t remaining = numValues;
+  while (remaining > 0) {
+    readBuffer();
+    const bool lastByteOfValue = (*buffer & 0x80) == 0;
+    ++buffer;
+    if (lastByteOfValue) {
+      --remaining;
+    }
+  }
+  scaleDecoder->skip(numValues);
+  return numValues;
+}
+
+// Read numValues decimals into rowBatch (which must be a
+// Decimal64VectorBatch).
+// For every non-null row the value is decoded and rescaled to the column
+// scale, then split into high and low 64-bit halves (highbitValues and
+// values), and the row's entry in readScales is overwritten with the column
+// scale. Null rows are left untouched.
+void Decimal64ColumnReader::next(ColumnVectorBatch& rowBatch,
+                                 uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  Decimal64VectorBatch& batch = dynamic_cast<Decimal64VectorBatch&>(rowBatch);
+  int64_t* lowHalf = batch.values.data();
+  int64_t* highHalf = batch.highbitValues.data();
+
+  // The scales written with each value are needed to normalise it.
+  int64_t* storedScales = batch.readScales.data();
+  scaleDecoder->next(storedScales, numValues, notNull);
+  batch.precision = precision;
+  batch.scale = scale;
+
+  for (size_t row = 0; row < numValues; ++row) {
+    if (notNull && !notNull[row]) {
+      continue;
+    }
+    readInt64(lowHalf[row], static_cast<int32_t>(storedScales[row]));
+    Int128 widened = Int128(lowHalf[row]);
+    highHalf[row] = widened.getHighBits();
+    lowHalf[row] = widened.getLowBits();
+    storedScales[row] = scale;
+  }
+}
+
+// Rescale value in place from currentScale to scale.
+// Scaling up multiplies by powers of ten, scaling down divides and drops
+// the remainder. The adjustment is applied in steps of at most
+// MAX_PRECISION_64 digits because that is the largest entry of
+// Decimal64ColumnReader::POWERS_OF_TEN.
+void scaleInt128(Int128& value, uint32_t scale,  // NOLINT
+                 uint32_t currentScale) {
+  // Scale up: no step ever overshoots, so this ends with the scales equal.
+  while (currentScale < scale) {
+    const uint32_t step = std::min(Decimal64ColumnReader::MAX_PRECISION_64,
+                                   scale - currentScale);
+    value *= Decimal64ColumnReader::POWERS_OF_TEN[step];
+    currentScale += step;
+  }
+  // Scale down: only entered when the caller asked for a smaller scale.
+  if (currentScale > scale) {
+    Int128 discardedRemainder;
+    do {
+      const uint32_t step = std::min(Decimal64ColumnReader::MAX_PRECISION_64,
+                                     currentScale - scale);
+      value = value.divide(Decimal64ColumnReader::POWERS_OF_TEN[step],
+                           discardedRemainder);
+      currentScale -= step;
+    } while (currentScale > scale);
+  }
+}
+
+// Decimal128ColumnReader reuses all of Decimal64ColumnReader's stream and
+// decoder setup; the constructor adds nothing of its own.
+Decimal128ColumnReader::Decimal128ColumnReader(const Type& type,
+ StripeStreams& stripe)
+ : Decimal64ColumnReader(type, stripe) {
+ // Intentionally empty: the base class constructor does all the work.
+}
+
+Decimal128ColumnReader::~Decimal128ColumnReader() {
+ // Intentionally empty: this class has no cleanup of its own.
+}
+
+// Read numValues decimals into rowBatch (which must be a
+// Decimal128VectorBatch).
+// For every non-null row the 128-bit value is decoded into values[], its
+// halves are mirrored into highbitValues/lowbitValues, and the row's entry
+// in readScales is overwritten with the column scale. Null rows are left
+// untouched.
+void Decimal128ColumnReader::next(ColumnVectorBatch& rowBatch,
+                                  uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+  Int128* decoded = batch.values.data();
+  int64_t* highHalf = batch.highbitValues.data();
+  uint64_t* lowHalf = batch.lowbitValues.data();
+
+  // The scales written with each value are needed to normalise it.
+  int64_t* storedScales = batch.readScales.data();
+  scaleDecoder->next(storedScales, numValues, notNull);
+  batch.precision = precision;
+  batch.scale = scale;
+
+  for (size_t row = 0; row < numValues; ++row) {
+    if (notNull && !notNull[row]) {
+      continue;
+    }
+    readInt128(decoded[row], static_cast<int32_t>(storedScales[row]));
+    highHalf[row] = decoded[row].getHighBits();
+    lowHalf[row] = decoded[row].getLowBits();
+    storedScales[row] = scale;
+  }
+}
+
+// Build a reader for decimals written by Hive 0.11, which carry no declared
+// precision or scale. The base class sets up the streams; the scale taken
+// from the type is then replaced by the forced scale from the reader
+// options, and the overflow policy and warning stream are cached.
+DecimalHive11ColumnReader::DecimalHive11ColumnReader(const Type& type,
+                                                     StripeStreams& stripe)
+    : Decimal64ColumnReader(type, stripe) {
+  // getReaderOptions() returns a const reference; bind to it rather than
+  // copying the whole ReaderOptions object just to read three settings.
+  const ReaderOptions& options = stripe.getReaderOptions();
+  scale = options.getForcedScaleOnHive11Decimal();
+  throwOnOverflow = options.getThrowOnHive11DecimalOverflow();
+  errorStream = options.getErrorStream();
+}
+
+DecimalHive11ColumnReader::~DecimalHive11ColumnReader() {
+  // Nothing to do: errorStream is obtained from the reader options and is
+  // never deleted here.
+}
+
+// Read numValues Hive 0.11 decimals into rowBatch (which must be a
+// Decimal128VectorBatch).
+// A value that readInt128 reports as not representable either raises an
+// error (throwOnOverflow) or is replaced by NULL with a warning written to
+// errorStream.
+void DecimalHive11ColumnReader::next(ColumnVectorBatch& rowBatch,
+                                     uint64_t numValues, char* notNull) {
+  ColumnReader::next(rowBatch, numValues, notNull);
+  notNull = rowBatch.hasNulls ? rowBatch.notNull.data() : 0;
+  Decimal128VectorBatch& batch = dynamic_cast<Decimal128VectorBatch&>(rowBatch);
+  Int128* decoded = batch.values.data();
+
+  // The scales written with each value are needed to normalise it.
+  int64_t* storedScales = batch.readScales.data();
+  scaleDecoder->next(storedScales, numValues, notNull);
+
+  batch.precision = precision;
+  batch.scale = scale;
+
+  for (size_t row = 0; row < numValues; ++row) {
+    if (notNull && !notNull[row]) {
+      continue;
+    }
+    if (readInt128(decoded[row], static_cast<int32_t>(storedScales[row]))) {
+      continue;
+    }
+    // The value did not fit.
+    if (throwOnOverflow) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Hive 0.11 decimal was more than 38 digits.");
+    } else {
+      *errorStream << "Warning: "
+                   << "Hive 0.11 decimal with more than 38 digits "
+                   << "replaced by NULL.\n";
+      if (notNull) {
+        notNull[row] = false;
+      } else {
+        // The batch had no nulls so far: flag that it has some now and
+        // mark this row through the batch's own mask.
+        batch.hasNulls = true;
+        batch.notNull[row] = false;
+      }
+    }
+  }
+}
+
+// Create the ColumnReader that matches the given type for this stripe.
+// String-like types pick a dictionary or direct reader from the column's
+// encoding; DECIMAL picks a reader from the declared precision.
+// An unhandled type or string encoding is reported through LOG_ERROR.
+// Should LOG_ERROR ever return, control leaves the switch and an empty
+// pointer is returned, rather than falling into an unrelated case or
+// flowing off the end of the function.
+std::unique_ptr<ColumnReader> buildReader(const Type& type,
+                                          StripeStreams& stripe) {  // NOLINT
+  switch (static_cast<int64_t>(type.getKind())) {
+    case SHORT:
+      return std::unique_ptr<ColumnReader>(new ShortColumnReader(type, stripe));
+    case INT:
+      return std::unique_ptr<ColumnReader>(new IntColumnReader(type, stripe));
+    case LONG:
+      return std::unique_ptr<ColumnReader>(new LongColumnReader(type, stripe));
+    case DATE:
+      return std::unique_ptr<ColumnReader>(new DateColumnReader(type, stripe));
+    case TIME:
+      return std::unique_ptr<ColumnReader>(new TimeColumnReader(type, stripe));
+    case BINARY:
+    case CHAR:
+    case STRING:
+    case VARCHAR:
+      switch (
+          static_cast<int64_t>(stripe.getEncoding(type.getColumnId()).kind())) {
+        case proto::ColumnEncoding_Kind_DICTIONARY:
+        case proto::ColumnEncoding_Kind_DICTIONARY_V2:
+        case proto::ColumnEncoding_Kind_DICTIONARY_V0:
+          return std::unique_ptr<ColumnReader>(
+              new StringDictionaryColumnReader(type, stripe));
+        case proto::ColumnEncoding_Kind_DIRECT:
+        case proto::ColumnEncoding_Kind_DIRECT_V2:
+        case proto::ColumnEncoding_Kind_DIRECT_V0:
+          return std::unique_ptr<ColumnReader>(
+              new StringDirectColumnReader(type, stripe));
+        default:
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                    "buildReader unhandled string encoding");
+          break;
+      }
+      // Never fall through into the BYTE case with a string column.
+      break;
+
+    case BYTE:
+      return std::unique_ptr<ColumnReader>(new ByteColumnReader(type, stripe));
+
+    case FLOAT:
+    case DOUBLE:
+      return std::unique_ptr<ColumnReader>(
+          new DoubleColumnReader(type, stripe));
+
+    case BOOLEAN:
+      return std::unique_ptr<ColumnReader>(
+          new BooleanColumnReader(type, stripe));
+
+    case LIST:
+      return std::unique_ptr<ColumnReader>(new ListColumnReader(type, stripe));
+
+    case MAP:
+      return std::unique_ptr<ColumnReader>(new MapColumnReader(type, stripe));
+
+    case UNION:
+      return std::unique_ptr<ColumnReader>(new UnionColumnReader(type, stripe));
+
+    case STRUCT:
+      return std::unique_ptr<ColumnReader>(
+          new StructColumnReader(type, stripe));
+
+    case TIMESTAMP:
+      return std::unique_ptr<ColumnReader>(
+          new TimestampColumnReader(type, stripe));
+
+    case DECIMAL:
+      // is this a Hive 0.11 or 0.12 file?
+      if (type.getPrecision() == 0) {
+        return std::unique_ptr<ColumnReader>(
+            new DecimalHive11ColumnReader(type, stripe));
+
+        // can we represent the values using int64_t?
+      } else if (type.getPrecision() <=
+                 Decimal64ColumnReader::MAX_PRECISION_64) {
+        return std::unique_ptr<ColumnReader>(
+            new Decimal64ColumnReader(type, stripe));
+
+        // otherwise we use the Int128 implementation
+      } else {
+        return std::unique_ptr<ColumnReader>(
+            new Decimal128ColumnReader(type, stripe));
+      }
+
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "buildReader unhandled type");
+      break;
+  }
+  // Only reachable if LOG_ERROR returned instead of raising.
+  return std::unique_ptr<ColumnReader>();
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/reader.h b/depends/storage/src/storage/format/orc/reader.h
new file mode 100644
index 0000000..f46750d
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/reader.h
@@ -0,0 +1,1071 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_READER_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_READER_H_
+
+#include <algorithm>
+#include <list>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#include "univplan/common/statistics.h"
+
+#include "storage/common/bloom-filter.h"
+#include "storage/format/orc/byte-rle.h"
+#include "storage/format/orc/orc-proto-definition.h"
+#include "storage/format/orc/reader.h"
+#include "storage/format/orc/rle.h"
+#include "storage/format/orc/seekable-input-stream.h"
+#include "storage/format/orc/timezone.h"
+#include "storage/format/orc/type.h"
+#include "storage/format/orc/vector.h"
+
+#include "storage/format/orc/orc_proto.pb.h"
+
+namespace orc {
+
+// classes that hold data members so we can maintain binary compatibility
+struct ReaderOptionsPrivate;
+
+// Options for creating a Reader.
+// All state lives behind the privateBits pointer (see the
+// ReaderOptionsPrivate forward declaration) so that data members can change
+// without changing this class's layout. The setters that return
+// ReaderOptions& are documented as returning *this so calls can be chained.
+class ReaderOptions {
+ private:
+ std::unique_ptr<ReaderOptionsPrivate> privateBits;
+
+ public:
+ // Construction, copy and assignment. Both a const and a non-const copy
+ // constructor are declared.
+ ReaderOptions();
+ ReaderOptions(const ReaderOptions&);
+ ReaderOptions(ReaderOptions&);
+ ReaderOptions& operator=(const ReaderOptions&);
+ virtual ~ReaderOptions();
+
+ // For files that have structs as the top-level object, select the fields
+ // to read. The first field is 0, the second 1, and so on. By default,
+ // all columns are read. This option clears any previous setting of
+ // the selected columns.
+ // @param include a list of fields to read
+ // @return this
+ ReaderOptions& include(const std::list<uint64_t>& include);
+
+ // For files that have structs as the top-level object, select the fields
+ // to read by name. By default, all columns are read. This option clears
+ // any previous setting of the selected columns.
+ // @param include a list of fields to read
+ // @return this
+ ReaderOptions& include(const std::list<std::string>& include);
+
+ // Selects which type ids to read. The root type is always 0 and the
+ // rest of the types are labeled in a preorder traversal of the tree.
+ // The parent types are automatically selected, but the children are not.
+ //
+ // This option clears any previous setting of the selected columns or
+ // types.
+ // @param types a list of the type ids to read
+ // @return this
+ ReaderOptions& includeTypes(const std::list<uint64_t>& types);
+
+ // Set the section of the file to process.
+ // @param offset the starting byte offset
+ // @param length the number of bytes to read
+ // @return this
+ ReaderOptions& range(uint64_t offset, uint64_t length);
+
+ // For Hive 0.11 (and 0.12) decimals, the precision was unlimited
+ // and thus may overflow the 38 digits that are supported. If one
+ // of the Hive 0.11 decimals is too large, the reader may either convert
+ // the value to NULL or raise an error. That choice is controlled
+ // by this setting.
+ //
+ // Defaults to true.
+ //
+ // @param shouldThrow should the reader raise an error (the column reader
+ // reports it through LOG_ERROR) instead of substituting NULL?
+ // @return returns *this
+ ReaderOptions& throwOnHive11DecimalOverflow(bool shouldThrow);
+
+ // For Hive 0.11 (and 0.12) written decimals, which have unlimited
+ // scale and precision, the reader forces the scale to a consistent
+ // number that is configured. This setting changes the scale that is
+ // forced upon these old decimals. See also throwOnHive11DecimalOverflow.
+ //
+ // Defaults to 6.
+ //
+ // @param forcedScale the scale that will be forced on Hive 0.11 decimals
+ // @return returns *this
+ ReaderOptions& forcedScaleOnHive11Decimal(int32_t forcedScale);
+
+ // Set the location of the tail as defined by the logical length of the
+ // file.
+ ReaderOptions& setTailLocation(uint64_t offset);
+
+ // Set the stream to use for printing warning or error messages.
+ ReaderOptions& setErrorStream(std::ostream& stream); // NOLINT
+
+ // Open the file using a serialized copy of the file tail.
+ //
+ // When one process opens the file and other processes need to read
+ // the rows, we want to enable clients to just read the tail once.
+ // By passing the string returned by Reader.getSerializedFileTail() to
+ // this function, the second reader will not need to read the file tail
+ // from disk.
+ //
+ // @param serialization the bytes of the serialized tail to use
+ ReaderOptions& setSerializedFileTail(const std::string& serialization);
+
+ // Get the memory allocator.
+ dbcommon::MemoryPool* getMemoryPool() const;
+
+ // Were the field ids set?
+ bool getIndexesSet() const;
+
+ // Were the type ids set?
+ bool getTypeIdsSet() const;
+
+ // Get the list of selected field or type ids to read.
+ const std::list<uint64_t>& getInclude() const;
+
+ // Were the include names set?
+ bool getNamesSet() const;
+
+ // Get the list of selected columns to read. All children of the selected
+ // columns are also selected.
+ const std::list<std::string>& getIncludeNames() const;
+
+ // Get the start of the range for the data being processed.
+ // @return if not set, return 0
+ uint64_t getOffset() const;
+
+ // Get the end of the range for the data being processed.
+ // @return if not set, return the maximum long
+ uint64_t getLength() const;
+
+ // Get the desired tail location.
+ // @return if not set, return the maximum long.
+ uint64_t getTailLocation() const;
+
+ // Should the reader raise an error when a Hive 0.11 decimal is
+ // larger than the supported 38 digits of precision? Otherwise, the
+ // data item is replaced by a NULL.
+ bool getThrowOnHive11DecimalOverflow() const;
+
+ // What scale should all Hive 0.11 decimals be normalized to?
+ int32_t getForcedScaleOnHive11Decimal() const;
+
+ // Get the stream to write warnings or errors to.
+ std::ostream* getErrorStream() const;
+
+ // Get the serialized file tail that the user passed in.
+ std::string getSerializedFileTail() const;
+
+ // Set / get the predicate expressions attached to these options.
+ // NOTE(review): presumably used for predicate push-down (filtering with
+ // statistics); ownership of the pointed-to list is not visible here.
+ void setPredicateExprs(const univplan::UnivPlanExprPolyList* predicateExprs);
+ const univplan::UnivPlanExprPolyList* getPredicateExprs() const;
+
+ // Set / get the tuple descriptor attached to these options.
+ // NOTE(review): ownership of the pointed-to descriptor is not visible
+ // here; confirm it outlives the options object.
+ void setTupleDesc(const dbcommon::TupleDesc* td);
+ const dbcommon::TupleDesc* getTupleDesc() const;
+
+ // Set / query the "read statistics only" flag.
+ void setReadStatsOnlyFlag(bool readStatsOnly);
+ bool readStatsOnly() const;
+};
+
+// Abstract view of one stripe as seen by the column readers: it hands out
+// the reader options, the column selection, per-column encodings and the
+// input streams that the readers decode.
+class StripeStreams {
+ public:
+ virtual ~StripeStreams();
+
+ // Get the reader options.
+ // @return Reader options
+ virtual const ReaderOptions& getReaderOptions() const = 0;
+
+ // Get the flags for which columns are selected.
+ // @return a vector (returned by value) that contains true at the index
+ // of each selected columnId.
+ virtual const std::vector<bool> getSelectedColumns() const = 0;
+
+ // Get the encoding for the given column for this stripe.
+ // @param columnId The column id
+ // @return The column encoding
+ virtual proto::ColumnEncoding getEncoding(uint64_t columnId) const = 0;
+
+ // Get the stream for the given column/kind in this stripe.
+ // @param columnId The id of the column
+ // @param kind The kind of the stream
+ // @param shouldStream Should the reading page the stream in
+ // @return The new stream
+ virtual std::unique_ptr<SeekableInputStream> getStream(
+ uint64_t columnId, proto::Stream_Kind kind, bool shouldStream) const = 0;
+
+ // Same parameters as getStream.
+ // NOTE(review): presumably returns the stream that holds the column's
+ // bloom filter; confirm against the implementation.
+ virtual std::unique_ptr<SeekableInputStream> getStreamForBloomFilter(
+ uint64_t columnId, proto::Stream_Kind kind, bool shouldStream) const = 0;
+
+ // Get the memory pool for this reader.
+ // @return The memory pool
+ virtual dbcommon::MemoryPool& getMemoryPool() const = 0;
+
+ // Get the writer's timezone, so that we can convert their dates correctly.
+ // @return The timezone
+ virtual const Timezone& getWriterTimezone() const = 0;
+};
+
+// The interface for reading ORC data types.
+// Every concrete reader derives from this class, calls the base skip()/
+// next() first and then decodes its own streams.
+class ColumnReader {
+ protected:
+ // Decoder used for the column's null information.
+ // Original author's note: the object is actually a BooleanRleDecoderImpl,
+ // so mind the cost of the virtual function calls made through this
+ // ByteRleDecoder pointer.
+ std::unique_ptr<ByteRleDecoder> notNullDecoder; // it is exact a
+ // BooleanRleDecoderImpl, pay
+ // attention to the virtual
+ // function call
+ // Id of the column this reader decodes.
+ uint64_t columnId;
+ // Memory pool handed to the decoders created by the subclasses.
+ dbcommon::MemoryPool& memoryPool;
+
+ public:
+ ColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+
+ virtual ~ColumnReader();
+
+ // Skip number of specified rows.
+ // @param numValues the number of values to skip
+ // @return the number of non-null values skipped
+ virtual uint64_t skip(uint64_t numValues);
+
+ // Read the next group of values into this rowBatch.
+ // @param rowBatch the memory to read into.
+ // @param numValues the number of values to read
+ // @param notNull if null, all values are not null. Otherwise, it is
+ // a mask (with at least numValues bytes) for which values to
+ // set.
+ virtual void next(ColumnVectorBatch& rowBatch, uint64_t numValues, // NOLINT
+ char* notNull);
+};
+
+// Reader for BOOLEAN columns (selected by buildReader for the BOOLEAN
+// kind). Values are decoded through a ByteRleDecoder.
+class BooleanColumnReader : public ColumnReader {
+ private:
+ // Decoder for the column's value stream.
+ std::unique_ptr<orc::ByteRleDecoder> rle;
+
+ public:
+ BooleanColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~BooleanColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for BYTE columns (selected by buildReader for the BYTE kind).
+// Values are decoded through a ByteRleDecoder.
+class ByteColumnReader : public ColumnReader {
+ private:
+ // Decoder for the column's value stream.
+ std::unique_ptr<orc::ByteRleDecoder> rle;
+
+ public:
+ ByteColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~ByteColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Map a column encoding kind to the RLE version used to build its integer
+// decoders. Defined outside this header.
+extern RleVersion convertRleVersion(proto::ColumnEncoding_Kind kind);
+
+// Common base of the fixed-width integer readers, parameterised on the
+// C++ integer type of the values.
+template <class IntType>
+class IntegerColumnReader : public ColumnReader {
+ protected:
+ // Decoder for the column's DATA stream. The concrete subclasses in this
+ // header (LongColumnReader, IntColumnReader, ShortColumnReader) assign it
+ // in their own constructors.
+ std::unique_ptr<orc::RleDecoder> rle;
+
+ public:
+ // Declared here and defined outside the class body.
+ IntegerColumnReader(const Type& type, StripeStreams& stripe); // NOLINT
+
+ ~IntegerColumnReader() {}
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for LONG columns: an IntegerColumnReader over int64_t whose DATA
+// stream decoder is created for the LONG kind.
+class LongColumnReader : public IntegerColumnReader<int64_t> {
+ public:
+  LongColumnReader(const Type& type, StripeStreams& stripe)  // NOLINT
+      : IntegerColumnReader<int64_t>(type, stripe) {
+    // The RLE version follows from the column encoding of this stripe.
+    const RleVersion rleVersion =
+        convertRleVersion(stripe.getEncoding(columnId).kind());
+    std::unique_ptr<SeekableInputStream> dataStream =
+        stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+    rle = createRleDecoder(std::move(dataStream), true, rleVersion, memoryPool,
+                           LONG);
+  }
+  ~LongColumnReader() {}
+};
+
+// Reader for INT columns: an IntegerColumnReader over int32_t whose DATA
+// stream decoder is created for the INT kind.
+class IntColumnReader : public IntegerColumnReader<int32_t> {
+ public:
+  IntColumnReader(const Type& type, StripeStreams& stripe)  // NOLINT
+      : IntegerColumnReader<int32_t>(type, stripe) {
+    // The RLE version follows from the column encoding of this stripe.
+    const RleVersion rleVersion =
+        convertRleVersion(stripe.getEncoding(columnId).kind());
+    std::unique_ptr<SeekableInputStream> dataStream =
+        stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+    rle = createRleDecoder(std::move(dataStream), true, rleVersion, memoryPool,
+                           INT);
+  }
+  ~IntColumnReader() {}
+};
+
+// Reader for SHORT columns: an IntegerColumnReader over int16_t whose DATA
+// stream decoder is created for the SHORT kind.
+class ShortColumnReader : public IntegerColumnReader<int16_t> {
+ public:
+  ShortColumnReader(const Type& type, StripeStreams& stripe)  // NOLINT
+      : IntegerColumnReader<int16_t>(type, stripe) {
+    // The RLE version follows from the column encoding of this stripe.
+    const RleVersion rleVersion =
+        convertRleVersion(stripe.getEncoding(columnId).kind());
+    std::unique_ptr<SeekableInputStream> dataStream =
+        stripe.getStream(columnId, proto::Stream_Kind_DATA, true);
+    rle = createRleDecoder(std::move(dataStream), true, rleVersion, memoryPool,
+                           SHORT);
+  }
+  ~ShortColumnReader() {}
+};
+
+// Reader for DATE columns (selected by buildReader for the DATE kind).
+// Values are decoded through an RleDecoder.
+class DateColumnReader : public ColumnReader {
+ private:
+ // Decoder for the column's value stream.
+ std::unique_ptr<orc::RleDecoder> rle;
+
+ public:
+ DateColumnReader(const Type& type, StripeStreams& stripe); // NOLINT
+
+ ~DateColumnReader() {}
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for TIME columns (selected by buildReader for the TIME kind).
+// Values are decoded through an RleDecoder.
+class TimeColumnReader : public ColumnReader {
+ private:
+ // Decoder for the column's value stream.
+ std::unique_ptr<orc::RleDecoder> rle;
+
+ public:
+ TimeColumnReader(const Type& type, StripeStreams& stripe); // NOLINT
+
+ ~TimeColumnReader() {}
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for TIMESTAMP columns (selected by buildReader for the TIMESTAMP
+// kind). A timestamp is decoded from two RLE streams, one for the seconds
+// part and one for the nanoseconds part.
+class TimestampColumnReader : public ColumnReader {
+ private:
+ std::unique_ptr<orc::RleDecoder> secondsRle;
+ std::unique_ptr<orc::RleDecoder> nanoRle;
+ // Timezone of the file's writer (see StripeStreams::getWriterTimezone).
+ const Timezone& writerTimezone;
+ // NOTE(review): presumably the writer-timezone epoch that the stored
+ // seconds are relative to; confirm in the constructor.
+ const int64_t epochOffset;
+
+ public:
+ TimestampColumnReader(const Type& type, StripeStreams& stripe); // NOLINT
+ ~TimestampColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for FLOAT and DOUBLE columns (buildReader uses it for both kinds).
+// Values are copied as raw bytes from the input stream.
+class DoubleColumnReader : public ColumnReader {
+ public:
+  DoubleColumnReader(const Type& type, StripeStreams& stripe);  // NOLINT
+  ~DoubleColumnReader();
+
+  // See ColumnReader::skip.
+  uint64_t skip(uint64_t numValues) override;
+
+  // See ColumnReader::next.
+  void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+            char* notNull) override;  // NOLINT
+
+ private:
+  std::unique_ptr<SeekableInputStream> inputStream;
+  ORCTypeKind columnKind;
+  const uint64_t bytesPerValue;
+  const char* bufferPointer;
+  const char* bufferEnd;
+  // [bufferS, bufferE) is the unread part of the current stream chunk used
+  // by nextBuffer() and readData().
+  const char* bufferS = nullptr;
+  const char* bufferE = nullptr;
+
+  // Fetch the next chunk from inputStream and point bufferS/bufferE at it.
+  // A failed read is reported through LOG_ERROR.
+  void nextBuffer() {
+    const void* chunk = nullptr;
+    int chunkLength = 0;
+    if (!inputStream->Next(&chunk, &chunkLength)) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "bad read in nextBuffer");
+    }
+    bufferS = static_cast<const char*>(chunk);
+    bufferE = bufferS + chunkLength;
+  }
+
+  // Copy numValues raw values into data, refilling from the stream whenever
+  // the current chunk is exhausted. The value width is sizeof(float) for
+  // FLOAT columns and sizeof(double) otherwise.
+  void readData(char* data, uint64_t numValues) {
+    const uint64_t valueWidth =
+        (columnKind == FLOAT) ? sizeof(float) : sizeof(double);
+    uint64_t remaining = numValues * valueWidth;
+    while (remaining > 0) {
+      if (bufferS == bufferE) {
+        nextBuffer();
+      }
+      const uint64_t available = static_cast<uint64_t>(bufferE - bufferS);
+      const uint64_t copyBytes = std::min(remaining, available);
+      memcpy(data, bufferS, copyBytes);
+      bufferS += copyBytes;
+      data += copyBytes;
+      remaining -= copyBytes;
+    }
+  }
+};
+
+// Reader for BINARY/CHAR/STRING/VARCHAR columns stored with one of the
+// DICTIONARY encodings (see buildReader).
+class StringDictionaryColumnReader : public ColumnReader {
+ private:
+ // Dictionary storage.
+ // NOTE(review): presumably dictionaryBlob holds the concatenated entry
+ // bytes and dictionaryOffset the start of each entry within it; confirm
+ // in the constructor.
+ DataBuffer<char> dictionaryBlob;
+ DataBuffer<int64_t> dictionaryOffset;
+ // Decoder for the per-row integer stream.
+ std::unique_ptr<RleDecoder> rle;
+ // Number of dictionary entries.
+ uint64_t dictionaryCount;
+
+ public:
+ StringDictionaryColumnReader(const Type& type,
+ StripeStreams& stipe); // NOLINT
+ ~StringDictionaryColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for BINARY/CHAR/STRING/VARCHAR columns stored with one of the
+// DIRECT encodings (see buildReader): value bytes come from blobStream and
+// their lengths from lengthRle.
+class StringDirectColumnReader : public ColumnReader {
+ private:
+ // Scratch buffer for value bytes.
+ DataBuffer<char> blobBuffer;
+ // Decoder for the value lengths.
+ std::unique_ptr<RleDecoder> lengthRle;
+ // Stream holding the raw value bytes.
+ std::unique_ptr<SeekableInputStream> blobStream;
+ // NOTE(review): presumably the unconsumed tail of the last chunk read
+ // from blobStream and its size; confirm in the implementation.
+ const char* lastBuffer;
+ size_t lastBufferLength;
+
+ // Compute the total length of the values.
+ // @param lengths the array of lengths
+ // @param notNull the array of notNull flags
+ // @param numValues the lengths of the arrays
+ // @return the total number of bytes for the non-null values
+ size_t computeSize(const int64_t* lengths, const char* notNull,
+ uint64_t numValues);
+
+ public:
+ StringDirectColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~StringDirectColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for STRUCT columns (selected by buildReader for the STRUCT kind).
+class StructColumnReader : public ColumnReader {
+ private:
+ // Readers for the struct's fields, held as raw pointers.
+ // NOTE(review): presumably owned and deleted by the destructor; confirm.
+ std::vector<ColumnReader*> children;
+
+ public:
+ StructColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~StructColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for LIST columns (selected by buildReader for the LIST kind).
+class ListColumnReader : public ColumnReader {
+ private:
+ // Reader for the list elements; next() tests it for null before use.
+ std::unique_ptr<ColumnReader> child;
+ // Decoder for the per-row list lengths, which next() converts into
+ // offsets.
+ std::unique_ptr<RleDecoder> rle;
+
+ public:
+ ListColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~ListColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for MAP columns (selected by buildReader for the MAP kind).
+class MapColumnReader : public ColumnReader {
+ private:
+ // Readers for the key (subtype 0) and value (subtype 1) columns. Each is
+ // left null when its column is not selected.
+ std::unique_ptr<ColumnReader> keyReader;
+ std::unique_ptr<ColumnReader> elementReader;
+ // Decoder for the LENGTH stream (number of entries per map).
+ std::unique_ptr<RleDecoder> rle;
+
+ public:
+ MapColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~MapColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for UNION columns (selected by buildReader for the UNION kind).
+class UnionColumnReader : public ColumnReader {
+ private:
+ // Decoder for the tag stream; a tag selects the child holding the value.
+ std::unique_ptr<ByteRleDecoder> rle;
+ // One entry per subtype; null for subtypes that are not selected. The
+ // readers are owned here and deleted by the destructor.
+ std::vector<ColumnReader*> childrenReader;
+ // Scratch space: number of values routed to each child by the current
+ // skip()/next() call.
+ std::vector<int64_t> childrenCounts;
+ // Number of subtypes of the union type.
+ uint64_t numChildren;
+
+ public:
+ UnionColumnReader(const Type& type, StripeStreams& stipe); // NOLINT
+ ~UnionColumnReader();
+
+ // See ColumnReader::skip.
+ uint64_t skip(uint64_t numValues) override;
+
+ // See ColumnReader::next.
+ void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+ char* notNull) override; // NOLINT
+};
+
+// Reader for DECIMAL columns whose precision fits in 64 bits (buildReader
+// uses it when the precision is at most MAX_PRECISION_64). It is also the
+// base class of the 128-bit and Hive 0.11 decimal readers.
+// Values are variable-length, zigzag-encoded integers read byte by byte
+// from valueStream; the scale each value was written with comes from
+// scaleDecoder.
+class Decimal64ColumnReader : public ColumnReader {
+ public:
+  // Largest decimal precision representable in 64 / 128 bits.
+  static const uint32_t MAX_PRECISION_64 = 18;
+  static const uint32_t MAX_PRECISION_128 = 38;
+  // POWERS_OF_TEN[i] == 10^i for i in [0, MAX_PRECISION_64].
+  static const int64_t POWERS_OF_TEN[MAX_PRECISION_64 + 1];
+
+ protected:
+  std::unique_ptr<SeekableInputStream> valueStream;
+  int32_t precision;
+  // Scale every decoded value is normalised to.
+  int32_t scale;
+  // [buffer, bufferEnd) is the unread part of the current chunk of
+  // valueStream.
+  const char* buffer;
+  const char* bufferEnd;
+
+  std::unique_ptr<RleDecoder> scaleDecoder;
+
+  // Read the valueStream for more bytes.
+  // Loops until the window is non-empty; reports an error through
+  // LOG_ERROR when the stream has no more data.
+  void readBuffer() {
+    while (buffer == bufferEnd) {
+      int length;
+      if (!valueStream->Next(reinterpret_cast<const void**>(&buffer),
+                             &length)) {
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                  "Read past end of stream in Decimal64ColumnReader %s",
+                  valueStream->getName().c_str());
+      }
+      bufferEnd = buffer + length;
+    }
+  }
+
+  // Decode one value and rescale it from currentScale to the column scale.
+  // @param value receives the decoded, rescaled value
+  // @param currentScale the scale the value was written with
+  // Both inputs come from the file, so two conditions are treated as
+  // errors instead of being trusted:
+  //  - a varint with more payload bits than fit in 64 bits (shifting by 64
+  //    or more would be undefined behaviour);
+  //  - a scale difference larger than MAX_PRECISION_64 (it would index
+  //    past the end of POWERS_OF_TEN).
+  void readInt64(int64_t& value, int32_t currentScale) {  // NOLINT
+    uint64_t encoded = 0;
+    size_t offset = 0;
+    while (true) {
+      readBuffer();
+      unsigned char ch = static_cast<unsigned char>(*(buffer++));
+      if (offset < 64) {
+        encoded |= static_cast<uint64_t>(ch & 0x7f) << offset;
+      } else {
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                  "Decimal value is too long for 64 bits in "
+                  "Decimal64ColumnReader %s",
+                  valueStream->getName().c_str());
+      }
+      offset += 7;
+      if (!(ch & 0x80)) {
+        break;
+      }
+    }
+    value = unZigZag(encoded);
+
+    // Widen before subtracting so extreme scales cannot overflow int32_t.
+    const int64_t scaleDelta =
+        static_cast<int64_t>(scale) - static_cast<int64_t>(currentScale);
+    if (scaleDelta == 0) {
+      return;
+    }
+    const int64_t magnitude = scaleDelta > 0 ? scaleDelta : -scaleDelta;
+    if (magnitude > static_cast<int64_t>(MAX_PRECISION_64)) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Decimal scale out of range in Decimal64ColumnReader: "
+                "column scale %d, value scale %d",
+                scale, currentScale);
+    } else if (scaleDelta > 0) {
+      value *= POWERS_OF_TEN[magnitude];
+    } else {
+      value /= POWERS_OF_TEN[magnitude];
+    }
+  }
+
+ public:
+  Decimal64ColumnReader(const Type& type, StripeStreams& stipe);  // NOLINT
+  ~Decimal64ColumnReader();
+
+  // See ColumnReader::skip.
+  uint64_t skip(uint64_t numValues) override;
+
+  // See ColumnReader::next.
+  void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+            char* notNull) override;  // NOLINT
+};
+
+extern void unZigZagInt128(Int128& value); // NOLINT
+extern void scaleInt128(Int128& value, uint32_t scale, // NOLINT
+ uint32_t currentScale);
+
+// Decimal64ColumnReader variant that decodes its values into an Int128.
+class Decimal128ColumnReader : public Decimal64ColumnReader {
+ public:
+  Decimal128ColumnReader(const Type& type, StripeStreams& stipe);  // NOLINT
+  ~Decimal128ColumnReader();
+
+  void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+            char* notNull) override;  // NOLINT
+
+ private:
+  // Decode one zig-zag varint into value, rescaled from currentScale to scale.
+  void readInt128(Int128& value, int32_t currentScale) {  // NOLINT
+    value = 0;
+    Int128 chunk;
+    uint32_t shift = 0;
+    unsigned char byte;
+    do {
+      readBuffer();
+      byte = static_cast<unsigned char>(*(buffer++));
+      chunk = byte & 0x7f;
+      chunk <<= shift;
+      value |= chunk;
+      shift += 7;
+    } while (byte & 0x80);
+    unZigZagInt128(value);
+    scaleInt128(value, static_cast<uint32_t>(scale),
+                static_cast<uint32_t>(currentScale));
+  }
+};
+
+// Decimal64ColumnReader variant whose Int128 decoding reports overflow.
+class DecimalHive11ColumnReader : public Decimal64ColumnReader {
+ private:
+  bool throwOnOverflow;
+  std::ostream* errorStream;
+
+  // Decode one varint into value and rescale it from currentScale to scale.
+  // Returns false when the number is too wide or outside MIN/MAX_VALUE.
+  bool readInt128(Int128& value, int32_t currentScale) {  // NOLINT
+    // Bounds: -/+ 99999999999999999999999999999999999999
+    static const Int128 MIN_VALUE(-0x4b3b4ca85a86c47b, 0xf675ddc000000001);
+    static const Int128 MAX_VALUE(0x4b3b4ca85a86c47a, 0x098a223fffffffff);
+
+    value = 0;
+    Int128 chunk;
+    uint32_t shift = 0;
+    bool fits = true;
+    unsigned char byte;
+    do {
+      readBuffer();
+      byte = static_cast<unsigned char>(*(buffer++));
+      chunk = byte & 0x7f;
+      // Once the payload exceeds 128 bits remember the overflow, but keep
+      // consuming bytes so the stream position stays consistent.
+      if (shift > 128 || (shift == 126 && chunk > 3)) {
+        fits = false;
+      }
+      chunk <<= shift;
+      value |= chunk;
+      shift += 7;
+    } while (byte & 0x80);
+
+    if (!fits) {
+      return false;
+    }
+    unZigZagInt128(value);
+    scaleInt128(value, static_cast<uint32_t>(scale),
+                static_cast<uint32_t>(currentScale));
+    return value >= MIN_VALUE && value <= MAX_VALUE;
+  }
+
+ public:
+  DecimalHive11ColumnReader(const Type& type, StripeStreams& stipe);  // NOLINT
+  ~DecimalHive11ColumnReader();
+
+  void next(ColumnVectorBatch& rowBatch, uint64_t numValues,
+            char* notNull) override;  // NOLINT
+};
+
+// The interface for reading ORC files.
+// This is an abstract class that will be subclassed as necessary.
+class Reader {
+ public:
+ virtual ~Reader();
+
+ // Get the format version of the file. Currently known values are:
+ // "0.11" and "0.12"
+ // @return the version string
+ virtual std::string getFormatVersion() const = 0;
+
+ // Get the number of rows in the file.
+ // @return the number of rows
+ virtual uint64_t getNumberOfRows() const = 0;
+
+ // Get the user metadata keys.
+ // @return the set of metadata keys
+ virtual std::list<std::string> getMetadataKeys() const = 0;
+
+ // Get a user metadata value.
+ // @param key a key given by the user
+ // @return the bytes associated with the given key
+ virtual std::string getMetadataValue(const std::string& key) const = 0;
+
+ // Did the user set the given metadata value.
+ // @param key the key to check
+ // @return true if the metadata value was set
+ virtual bool hasMetadataValue(const std::string& key) const = 0;
+
+ // Get the compression kind.
+ // @return the kind of compression in the file
+ virtual CompressionKind getCompression() const = 0;
+
+ // Get the buffer size for the compression.
+ // @return number of bytes to buffer for the compression codec.
+ virtual uint64_t getCompressionSize() const = 0;
+
+ // Get the version of the writer.
+ // @return the version of the writer.
+ virtual WriterVersion getWriterVersion() const = 0;
+
+ // Get the number of rows per entry in the row index.
+ // @return the number of rows per entry in the row index or 0 if there
+ // is no row index.
+ virtual uint64_t getRowIndexStride() const = 0;
+
+ // Get the number of stripes in the file.
+ // @return the number of stripes
+ virtual uint64_t getNumberOfStripes() const = 0;
+
+ // Get the information about a stripe.
+ // @param stripeIndex the stripe 0 to N-1 to get information about
+ // @return the information about that stripe
+ virtual std::unique_ptr<StripeInformation> getStripe(
+ uint64_t stripeIndex) const = 0;
+
+ // Get the number of stripe statistics in the file.
+ // @return the number of stripe statistics
+ virtual uint64_t getNumberOfStripeStatistics() const = 0;
+
+ // Get the statistics about a stripe.
+ // @param stripeIndex the stripe 0 to N-1 to get statistics about
+ // @return the statistics about that stripe
+ virtual std::unique_ptr<univplan::Statistics> getStripeStatistics(
+ uint64_t stripeIndex) const = 0;
+
+ // Get the length of the data stripes in the file.
+ // @return the number of bytes in stripes
+ virtual uint64_t getContentLength() const = 0;
+
+ // Get the length of the file stripe statistics
+ // @return the number of compressed bytes in the file stripe statistics
+ virtual uint64_t getStripeStatisticsLength() const = 0;
+
+ // Get the length of the file footer
+ // @return the number of compressed bytes in the file footer
+ virtual uint64_t getFileFooterLength() const = 0;
+
+ // Get the length of the file postscript
+ // @return the number of bytes in the file postscript
+ virtual uint64_t getFilePostscriptLength() const = 0;
+
+ // Get the total length of the file.
+ // @return the number of bytes in the file
+ virtual uint64_t getFileLength() const = 0;
+
+ // Get the statistics about the columns in the file.
+ // @return the information about the columns
+ virtual std::unique_ptr<univplan::Statistics> getStatistics() const = 0;
+
+ // Get the statistics about a single column in the file.
+ // @return the information about the column identified by columnId
+ virtual std::unique_ptr<univplan::ColumnStatistics> getColumnStatistics(
+ uint32_t columnId) const = 0;
+
+ // Get the type of the rows in the file. The top level is typically a
+ // struct.
+ // @return the root type
+ virtual const Type& getType() const = 0;
+
+ // Get the selected type of the rows in the file. The file's row type
+ // is projected down to just the selected columns. Thus, if the file's
+ // type is struct<col0:int,col1:double,col2:string> and the selected
+ // columns are "col0,col2" the selected type would be
+ // struct<col0:int,col2:string>.
+ // @return the root of the selected (projected) type
+ virtual const Type& getSelectedType() const = 0;
+
+ // Get the selected columns of the file.
+ virtual const std::vector<bool> getSelectedColumns() const = 0;
+
+ // Create a row batch for reading the selected columns of this file.
+ // @param size the number of rows to read
+ // @return a new ColumnVectorBatch to read into
+ virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(
+ uint64_t size) const = 0;
+
+ // Read the next row batch from the current position.
+ // Caller must look at numElements in the row batch to determine how
+ // many rows were read.
+ // @param data the row batch to read into.
+ // @return true if a non-zero number of rows were read or false if the
+ // end of the file was reached.
+ virtual bool next(ColumnVectorBatch& data) = 0; // NOLINT
+
+ // Get the row number of the first row in the previously read batch.
+ // @return the row number of the previous batch.
+ virtual uint64_t getRowNumber() const = 0;
+
+ // Seek to a given row.
+ // @param rowNumber the next row the reader should return
+ virtual void seekToRow(uint64_t rowNumber) = 0;
+
+ // Get the name of the input stream.
+ virtual const std::string& getStreamName() const = 0;
+
+ // Check whether the file has correct column statistics.
+ virtual bool hasCorrectStatistics() const = 0;
+
+ // Get the serialized file tail.
+ // Useful if another reader of the same file wants to avoid re-reading
+ // the file tail. See ReaderOptions.setSerializedFileTail().
+ // @return a string of bytes with the file tail
+ virtual std::string getSerializedFileTail() const = 0;
+
+ // Estimate an upper bound on heap memory allocation by the Reader
+ // based on the information in the file footer.
+ // The bound is less tight if only few columns are read or compression is
+ // used.
+ // @param stripeIx index of the stripe to be read (if not specified,
+ // all stripes are considered).
+ // @return upper bound on memory use
+ virtual uint64_t getMemoryUse(int stripeIx = -1) = 0;
+ // Output parameters. NOTE(review): presumably stripe counts - confirm.
+ virtual void collectPredicateStats(uint32_t* scanned, uint32_t* skipped) = 0;
+ // NOTE(review): presumably hands the InputStream to the caller - confirm.
+ virtual std::unique_ptr<orc::InputStream> ownInputStream() = 0;
+};
+
+// Create a reader for the ORC file.
+// @param stream the stream to read
+// @param options the options for reading the file
+std::unique_ptr<Reader> createReader(std::unique_ptr<InputStream> stream,
+ const ReaderOptions& options);
+
+class StripeStreamsImpl;
+class ReaderImpl : public Reader { // concrete Reader bound to one InputStream
+ private:
+ const Timezone& localTimezone;
+
+ // inputs
+ std::unique_ptr<InputStream> stream;
+ ReaderOptions options;
+ const uint64_t fileLength;
+ const uint64_t postscriptLength;
+ std::vector<bool> selectedColumns;
+
+ // custom memory pool
+ dbcommon::MemoryPool& memoryPool;
+
+ // postscript
+ std::unique_ptr<proto::PostScript> postscript;
+ const uint64_t blockSize;
+ const CompressionKind compression;
+
+ // footer
+ std::unique_ptr<proto::Footer> footer;
+ DataBuffer<uint64_t> firstRowOfStripe;
+ uint64_t numberOfStripes;
+ std::unique_ptr<Type> schema;
+ mutable std::unique_ptr<Type> selectedSchema;
+
+ // metadata
+ mutable std::unique_ptr<proto::Metadata> metadata;
+ mutable bool isMetadataLoaded;
+
+ // reading state
+ uint64_t previousRow;
+ uint64_t firstStripe;
+ uint64_t currentStripe;
+ uint64_t lastStripe; // the stripe AFTER the last one
+ uint64_t currentRowInStripe;
+ uint64_t rowsInCurrentStripe;
+ proto::StripeInformation currentStripeInfo;
+ std::unique_ptr<StripeStreamsImpl> currentStripeStream = nullptr;
+ std::vector<proto::StripeFooter> stripeFooters;
+ std::unique_ptr<ColumnReader> curReader;
+ std::map<std::string, uint64_t> nameIdMap;
+ std::map<uint64_t, const Type*> idTypeMap;
+
+ // count for filter push down
+ uint32_t skippedStripe = 0;
+ uint32_t scannedStripe = 0;
+
+ // for read stats only
+ std::unique_ptr<univplan::Statistics> currentStripeStats;
+
+ // internal methods
+ proto::StripeFooter getStripeFooter(
+ const proto::StripeInformation& info) const;
+ void startNextStripe();
+ void checkOrcVersion();
+ void readMetadata() const;
+ bool notIncludeType(ColumnVectorBatch* data, orc::ORCTypeKind typekind);
+
+ // build map from type name and id, id to Type
+ void buildTypeNameIdMap(const Type* type,
+ std::vector<std::string>& columns); // NOLINT
+ std::string toDotColumnPath(const std::vector<std::string>& columns);
+
+ // Select the columns from the options object
+ void updateSelected();
+
+ // Select a field by name
+ void updateSelectedByName(const std::string& name);
+ // Select a field by id
+ void updateSelectedByFieldId(uint64_t fieldId);
+ // Select a type by id
+ void updateSelectedByTypeId(uint64_t typeId);
+
+ // Select all of the recursive children of the given type.
+ void selectChildren(const Type& type);
+
+ // For each child of type, select it if one of its children
+ // is selected.
+ bool selectParents(const Type& type);
+
+ public:
+ // Constructor that lets the user specify additional options.
+ // @param stream the stream to read from
+ // @param options options for reading
+ // @param postscript the postscript for the file
+ // @param footer the footer for the file
+ // @param fileLength the length of the file in bytes
+ // @param postscriptLength the length of the postscript in bytes
+ ReaderImpl(std::unique_ptr<InputStream> stream, const ReaderOptions& options,
+ std::unique_ptr<proto::PostScript> postscript,
+ std::unique_ptr<proto::Footer> footer, uint64_t fileLength,
+ uint64_t postscriptLength);
+
+ const ReaderOptions& getReaderOptions() const;
+
+ CompressionKind getCompression() const override;
+
+ std::string getFormatVersion() const override;
+
+ WriterVersion getWriterVersion() const override;
+
+ uint64_t getNumberOfRows() const override;
+
+ uint64_t getRowIndexStride() const override;
+
+ const std::string& getStreamName() const override;
+
+ std::list<std::string> getMetadataKeys() const override;
+
+ std::string getMetadataValue(const std::string& key) const override;
+
+ bool hasMetadataValue(const std::string& key) const override;
+
+ uint64_t getCompressionSize() const override;
+
+ uint64_t getNumberOfStripes() const override;
+
+ std::unique_ptr<StripeInformation> getStripe(uint64_t) const override;
+
+ uint64_t getNumberOfStripeStatistics() const override;
+
+ std::unique_ptr<univplan::Statistics> getStripeStatistics(
+ uint64_t stripeIndex) const override;
+
+ uint64_t getContentLength() const override;
+ uint64_t getStripeStatisticsLength() const override;
+ uint64_t getFileFooterLength() const override;
+ uint64_t getFilePostscriptLength() const override;
+ uint64_t getFileLength() const override;
+
+ std::unique_ptr<univplan::Statistics> getStatistics() const override;
+
+ std::unique_ptr<univplan::ColumnStatistics> getColumnStatistics(
+ uint32_t columnId) const override;
+
+ const Type& getType() const override;
+
+ const Type& getSelectedType() const override;
+
+ const std::vector<bool> getSelectedColumns() const override;
+
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(
+ uint64_t size) const override;
+
+ bool next(ColumnVectorBatch& data) override;
+
+ uint64_t getRowNumber() const override;
+
+ void seekToRow(uint64_t rowNumber) override;
+
+ bool hasCorrectStatistics() const override;
+
+ std::string getSerializedFileTail() const override;
+
+ uint64_t getMemoryUse(int stripeIx = -1) override;
+
+ void collectPredicateStats(uint32_t* scanned, uint32_t* skipped) override;
+
+ std::unique_ptr<orc::InputStream> ownInputStream() override;
+ // Returns a BloomFilterIndex for column colId; not declared on Reader.
+ proto::BloomFilterIndex rebuildBloomFilter(uint32_t colId);
+ // NOTE(review): presumably fills data from statistics only - confirm.
+ bool doReadStatsOnly(ColumnVectorBatch* data);
+};
+
+// Create a column reader for the given stripe.
+// @param type The type the reader is built for
+// @param stripe The stripe streams to read from
+std::unique_ptr<ColumnReader> buildReader(const Type& type,
+ StripeStreams& stripe); // NOLINT
+
+class StripeStreamsImpl : public StripeStreams { // StripeStreams implementation
+ private:
+ const ReaderImpl& reader;
+ const proto::StripeFooter& footer;
+ const uint64_t stripeStart;
+ InputStream& input;
+ dbcommon::MemoryPool& memoryPool;
+ const Timezone& writerTimezone;
+ // All members but stripeStart are references; referents must outlive this.
+ public:
+ StripeStreamsImpl(const ReaderImpl& reader, const proto::StripeFooter& footer,
+ uint64_t stripeStart,
+ InputStream& input, // NOLINT
+ dbcommon::MemoryPool& memoryPool, // NOLINT
+ const Timezone& writerTimezone);
+
+ virtual ~StripeStreamsImpl();
+
+ const ReaderOptions& getReaderOptions() const override;
+
+ const std::vector<bool> getSelectedColumns() const override;
+
+ proto::ColumnEncoding getEncoding(uint64_t columnId) const override;
+
+ std::unique_ptr<SeekableInputStream> getStream(
+ uint64_t columnId, proto::Stream_Kind kind,
+ bool shouldStream) const override;
+
+ std::unique_ptr<SeekableInputStream> getStreamForBloomFilter(
+ uint64_t columnId, proto::Stream_Kind kind,
+ bool shouldStream) const override;
+
+ dbcommon::MemoryPool& getMemoryPool() const override;
+
+ const Timezone& getWriterTimezone() const override;
+};
+
+} // end of namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_READER_H_
diff --git a/depends/storage/src/storage/format/orc/rle-v0.h b/depends/storage/src/storage/format/orc/rle-v0.h
new file mode 100644
index 0000000..01906d3
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/rle-v0.h
@@ -0,0 +1,137 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V0_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V0_H_
+
+#include <algorithm>
+
+#include "storage/format/orc/rle.h"
+
+namespace orc {
+
+// Decoder for raw ("v0") streams: back-to-back fixed-width IntType values.
+template <class IntType>
+class RleDecoderV0 : public RleDecoder {
+ public:
+  explicit RleDecoderV0(std::unique_ptr<SeekableInputStream> input)
+      : inputStream(std::move(input)) {}
+  // Positioning is not implemented for this encoding.
+  void seek(PositionProvider &location) override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "seek not supported yet");
+  }
+
+  void skip(uint64_t numValues) override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "skip not supported yet");
+  }
+  // Fill data (an IntType array of numValues slots). With notNull given, only
+  // slots whose flag is non-zero consume a value; other slots stay untouched.
+  void next(void *data, uint64_t numValues, const char *notNull) override {
+    if (notNull) {
+      uint64_t notNullValues = 0;
+      for (uint64_t i = 0; i < numValues; ++i) {
+        if (notNull[i]) ++notNullValues;
+      }
+      IntType *dat = reinterpret_cast<IntType *>(data);
+      // Array form: the scratch buffer must hold notNullValues elements.
+      std::unique_ptr<IntType[]> dataNotNull(new IntType[notNullValues]);
+      readData(dataNotNull.get(), notNullValues);
+      for (uint64_t j = 0, k = 0; j < numValues; ++j) {
+        if (notNull[j]) dat[j] = dataNotNull[k++];
+      }
+    } else {
+      readData(data, numValues);
+    }
+  }
+
+ private:
+  // Fetch the next chunk from the input stream; logs an error if none is left.
+  void nextBuffer() {
+    int bufferLength = 0;
+    const void *bufferPointer = nullptr;
+    bool result = inputStream->Next(&bufferPointer, &bufferLength);
+    if (!result) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "bad read in nextBuffer");
+    }
+    bufferStart = static_cast<const char *>(bufferPointer);
+    bufferEnd = bufferStart + bufferLength;
+  }
+
+  // Copy numValues raw IntType values into data, refilling the buffer as needed.
+  void readData(void *data, uint64_t numValues) {
+    uint64_t i = 0;
+    uint64_t count = numValues * sizeof(IntType);
+    while (i < count) {
+      if (bufferStart == bufferEnd) {
+        nextBuffer();
+      }
+      uint64_t copyBytes =
+          std::min(count - i, static_cast<uint64_t>(bufferEnd - bufferStart));
+      // Append after the i bytes already copied, not at the start of data.
+      memcpy(static_cast<char *>(data) + i, bufferStart, copyBytes);
+      bufferStart += copyBytes;
+      i += copyBytes;
+    }
+  }
+
+  const std::unique_ptr<SeekableInputStream> inputStream;
+  const char *bufferStart = nullptr;
+  const char *bufferEnd = nullptr;
+};
+
+// Encoder for raw ("v0") streams: every value is handed to the output
+// stream as a plain IntType.
+template <class IntType>
+class RleCoderV0 : public RleCoder {
+ public:
+  explicit RleCoderV0(std::unique_ptr<SeekableOutputStream> stream)
+      : output(std::move(stream)) {}
+
+  // Writes the entries of data (an IntType array) in order. A slot is
+  // skipped when notNull is supplied and its flag is zero.
+  void write(void *data, uint64_t numValues, const char *notNull) override {
+    IntType *values = reinterpret_cast<IntType *>(data);
+    for (uint64_t idx = 0; idx < numValues; ++idx) {
+      const bool present = (notNull == nullptr) || notNull[idx];
+      if (present) {
+        output->write<IntType>(values[idx]);
+      }
+    }
+  }
+
+  // Everything below forwards directly to the wrapped output stream.
+  void flushToStream(OutputStream *stream) override {
+    output->flushToStream(stream);
+  }
+
+  uint64_t getStreamSize() override { return output->getStreamSize(); }
+
+  void reset() override { output->reset(); }
+
+  uint64_t getEstimatedSpaceNeeded() override {
+    return output->getEstimatedSpaceNeeded();
+  }
+
+ private:
+  std::unique_ptr<SeekableOutputStream> output;
+};
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V0_H_
diff --git a/depends/storage/src/storage/format/orc/rle-v1.h b/depends/storage/src/storage/format/orc/rle-v1.h
new file mode 100644
index 0000000..41de9a5
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/rle-v1.h
@@ -0,0 +1,371 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V1_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V1_H_
+
+#include <algorithm>
+#include <memory>
+#include <vector>
+
+#include "storage/format/orc/rle.h"
+
+namespace orc {
+
+const uint64_t MINIMUM_REPEAT = 3; // added to a run header byte to get the run length
+const uint64_t BASE_128_MASK = 0x7f; // low 7 payload bits of a varint byte
+
+template <class IntType, class UIntType> // UIntType is not used in this body
+class RleDecoderV1 : public RleDecoder {
+ public:
+ RleDecoderV1(std::unique_ptr<SeekableInputStream> input, bool hasSigned)
+ : inputStream(std::move(input)), isSigned(hasSigned) {}
+
+ // Seek to a particular spot.
+ void seek(PositionProvider& location) override {
+ // move the input stream
+ inputStream->seek(location);
+ // force a re-read from the stream
+ bufferEnd = bufferStart;
+ // read a new header
+ readHeader();
+ // skip ahead the given number of records
+ skip(location.next());
+ }
+
+ // Seek over a given number of values.
+ void skip(uint64_t numValues) override {
+ while (numValues > 0) {
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ uint64_t count = std::min(numValues, remainingValues);
+ remainingValues -= count;
+ numValues -= count;
+ if (repeating) {
+ value += delta * static_cast<int64_t>(count); // advance the run arithmetically
+ } else {
+ skipLongs(count); // literals must be walked byte by byte
+ }
+ }
+ }
+
+ // Read a number of values into the batch.
+ void next(void* data, uint64_t numValues, const char* notNull) override {
+ uint64_t position = 0;
+ IntType* dat = reinterpret_cast<IntType*>(data);
+
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
+ }
+ while (position < numValues) {
+ // If we are out of values, read more.
+ if (remainingValues == 0) {
+ readHeader();
+ }
+ // How many do we read out of this block?
+ uint64_t count = std::min(numValues - position, remainingValues);
+ uint64_t consumed = 0;
+ if (repeating) {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ dat[position + i] = static_cast<IntType>(
+ value + static_cast<int64_t>(consumed) * delta);
+ consumed += 1;
+ }
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ dat[position + i] =
+ static_cast<IntType>(value + static_cast<int64_t>(i) * delta);
+ }
+ consumed = count;
+ }
+ value += static_cast<int64_t>(consumed) * delta;
+ } else {
+ if (notNull) {
+ for (uint64_t i = 0; i < count; ++i) {
+ if (notNull[position + i]) {
+ dat[position + i] =
+ isSigned ? static_cast<IntType>(unZigZag(readLong()))
+ : static_cast<IntType>(readLong());
+ ++consumed;
+ }
+ }
+ } else {
+ if (isSigned) {
+ for (uint64_t i = 0; i < count; ++i) {
+ dat[position + i] = static_cast<IntType>(unZigZag(readLong()));
+ }
+ } else {
+ for (uint64_t i = 0; i < count; ++i) {
+ dat[position + i] = static_cast<IntType>(readLong());
+ }
+ }
+ consumed = count;
+ }
+ }
+ remainingValues -= consumed; // null slots take nothing from the run
+ position += count; // but every visited slot advances the output position
+
+ // skipNulls()
+ if (notNull) {
+ // Skip over null values.
+ while (position < numValues && !notNull[position]) {
+ ++position;
+ }
+ }
+ }
+ }
+
+ private:
+ signed char readByte() { // next stream byte; refills the buffer when it is empty
+ if (bufferStart == bufferEnd) {
+ int32_t bufferLength;
+ const void* bufferPointer;
+ if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "bad read in readByte");
+ }
+ bufferStart = static_cast<const char*>(bufferPointer);
+ bufferEnd = bufferStart + bufferLength;
+ }
+ return *(bufferStart++);
+ }
+
+ void readHeader() {
+ signed char ch = readByte();
+ if (ch < 0) { // negative header: -ch literal values follow
+ remainingValues = static_cast<uint64_t>(-ch);
+ repeating = false;
+ } else { // run: header + MINIMUM_REPEAT values, then delta byte and base value
+ remainingValues = static_cast<uint64_t>(ch) + MINIMUM_REPEAT;
+ repeating = true;
+ delta = readByte();
+ value =
+ isSigned ? unZigZag(readLong()) : static_cast<int64_t>(readLong());
+ }
+ }
+
+ uint64_t readLong() { // base-128 varint, least significant 7 bits first
+ uint64_t result = 0;
+ int64_t offset = 0;
+ signed char ch = readByte();
+ if (ch >= 0) {
+ result = static_cast<uint64_t>(ch);
+ } else {
+ result = static_cast<uint64_t>(ch) & BASE_128_MASK;
+ while ((ch = readByte()) < 0) {
+ offset += 7;
+ result |= (static_cast<uint64_t>(ch) & BASE_128_MASK) << offset;
+ }
+ result |= static_cast<uint64_t>(ch) << (offset + 7);
+ }
+ return result;
+ }
+
+ void skipLongs(uint64_t numValues) {
+ while (numValues > 0) {
+ if (readByte() >= 0) { // a non-negative byte ends one varint
+ --numValues;
+ }
+ }
+ }
+
+ const std::unique_ptr<SeekableInputStream> inputStream;
+ const bool isSigned;
+ uint64_t remainingValues = 0;
+ int64_t value = 0;
+ const char* bufferStart = nullptr;
+ const char* bufferEnd = nullptr;
+ int64_t delta = 0;
+ bool repeating = false;
+};
+
+// Integer Run Length Encoding version 1
+// Source link:
+// https://cwiki.apache.org/confluence/display/Hive/LanguageManual+ORC#LanguageManualORC-IntegerRunLengthEncodingversion1
+// //NOLINT
+//
+// In Hive 0.11 ORC files used Run Length Encoding version 1 (RLEv1),
+// which provides a lightweight compression of signed or unsigned integer
+// sequences. RLEv1 has two sub-encodings:
+// 1) Run - a sequence of values that differ by a small fixed delta
+// 2) Literals - a sequence of varint encoded values
+//
+// Runs start with an initial byte of 0x00 to 0x7f, which encodes the
+// length of the run - 3. A second byte provides the fixed delta in the
+// range of -128 to 127. Finally, the first value of the run is encoded
+// as a base 128 varint.
+// For example, if the sequence is 100 instances of 7 the encoding would
+// start with 100 - 3, followed by a delta of 0, and a varint of 7 for
+// an encoding of [0x61, 0x00, 0x07]. To encode the sequence of numbers
+// running from 100 to 1, the first byte is 100 - 3, the delta is -1,
+// and the varint is 100 for an encoding of [0x61, 0xff, 0x64].
+//
+// Literals start with an initial byte of 0x80 to 0xff, which corresponds
+// to the negative of number of literals in the sequence. Following the
+// header byte, the list of N varints is encoded. Thus, if there are
+// no runs, the overhead is 1 byte for each 128 integers. The first 5
+// prime numbers [2, 3, 5, 7, 11] would be encoded as [0xfb, 0x02, 0x03,
+// 0x05, 0x07, 0x0b].
+
+template <class IntType>
+class RleCoderV1 : public RleCoder {
+ public:
+ explicit RleCoderV1(std::unique_ptr<SeekableOutputStream> stream,
+ bool hasSigned)
+ : output(std::move(stream)),
+ isSigned(hasSigned),
+ literals(MAX_LITERAL_SIZE) {}
+
+ void write(void* data, uint64_t numValues, const char* notNull) override {
+ IntType* d = reinterpret_cast<IntType*>(data);
+ for (uint64_t i = 0; i < numValues; i++) {
+ if ((notNull == nullptr) || notNull[i]) {
+ // Null slots are skipped; present values go through write(int64_t).
+ write(static_cast<int64_t>(d[i]));
+ }
+ }
+ }
+
+ void flushToStream(OutputStream* stream) override {
+ writeValues();
+ output->flushToStream(stream);
+ }
+
+ uint64_t getStreamSize() override { return output->getStreamSize(); }
+ // Resets only the output stream; numLiterals/repeat state is left untouched.
+ void reset() override { output->reset(); }
+
+ uint64_t getEstimatedSpaceNeeded() override {
+ return output->getEstimatedSpaceNeeded() + numLiterals * sizeof(int64_t) +
+ sizeof(uint8_t) // delta
+ + sizeof(uint8_t); // control bytes
+ }
+
+ private:
+ // Buffer one value in literals, tracking whether the tail forms a run of
+ // constant delta; emits via writeValues() when a group is full or a run
+ // starts or ends. @param value the next value to encode
+ void write(int64_t value) {
+ if (numLiterals == 0) {
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ } else if (repeat) {
+ if (value == literals[0] + delta * numLiterals) {
+ numLiterals += 1;
+ if (numLiterals == MAX_REPEAT_SIZE) {
+ writeValues();
+ }
+ } else {
+ writeValues();
+ literals[numLiterals++] = value;
+ tailRunLength = 1;
+ }
+ } else {
+ if (tailRunLength == 1) {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ } else if (value == literals[numLiterals - 1] + delta) {
+ tailRunLength += 1;
+ } else {
+ delta = value - literals[numLiterals - 1];
+ if (delta < MIN_DELTA || delta > MAX_DELTA) {
+ tailRunLength = 1;
+ } else {
+ tailRunLength = 2;
+ }
+ }
+ if (tailRunLength == MIN_REPEAT_SIZE) {
+ if (numLiterals + 1 == MIN_REPEAT_SIZE) {
+ repeat = true;
+ numLiterals += 1;
+ } else {
+ numLiterals -= MIN_REPEAT_SIZE - 1; // take the run's first values back out
+ int64_t base = literals[numLiterals];
+ writeValues(); // flush the literals that precede the run
+ literals[0] = base;
+ repeat = true;
+ numLiterals = MIN_REPEAT_SIZE;
+ }
+ } else {
+ literals[numLiterals++] = value;
+ if (numLiterals == MAX_LITERAL_SIZE) {
+ writeValues();
+ }
+ }
+ }
+ }
+
+ void writeValues() {
+ if (numLiterals != 0) {
+ if (repeat) {
+ output->writeByte(numLiterals - MIN_REPEAT_SIZE); // run header byte
+ output->writeByte((int8_t)delta);
+
+ if (isSigned) {
+ writeInt64(output.get(), literals[0]);
+ } else {
+ writeUInt64(output.get(), literals[0]);
+ }
+ } else {
+ output->writeByte(-numLiterals); // literal header: negated count
+ for (uint32_t i = 0; i < numLiterals; ++i) {
+ if (isSigned) {
+ writeInt64(output.get(), literals[i]);
+ } else {
+ writeUInt64(output.get(), literals[i]);
+ }
+ }
+ }
+ repeat = false;
+ numLiterals = 0;
+ tailRunLength = 0;
+ }
+ }
+
+ private:
+ std::unique_ptr<SeekableOutputStream> output;
+ const bool isSigned = false;
+
+ const int32_t MIN_REPEAT_SIZE = 3;
+ const int32_t MAX_DELTA = 127;
+ const int32_t MIN_DELTA = -128;
+ const int32_t MAX_LITERAL_SIZE = 128;
+ const int32_t MAX_REPEAT_SIZE = 127 + MIN_REPEAT_SIZE;
+
+ std::vector<int64_t> literals; // sized MAX_LITERAL_SIZE by the constructor
+ int32_t numLiterals = 0; // values currently buffered (or length of the run)
+ int64_t delta = 0;
+ bool repeat = false; // true while the buffered values form a run
+ int32_t tailRunLength = 0;
+};
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V1_H_
diff --git a/depends/storage/src/storage/format/orc/rle-v2.h b/depends/storage/src/storage/format/orc/rle-v2.h
new file mode 100644
index 0000000..14b8883
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/rle-v2.h
@@ -0,0 +1,1768 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V2_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V2_H_
+
+#include <algorithm>
+#include <cmath>
+#include <memory>
+#include <utility>
+#include <vector>
+
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/rle.h"
+
+namespace orc {
+
+#define MIN_REPEAT 3  // shortest repeat run; stored run lengths are offset by it
+
+struct FixedBitSizes {  // ordinal codes for the supported packed bit widths
+ enum FBS {
+ ONE = 0,  // codes ONE..TWENTYFOUR: width == code + 1
+ TWO,
+ THREE,
+ FOUR,
+ FIVE,
+ SIX,
+ SEVEN,
+ EIGHT,
+ NINE,
+ TEN,
+ ELEVEN,
+ TWELVE,
+ THIRTEEN,
+ FOURTEEN,
+ FIFTEEN,
+ SIXTEEN,
+ SEVENTEEN,
+ EIGHTEEN,
+ NINETEEN,
+ TWENTY,
+ TWENTYONE,
+ TWENTYTWO,
+ TWENTYTHREE,
+ TWENTYFOUR,  // code 23, last of the contiguous widths
+ TWENTYSIX,  // code 24
+ TWENTYEIGHT,  // code 25
+ THIRTY,  // code 26
+ THIRTYTWO,  // code 27
+ FORTY,  // code 28
+ FORTYEIGHT,  // code 29
+ FIFTYSIX,  // code 30
+ SIXTYFOUR  // code 31
+ };
+};
+
+enum EncodingType {  // sub-encoding tag: top two bits of a run's first byte
+ SHORT_REPEAT = 0,  // decoded by nextShortRepeats()
+ DIRECT = 1,  // decoded by nextDirect()
+ PATCHED_BASE = 2,  // decoded by nextPatched()
+ DELTA = 3,  // decoded by nextDelta()
+ UNKNOWN = 4  // cannot come out of a two-bit field
+};
+
+// Map the ordinal bit-width code stored in a run header to the width in bits.
+// @param n - encoded fixed bit width (a FixedBitSizes ordinal)
+// @return decoded fixed bit width; codes above FIFTYSIX decode to 64
+inline uint32_t decodeBitWidth(uint32_t n) {
+  if (n <= FixedBitSizes::TWENTYFOUR) return n + 1;  // contiguous codes
+  switch (n) {
+    case FixedBitSizes::TWENTYSIX:
+      return 26;
+    case FixedBitSizes::TWENTYEIGHT:
+      return 28;
+    case FixedBitSizes::THIRTY:
+      return 30;
+    case FixedBitSizes::THIRTYTWO:
+      return 32;
+    case FixedBitSizes::FORTY:
+      return 40;
+    case FixedBitSizes::FORTYEIGHT:
+      return 48;
+    case FixedBitSizes::FIFTYSIX:
+      return 56;
+    default:
+      return 64;
+  }
+}
+
+template <class IntType, class UIntType>
+class RleDecoderV2 : public RleDecoder {
+ public:
+ RleDecoderV2(std::unique_ptr<SeekableInputStream> input, bool isSigned,
+ dbcommon::MemoryPool &pool) // NOLINT
+ : inputStream(std::move(input)),  // takes ownership of the stream
+ isSigned(isSigned),
+ unpacked(pool, 0),  // PATCHED_BASE buffers, presumably backed by pool;
+ unpackedPatch(pool, 0) {  // both are resized per run in nextPatched()
+ // All other decoding state starts from its in-class default.
+ }
+
+ // Reposition the decoder: seek the underlying stream, discard any buffered
+ // bytes and the current run, then skip the record count read from location.
+ void seek(PositionProvider &location) override {
+   inputStream->seek(location);
+   bufferStart = nullptr;
+   bufferEnd = nullptr;
+   runLength = 0;
+   runRead = 0;
+   skip(location.next());
+ }
+
+ // Discard the next numValues values by decoding them into a scratch buffer.
+ void skip(uint64_t numValues) override {
+   // Decode-and-drop in fixed-size chunks; nothing encoding-specific yet.
+   constexpr uint64_t kChunk = 64;
+   int64_t scratch[kChunk];
+
+   for (uint64_t left = numValues; left > 0;) {
+     const uint64_t batch = left < kChunk ? left : kChunk;
+     // nullptr mask: every position in the chunk is decoded as a value.
+     next(scratch, batch, nullptr);
+     left -= batch;
+   }
+ }
+
+ // Decode the next numValues positions into data (an IntType array).
+ // Positions whose notNull entry is zero are stepped over without decoding.
+ void next(void *data, uint64_t numValues, const char *notNull) override {
+   IntType *out = reinterpret_cast<IntType *>(data);
+   uint64_t done = 0;
+   while (done < numValues) {
+     // Step over nulls first so no run header is read for a null tail.
+     if (notNull) {
+       while (!notNull[done]) {
+         ++done;
+         if (done == numValues) return;  // ended with null values
+       }
+     }
+     // The previous run is used up: the next byte starts a new run.
+     if (runRead == runLength) {
+       resetRun();
+       firstByte = readByte();
+     }
+
+     const uint64_t offset = done;
+     const uint64_t length = numValues - done;
+
+     // The top two bits of the header byte select the sub-encoding.
+     switch ((firstByte >> 6) & 0x03) {
+       case SHORT_REPEAT:
+         done += nextShortRepeats(out, offset, length, notNull);
+         break;
+       case DIRECT:
+         done += nextDirect(out, offset, length, notNull);
+         break;
+       case PATCHED_BASE:
+         done += nextPatched(out, offset, length, notNull);
+         break;
+       case DELTA:
+         done += nextDelta(out, offset, length, notNull);
+         break;
+       default:
+         LOG_ERROR(ERRCODE_INTERNAL_ERROR, "unknown encoding");
+     }
+   }
+ }
+
+ private:
+ // Used by PATCHED_BASE: load the gap and patch value of entry patchIdx.
+ void adjustGapAndPatch() {
+ curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> patchBitSize;
+ curPatch = unpackedPatch[patchIdx] & patchMask;
+ actualGap = 0;
+ // Each entry holds the gap above the low patchBitSize bits of the patch.
+ // special case: gap is >255 then patch value will be 0.
+ // if gap is <=255 then patch value cannot be 0
+ while (curGap == 255 && curPatch == 0) {
+ actualGap += 255;
+ ++patchIdx;  // NOTE(review): not checked against unpackedPatch.size()
+ curGap = static_cast<uint64_t>(unpackedPatch[patchIdx]) >> patchBitSize;
+ curPatch = unpackedPatch[patchIdx] & patchMask;
+ }
+ // add the left over gap
+ actualGap += curGap;
+ }
+
+ void resetReadLongs() {  // drop the partially consumed byte kept by readLongs
+ bitsLeft = 0;
+ curByte = 0;
+ }
+
+ void resetRun() {  // called by next() right before a new run header is read
+ resetReadLongs();
+ bitSize = 0;
+ }
+
+ // Return the next input byte, refilling [bufferStart, bufferEnd) from
+ // inputStream once the current window is exhausted.
+ unsigned char readByte() {
+   assert(bufferStart <= bufferEnd);
+   if (bufferStart == bufferEnd) {
+     int32_t bufferLength = 0;
+     const void *bufferPointer = nullptr;
+     if (!inputStream->Next(&bufferPointer, &bufferLength)) {
+       // Out of input: report it rather than dereference an unset pointer.
+       LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                 "bad read in RleDecoderV2::readByte");
+     }
+     bufferStart = static_cast<const char *>(bufferPointer);
+     bufferEnd = bufferStart + bufferLength;
+   }
+   return static_cast<unsigned char>(*bufferStart++);
+ }
+
+ // Assemble bsz bytes from the stream into one integer, most significant
+ // byte first. Bytes are pulled through readByte(), so the read may cross
+ // an input buffer boundary.
+ int64_t readLongBE(uint64_t bsz) {
+   int64_t value = 0;
+   for (uint64_t remaining = bsz; remaining != 0; --remaining) {
+     const unsigned char byte = readByte();
+     value = (value << 8) | byte;
+   }
+   return value;
+ }
+
+ // Decode N big-endian bytes starting at bufferStart. Only the parameter
+ // is read here; advancing the decoder's cursor is left to the caller.
+ template <std::size_t N>
+ int64_t readLongBEQuickInternal(const char *bufferStart) {
+   // todo(xxx) more optimization on a big endian machine
+   int64_t value = 0;
+#pragma clang loop unroll(full)
+   for (std::size_t i = 0; i < N; ++i) {
+     value = (value << 8) | static_cast<unsigned char>(bufferStart[i]);
+   }
+   return value;
+ }
+ inline int64_t readLongBEQuick(uint64_t bsz) {  // bsz: byte count, 1..8
+   // Fast path: all bsz bytes must already be inside the current buffer.
+   assert(bufferStart + bsz - 1 < bufferEnd);
+   int64_t ret = 0;  // stays defined even if bsz falls outside 1..8
+   switch (bsz) {
+     case 8:
+       ret = readLongBEQuickInternal<8>(bufferStart);
+       break;
+     case 7:
+       ret = readLongBEQuickInternal<7>(bufferStart);
+       break;
+     case 6:
+       ret = readLongBEQuickInternal<6>(bufferStart);
+       break;
+     case 5:
+       ret = readLongBEQuickInternal<5>(bufferStart);
+       break;
+     case 4:
+       ret = readLongBEQuickInternal<4>(bufferStart);
+       break;
+     case 3:
+       ret = readLongBEQuickInternal<3>(bufferStart);
+       break;
+     case 2:
+       ret = readLongBEQuickInternal<2>(bufferStart);
+       break;
+     case 1:
+       ret = readLongBEQuickInternal<1>(bufferStart);
+       break;
+   }
+   bufferStart += bsz;
+   return ret;
+ }
+
+ int64_t readVslong() { return unZigZag(readVulong()); }  // varint -> unZigZag
+
+ // Read a base-128 varint: each byte contributes its low 7 bits, least
+ // significant group first; a clear high bit marks the final byte.
+ uint64_t readVulong() {
+   uint64_t value = 0;
+   for (uint64_t shift = 0;; shift += 7) {
+     const uint64_t byte = readByte();
+     value |= (byte & 0x7f) << shift;
+     if (byte < 0x80) return value;
+   }
+ }
+
+ template <class ReadIntType>
+ uint64_t readLongs(ReadIntType *data, uint64_t offset, uint64_t len,
+ uint64_t fb, const char *notNull = nullptr) {  // fb: bits per value
+ switch (fb) {  // forward the runtime width as a compile-time template argument
+ case 1:
+ return readLongs<ReadIntType, 1>(data, offset, len, notNull);
+ case 2:
+ return readLongs<ReadIntType, 2>(data, offset, len, notNull);
+ case 3:
+ return readLongs<ReadIntType, 3>(data, offset, len, notNull);
+ case 4:
+ return readLongs<ReadIntType, 4>(data, offset, len, notNull);
+ case 5:
+ return readLongs<ReadIntType, 5>(data, offset, len, notNull);
+ case 6:
+ return readLongs<ReadIntType, 6>(data, offset, len, notNull);
+ case 7:
+ return readLongs<ReadIntType, 7>(data, offset, len, notNull);
+ case 8:
+ return readLongs<ReadIntType, 8>(data, offset, len, notNull);
+ case 9:
+ return readLongs<ReadIntType, 9>(data, offset, len, notNull);
+ case 10:
+ return readLongs<ReadIntType, 10>(data, offset, len, notNull);
+ case 11:
+ return readLongs<ReadIntType, 11>(data, offset, len, notNull);
+ case 12:
+ return readLongs<ReadIntType, 12>(data, offset, len, notNull);
+ case 13:
+ return readLongs<ReadIntType, 13>(data, offset, len, notNull);
+ case 14:
+ return readLongs<ReadIntType, 14>(data, offset, len, notNull);
+ case 15:
+ return readLongs<ReadIntType, 15>(data, offset, len, notNull);
+ case 16:
+ return readLongs<ReadIntType, 16>(data, offset, len, notNull);
+ case 17:
+ return readLongs<ReadIntType, 17>(data, offset, len, notNull);
+ case 18:
+ return readLongs<ReadIntType, 18>(data, offset, len, notNull);
+ case 19:
+ return readLongs<ReadIntType, 19>(data, offset, len, notNull);
+ case 20:
+ return readLongs<ReadIntType, 20>(data, offset, len, notNull);
+ case 21:
+ return readLongs<ReadIntType, 21>(data, offset, len, notNull);
+ case 22:
+ return readLongs<ReadIntType, 22>(data, offset, len, notNull);
+ case 23:
+ return readLongs<ReadIntType, 23>(data, offset, len, notNull);
+ case 24:
+ return readLongs<ReadIntType, 24>(data, offset, len, notNull);
+ case 25:
+ return readLongs<ReadIntType, 25>(data, offset, len, notNull);
+ case 26:
+ return readLongs<ReadIntType, 26>(data, offset, len, notNull);
+ case 27:
+ return readLongs<ReadIntType, 27>(data, offset, len, notNull);
+ case 28:
+ return readLongs<ReadIntType, 28>(data, offset, len, notNull);
+ case 29:
+ return readLongs<ReadIntType, 29>(data, offset, len, notNull);
+ case 30:
+ return readLongs<ReadIntType, 30>(data, offset, len, notNull);
+ case 31:
+ return readLongs<ReadIntType, 31>(data, offset, len, notNull);
+ case 32:
+ return readLongs<ReadIntType, 32>(data, offset, len, notNull);
+ case 33:
+ return readLongs<ReadIntType, 33>(data, offset, len, notNull);
+ case 34:
+ return readLongs<ReadIntType, 34>(data, offset, len, notNull);
+ case 35:
+ return readLongs<ReadIntType, 35>(data, offset, len, notNull);
+ case 36:
+ return readLongs<ReadIntType, 36>(data, offset, len, notNull);
+ case 37:
+ return readLongs<ReadIntType, 37>(data, offset, len, notNull);
+ case 38:
+ return readLongs<ReadIntType, 38>(data, offset, len, notNull);
+ case 39:
+ return readLongs<ReadIntType, 39>(data, offset, len, notNull);
+ case 40:
+ return readLongs<ReadIntType, 40>(data, offset, len, notNull);
+ case 41:
+ return readLongs<ReadIntType, 41>(data, offset, len, notNull);
+ case 42:
+ return readLongs<ReadIntType, 42>(data, offset, len, notNull);
+ case 43:
+ return readLongs<ReadIntType, 43>(data, offset, len, notNull);
+ case 44:
+ return readLongs<ReadIntType, 44>(data, offset, len, notNull);
+ case 45:
+ return readLongs<ReadIntType, 45>(data, offset, len, notNull);
+ case 46:
+ return readLongs<ReadIntType, 46>(data, offset, len, notNull);
+ case 47:
+ return readLongs<ReadIntType, 47>(data, offset, len, notNull);
+ case 48:
+ return readLongs<ReadIntType, 48>(data, offset, len, notNull);
+ case 49:
+ return readLongs<ReadIntType, 49>(data, offset, len, notNull);
+ case 50:
+ return readLongs<ReadIntType, 50>(data, offset, len, notNull);
+ case 51:
+ return readLongs<ReadIntType, 51>(data, offset, len, notNull);
+ case 52:
+ return readLongs<ReadIntType, 52>(data, offset, len, notNull);
+ case 53:
+ return readLongs<ReadIntType, 53>(data, offset, len, notNull);
+ case 54:
+ return readLongs<ReadIntType, 54>(data, offset, len, notNull);
+ case 55:
+ return readLongs<ReadIntType, 55>(data, offset, len, notNull);
+ case 56:
+ return readLongs<ReadIntType, 56>(data, offset, len, notNull);
+ case 57:
+ return readLongs<ReadIntType, 57>(data, offset, len, notNull);
+ case 58:
+ return readLongs<ReadIntType, 58>(data, offset, len, notNull);
+ case 59:
+ return readLongs<ReadIntType, 59>(data, offset, len, notNull);
+ case 60:
+ return readLongs<ReadIntType, 60>(data, offset, len, notNull);
+ case 61:
+ return readLongs<ReadIntType, 61>(data, offset, len, notNull);
+ case 62:
+ return readLongs<ReadIntType, 62>(data, offset, len, notNull);
+ case 63:
+ return readLongs<ReadIntType, 63>(data, offset, len, notNull);
+ case 64:
+ return readLongs<ReadIntType, 64>(data, offset, len, notNull);
+ }
+ return 0;  // fb == 0 or fb > 64: nothing is decoded
+ }
+ template <class ReadIntType, uint64_t FixedBitLength,
+ uint64_t Mask = (FixedBitLength == 64)
+ ? -1
+ : ((uint64_t)1 << FixedBitLength) - 1>
+ uint64_t readLongs(ReadIntType *data, uint64_t offset, uint64_t len,
+ const char *notNull = nullptr) {
+ auto curByte = this->curByte;  // work on locals; written back before return
+ auto bitsLeft = this->bitsLeft;  // unread low bits still held in curByte
+ // TODO(xxx): unroll to improve performance
+ if (notNull) {
+ uint64_t ret = 0;  // counts the non-null values actually decoded
+ for (uint64_t i = offset; i < (offset + len); i++) {
+ // skip null positions
+ if (!notNull[i]) {
+ continue;
+ }
+ uint64_t result = 0;
+ uint64_t bitsLeftToRead = FixedBitLength;
+ while (bitsLeftToRead > bitsLeft) {  // value continues past curByte
+ result <<= bitsLeft;
+ result |= curByte & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ curByte = readByte();
+ bitsLeft = 8;
+ }
+
+ // handle the left over bits
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
+ result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ data[i] = static_cast<ReadIntType>(result);
+ ++ret;
+ }
+ this->curByte = curByte;
+ this->bitsLeft = bitsLeft;
+ return ret;
+ } else {
+ auto bufferStart = this->bufferStart;  // local cursor for the fast path
+ auto bufferEnd = this->bufferEnd;
+ for (uint64_t i = offset; i < (offset + len); i++) {
+ // 1. Fast decode
+ { // performance: reduce the cost on the checking the buffer and
+ // reassigning the bitsLeft
+ // curLong holds the unread bits in its low bitsLeft bits.
+ uint64_t curLong = curByte;
+ while (bufferStart + 8 < bufferEnd && i < (offset + len)) {
+ uint64_t result;
+ if (bitsLeft >= FixedBitLength) {
+ result = (curLong >> (bitsLeft - FixedBitLength));
+ result &= Mask;
+ } else {
+ uint64_t lastLong = curLong;
+ curLong = (*reinterpret_cast<const uint64_t *>(bufferStart));  // NOTE(review): unaligned 8-byte load
+ curLong = __builtin_bswap64(curLong);  // NOTE(review): assumes a little-endian host
+ bufferStart += 8;
+ if (FixedBitLength != 64) {
+ result = (lastLong << (FixedBitLength - bitsLeft));
+ result |= (curLong >> (64 - (FixedBitLength - bitsLeft)));
+ } else {
+ result = curLong;
+ }
+ result &= Mask;
+ bitsLeft += 64;
+ }
+ bitsLeft -= FixedBitLength;
+ // Store the value and advance i inside the fast loop.
+ data[i] = static_cast<ReadIntType>(result);
+ i++;
+ }
+ uint8_t trimBits = 0;
+ while (bitsLeft >= 8) {  // hand back whole bytes loaded but not consumed
+ bufferStart -= 1;
+ bitsLeft -= 8;
+ trimBits += 8;
+ }
+ curByte = static_cast<uint8_t>(curLong >> trimBits);
+ assert(bufferStart <= bufferEnd);
+ }
+ if (i >= (offset + len)) break;
+
+ // 2. Normal decode
+ // (byte-at-a-time, same procedure as the nullable branch above)
+ // update buffer info
+ this->bufferStart = bufferStart;
+ this->bufferEnd = bufferEnd;
+ // readByte() below works on the member cursor synced just above.
+ uint64_t result = 0;
+ uint64_t bitsLeftToRead = FixedBitLength;
+ while (bitsLeftToRead > bitsLeft) {
+ result <<= bitsLeft;
+ result |= curByte & ((1 << bitsLeft) - 1);
+ bitsLeftToRead -= bitsLeft;
+ curByte = readByte();
+ bitsLeft = 8;
+ }
+
+ // handle the left over bits
+ if (bitsLeftToRead > 0) {
+ result <<= bitsLeftToRead;
+ bitsLeft -= static_cast<uint32_t>(bitsLeftToRead);
+ result |= (curByte >> bitsLeft) & ((1 << bitsLeftToRead) - 1);
+ }
+ data[i] = static_cast<ReadIntType>(result);
+
+ // update buffer info
+ bufferStart = this->bufferStart;
+ bufferEnd = this->bufferEnd;
+ }
+ assert(bufferStart <= bufferEnd);
+ this->bufferStart = bufferStart;
+ this->bufferEnd = bufferEnd;
+ }
+ this->curByte = curByte;
+ this->bitsLeft = bitsLeft;
+ // Without a null mask every one of the len positions was decoded.
+ return len;
+ }
+
+ // Round a bit count up to the nearest width that has a FixedBitSizes
+ // code.
+ //
+ // Those widths are every value in 1..24 followed by 26, 28, 30, 32,
+ // 40, 48, 56 and 64.
+ // @param n - number of bits actually required
+ // @return smallest supported width >= n (1 when n is 0, 64 when n > 56)
+ inline uint32_t getClosestFixedBits(uint32_t n) {
+   // Widths up to 24 are all representable; zero is bumped to one bit.
+   if (n <= 24) {
+     return n == 0 ? 1 : n;
+   }
+
+   // Beyond 24 only a sparse ladder of widths exists: take the first rung
+   // that is wide enough.
+   static const uint32_t kLadder[] = {26, 28, 30, 32, 40, 48, 56};
+   for (uint32_t rung : kLadder) {
+     if (n <= rung) {
+       return rung;
+     }
+   }
+
+   // Anything wider than 56 bits needs the full 64.
+   return 64;
+ }
+
+ // Decode (part of) a SHORT_REPEAT run: one value stored in 1..8 big-endian
+ // bytes and repeated MIN_REPEAT..MIN_REPEAT + 7 times.
+ // Returns how many positions of data were covered, nulls included.
+ uint64_t nextShortRepeats(IntType *data, uint64_t offset, uint64_t numValues,
+                           const char *notNull) {
+   if (runRead == runLength) {
+     // Header byte: bits 3..5 hold (byte width - 1), bits 0..2 hold
+     // (run length - MIN_REPEAT).
+     byteSize = ((firstByte >> 3) & 0x07) + 1;
+     runLength = (firstByte & 0x07) + MIN_REPEAT;
+     runRead = 0;
+     // Fast path only when the whole value is already buffered. Comparing
+     // the remaining byte count avoids arithmetic on a null bufferStart.
+     if (static_cast<uint64_t>(bufferEnd - bufferStart) >= byteSize) {
+       firstValue = readLongBEQuick(byteSize);
+     } else {
+       firstValue = readLongBE(byteSize);
+     }
+     if (isSigned) {
+       firstValue = unZigZag(static_cast<uint64_t>(firstValue));
+     }
+   }
+
+   const uint64_t nRead = std::min(runLength - runRead, numValues);
+   const uint64_t end = offset + nRead;
+   const int64_t value = this->firstValue;
+   if (notNull) {
+     // Null positions are skipped and do not use up the run. Count in a
+     // local to keep this->runRead out of the loop.
+     uint64_t written = 0;
+     for (uint64_t pos = offset; pos < end; ++pos) {
+       if (notNull[pos]) {
+         data[pos] = value;
+         ++written;
+       }
+     }
+     runRead += written;
+   } else {
+     // The loop hint needs the "loop" keyword and must precede the loop.
+#pragma clang loop vectorize(enable)
+     for (uint64_t pos = offset; pos < end; ++pos) {
+       data[pos] = value;
+     }
+     runRead += nRead;
+   }
+   return nRead;
+ }
+
+ // Decode (part of) a DIRECT run: runLength values bit-packed at one fixed
+ // width. The header is parsed on first entry; later calls resume the run.
+ // Returns how many positions of data were covered, nulls included.
+ uint64_t nextDirect(IntType *data, uint64_t offset, uint64_t numValues,
+                     const char *notNull) {
+   if (runRead == runLength) {
+     // Bits 1..5 of the first header byte carry the encoded bit width.
+     const unsigned char widthCode = (firstByte >> 1) & 0x1f;
+     bitSize = decodeBitWidth(widthCode);
+
+     // (run length - 1) is nine bits wide: bit 0 of the first byte is the
+     // high bit, the second header byte supplies the low eight.
+     const uint64_t highBit = static_cast<uint64_t>(firstByte & 0x01) << 8;
+     runLength = (highBit | readByte()) + 1;
+     runRead = 0;
+   }
+
+   const uint64_t nRead = std::min(runLength - runRead, numValues);
+   const uint64_t end = offset + nRead;
+
+   runRead += readLongs<IntType>(data, offset, nRead, bitSize, notNull);
+
+   if (!isSigned) {
+     return nRead;
+   }
+
+   // Signed values are stored zigzag-encoded. Cast through UIntType, not
+   // straight to uint64_t: a negative IntType would otherwise turn into a
+   // very large number before decoding. This is why the class takes the
+   // UIntType template parameter.
+   for (uint64_t pos = offset; pos < end; ++pos) {
+     if (notNull && !notNull[pos]) {
+       continue;  // null slot: nothing was decoded into it
+     }
+     data[pos] = unZigZag(static_cast<UIntType>(data[pos]));
+   }
+
+   return nRead;
+ }
+
+ uint64_t nextPatched(IntType *data, uint64_t offset, uint64_t numValues,
+ const char *notNull) {  // decodes (part of) a PATCHED_BASE run
+ if (runRead == runLength) {  // new run: parse header, unpack values and patches
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ bitSize = decodeBitWidth(fbo);
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ // runs are one off
+ runLength += 1;
+ runRead = 0;
+
+ // extract the number of bytes occupied by base
+ uint64_t thirdByte = readByte();
+ byteSize = (thirdByte >> 5) & 0x07;
+ // base width is one off
+ byteSize += 1;
+
+ // extract patch width
+ uint32_t pwo = thirdByte & 0x1f;
+ patchBitSize = decodeBitWidth(pwo);
+
+ // read fourth byte and extract patch gap width
+ uint64_t fourthByte = readByte();
+ uint32_t pgw = (fourthByte >> 5) & 0x07;
+ // patch gap width is one off
+ pgw += 1;
+
+ // extract the length of the patch list
+ size_t pl = fourthByte & 0x1f;
+ if (pl == 0) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "Corrupt PATCHED_BASE encoded data (pl==0)!");
+ }
+
+ // read the next base width number of bytes to extract base value
+ base = readLongBE(byteSize);
+ int64_t mask = (static_cast<int64_t>(1) << ((byteSize * 8) - 1));
+ // if mask of base value is 1 then base is negative value else positive
+ if ((base & mask) != 0) {  // sign-magnitude: clear the sign bit, then negate
+ base = base & ~mask;
+ base = -base;
+ }
+
+ // TODO(xxx): something more efficient than resize
+ unpacked.resize(runLength);
+ unpackedIdx = 0;
+ readLongs<int64_t>(unpacked.data(), 0, runLength, bitSize);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // TODO(xxx): something more efficient than resize
+ unpackedPatch.resize(pl);
+ patchIdx = 0;
+ // TODO(xxx): Skip corrupt?
+ // if ((patchBitSize + pgw) > 64 && !skipCorrupt) {
+ if ((patchBitSize + pgw) > 64) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "Corrupt PATCHED_BASE encoded data "
+ "(patchBitSize + pgw > 64)!");
+ }
+ uint32_t cfb = getClosestFixedBits(patchBitSize + pgw);  // entry = gap bits + patch bits
+ readLongs<int64_t>(unpackedPatch.data(), 0, pl, cfb);
+ // any remaining bits are thrown out
+ resetReadLongs();
+
+ // apply the patch directly when decoding the packed data
+ patchMask = ((static_cast<int64_t>(1) << patchBitSize) - 1);
+
+ adjustGapAndPatch();
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ for (uint64_t pos = offset; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ if (static_cast<int64_t>(unpackedIdx) != actualGap) {
+ // no patching required. add base to unpacked value to get final value
+ data[pos] = base + unpacked[unpackedIdx];
+ } else {
+ // extract the patch value
+ int64_t patchedVal = unpacked[unpackedIdx] | (curPatch << bitSize);  // patch supplies the bits above bitSize
+
+ // add base to patched value
+ data[pos] = base + patchedVal;
+
+ // increment the patch to point to next entry in patch list
+ ++patchIdx;
+
+ if (patchIdx < unpackedPatch.size()) {
+ adjustGapAndPatch();
+
+ // next gap is relative to the current gap
+ actualGap += unpackedIdx;
+ }
+ }
+
+ ++runRead;
+ ++unpackedIdx;
+ }
+
+ return nRead;  // positions covered, null positions included
+ }
+
+ uint64_t nextDelta(IntType *data, uint64_t offset, uint64_t numValues,
+ const char *notNull) {  // decodes (part of) a DELTA run
+ auto runRead = this->runRead;  // local copies of the run state;
+ auto prevValue = this->prevValue;  // all three are written back
+ auto deltaBase = this->deltaBase;  // just before returning
+ if (runRead == runLength) {
+ // extract the number of fixed bits
+ unsigned char fbo = (firstByte >> 1) & 0x1f;
+ if (fbo != 0) {
+ bitSize = decodeBitWidth(fbo);
+ } else {
+ bitSize = 0;  // width code 0: every step equals deltaBase
+ }
+
+ // extract the run length
+ runLength = static_cast<uint64_t>(firstByte & 0x01) << 8;
+ runLength |= readByte();
+ ++runLength; // account for first value
+ runRead = deltaBase = 0;
+
+ // read the first value stored as vint
+ if (isSigned) {
+ firstValue = static_cast<int64_t>(readVslong());
+ } else {
+ firstValue = static_cast<int64_t>(readVulong());
+ }
+
+ prevValue = firstValue;
+
+ // read the fixed delta value stored as vint (deltas can be negative even
+ // if all number are positive)
+ deltaBase = static_cast<int64_t>(readVslong());
+ }
+
+ uint64_t nRead = std::min(runLength - runRead, numValues);
+
+ uint64_t pos = offset;
+ for (; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull || notNull[pos]) break;
+ }
+ if (runRead == 0 && pos < offset + nRead) {  // first value comes from the header
+ data[pos++] = firstValue;
+ ++runRead;
+ }
+
+ if (bitSize == 0) {
+ // add fixed deltas to adjacent values
+ if (notNull) {
+ for (; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue + deltaBase;
+ ++runRead;
+ }
+ } else {
+ for (; pos < offset + nRead; ++pos) {
+ prevValue = data[pos] = prevValue + deltaBase;
+ ++runRead;
+ }
+ }
+ } else {
+ for (; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (!notNull || notNull[pos]) break;
+ }
+ if (runRead < 2 && pos < offset + nRead) {  // second value of the run
+ // add delta base and first value
+ prevValue = data[pos++] = firstValue + deltaBase;
+ ++runRead;
+ }
+
+ // write the unpacked values, add it to previous value and store final
+ // value to result buffer. if the delta base value is negative then it
+ // is a decreasing sequence else an increasing sequence
+ uint64_t remaining = (offset + nRead) - pos;
+ runRead += readLongs<IntType>(data, pos, remaining, bitSize, notNull);
+
+ if (deltaBase < 0) {  // packed deltas are applied by subtraction here
+ for (; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue - data[pos];
+ }
+ } else {
+ for (; pos < offset + nRead; ++pos) {
+ // skip null positions
+ if (notNull && !notNull[pos]) {
+ continue;
+ }
+ prevValue = data[pos] = prevValue + data[pos];
+ }
+ }
+ }
+ this->runRead = runRead;
+ this->prevValue = prevValue;
+ this->deltaBase = deltaBase;
+ return nRead;  // positions covered, null positions included
+ }
+
+ const std::unique_ptr<SeekableInputStream> inputStream;  // encoded input
+ const bool isSigned = false;  // true: values are passed through unZigZag
+
+ unsigned char firstByte = 0;  // first header byte of the current run
+ uint64_t runLength = 0;  // number of values in the current run
+ uint64_t runRead = 0;  // values of the current run already returned
+ const char *bufferStart = nullptr;  // next unread byte of the input window
+ const char *bufferEnd = nullptr;  // one past the end of the input window
+ int64_t deltaBase = 0; // Used by DELTA
+ uint64_t byteSize = 0; // Used by SHORT_REPEAT and PATCHED_BASE
+ int64_t firstValue = 0; // Used by SHORT_REPEAT and DELTA
+ int64_t prevValue = 0; // Used by DELTA
+ uint32_t bitSize = 0; // Used by DIRECT, PATCHED_BASE and DELTA
+ uint8_t bitsLeft = 0; // Used by anything that uses readLongs
+ uint8_t curByte = 0; // Used by anything that uses readLongs
+ uint32_t patchBitSize = 0; // Used by PATCHED_BASE
+ uint64_t unpackedIdx = 0; // Used by PATCHED_BASE
+ uint64_t patchIdx = 0; // Used by PATCHED_BASE
+ int64_t base = 0; // Used by PATCHED_BASE
+ uint64_t curGap = 0; // Used by PATCHED_BASE
+ int64_t curPatch = 0; // Used by PATCHED_BASE
+ int64_t patchMask = 0; // Used by PATCHED_BASE
+ int64_t actualGap = 0; // Used by PATCHED_BASE
+ DataBuffer<int64_t> unpacked; // Used by PATCHED_BASE
+ DataBuffer<int64_t> unpackedPatch; // Used by PATCHED_BASE
+};
+
+template <class IntType>
+class RleCoderV2 : public RleCoder {
+ public:
+ explicit RleCoderV2(std::unique_ptr<SeekableOutputStream> stream,
+ bool isSigned)
+ : output(std::move(stream)),  // takes ownership of the stream
+ isSigned(isSigned),
+ literals(MAX_SCOPE),  // all four scratch buffers are sized to MAX_SCOPE
+ zigzagLiterals(MAX_SCOPE),
+ baseRedLiterals(MAX_SCOPE),
+ adjDeltas(MAX_SCOPE) {  // alignedBitpacking is not set by this overload
+ clear();
+ }
+
+ RleCoderV2(std::unique_ptr<SeekableOutputStream> stream, bool isSigned,
+ bool alignedBitpacking)
+ : output(std::move(stream)),  // takes ownership of the stream
+ isSigned(isSigned),
+ literals(MAX_SCOPE),  // all four scratch buffers are sized to MAX_SCOPE
+ zigzagLiterals(MAX_SCOPE),
+ baseRedLiterals(MAX_SCOPE),
+ adjDeltas(MAX_SCOPE),
+ alignedBitpacking(alignedBitpacking) {  // only difference to the overload above
+ clear();
+ }
+
+ // Encode numValues elements of data (an IntType array); positions whose
+ // notNull entry is zero are skipped. A null notNull means all are present.
+ void write(void *data, uint64_t numValues, const char *notNull) override {
+   const IntType *values = reinterpret_cast<IntType *>(data);
+   for (uint64_t i = 0; i < numValues; ++i) {
+     if (!notNull || notNull[i]) write(static_cast<int64_t>(values[i]));
+   }
+ }
+
+ uint64_t getStreamSize() override { return output->getStreamSize(); }  // delegates to output
+
+ uint64_t getEstimatedSpaceNeeded() override {  // stream estimate + pending run
+ return output->getEstimatedSpaceNeeded() + numLiterals * sizeof(int64_t) +
+ sizeof(uint8_t) * 10; // maximal value: header + base + delta
+ }
+
+ // Encode any values still buffered, then flush the output stream into os.
+ void flushToStream(OutputStream *os) override {
+   if (numLiterals != 0) {
+     if (variableRunLength == 0 && fixedRunLength != 0 &&
+         fixedRunLength < MIN_REPEAT) {
+       variableRunLength = fixedRunLength;  // too short to repeat-encode
+       fixedRunLength = 0;
+     }
+     if (variableRunLength != 0) {
+       determineEncoding();
+       writeValues();
+     } else if (fixedRunLength != 0) {
+       // fixedRunLength >= MIN_REPEAT here: SHORT_REPEAT or fixed DELTA.
+       if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+         encoding = EncodingType::SHORT_REPEAT;
+       } else {
+         encoding = EncodingType::DELTA;
+         isFixedDelta = true;
+       }
+       writeValues();
+     }
+   }
+   output->flushToStream(os);
+ }
+
+ void reset() override { output->reset(); }  // only the output stream is reset
+
+ private:
+ void write(int64_t val) {  // buffer one value, flushing a run when one completes
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ } else {
+ if (numLiterals == 1) {
+ prevDelta = val - literals[0];
+ literals[numLiterals++] = val;
+ // if both values are same count as fixed run else variable run
+ if (val == literals[0]) {
+ fixedRunLength = 2;
+ variableRunLength = 0;
+ } else {
+ fixedRunLength = 0;
+ variableRunLength = 2;
+ }
+ } else {
+ int64_t currentDelta = val - literals[numLiterals - 1];
+ if (prevDelta == 0 && currentDelta == 0) {
+ // fixed delta run
+ // (val repeats the two values before it)
+ literals[numLiterals++] = val;
+
+ // if variable run is non-zero then we are seeing repeating
+ // values at the end of variable run in which case keep
+ // updating variable and fixed runs
+ if (variableRunLength > 0) {
+ fixedRunLength = 2;
+ }
+ fixedRunLength += 1;
+
+ // if fixed run met the minimum condition and if variable
+ // run is non-zero then flush the variable run and shift the
+ // tail fixed runs to start of the buffer
+ if (fixedRunLength >= MIN_REPEAT && variableRunLength > 0) {
+ numLiterals -= MIN_REPEAT;
+ variableRunLength -= MIN_REPEAT - 1;
+ // copy the tail fixed runs
+ std::vector<int64_t> tailVals(MIN_REPEAT);
+ assert(literals.size() > numLiterals);
+ // Save the repeated tail before the variable part is written out.
+ memcpy(tailVals.data(), &literals[numLiterals],
+ MIN_REPEAT * sizeof(int64_t));
+
+ // tailVals now holds the last MIN_REPEAT buffered values.
+
+ // determine variable encoding and flush values
+ determineEncoding();
+ writeValues();
+
+ // shift tail fixed runs to beginning of the buffer
+ for (int64_t l : tailVals) {  // presumably numLiterals is 0 after writeValues() - verify
+ literals[numLiterals++] = l;
+ }
+ }
+
+ // if fixed runs reached max repeat length then write values
+ if (fixedRunLength == MAX_SCOPE) {
+ determineEncoding();
+ writeValues();
+ }
+ } else {
+ // variable delta run
+
+ // if fixed run length is non-zero and if it satisfies the
+ // short repeat conditions then write the values as short repeats
+ // else use delta encoding
+ if (fixedRunLength >= MIN_REPEAT) {
+ if (fixedRunLength <= MAX_SHORT_REPEAT_LENGTH) {
+ encoding = EncodingType::SHORT_REPEAT;
+ writeValues();
+ } else {
+ encoding = EncodingType::DELTA;
+ isFixedDelta = true;
+ writeValues();
+ }
+ }
+
+ // if fixed run length is <MIN_REPEAT and current value is
+ // different from previous then treat it as variable run
+ if (fixedRunLength > 0 && fixedRunLength < MIN_REPEAT) {
+ if (val != literals[numLiterals - 1]) {
+ variableRunLength = fixedRunLength;
+ fixedRunLength = 0;
+ }
+ }
+
+ // after writing values re-initialize the variables
+ if (numLiterals == 0) {
+ initializeLiterals(val);
+ } else {
+ // keep updating variable run lengths
+ prevDelta = val - literals[numLiterals - 1];
+ literals[numLiterals++] = val;
+ variableRunLength += 1;
+
+ // if variable run length reach the max scope, write it
+ if (variableRunLength == MAX_SCOPE) {
+ determineEncoding();
+ writeValues();
+ }
+ }
+ }
+ }
+ }
+ }
+
+ // Compute the bit width needed to represent the pth percentile of
+ // data[offset, offset + length).
+ // @param p - percentile; accepted range is 0.0 < p <= 1.0
+ // @return pth percentile bits; -1 if p is rejected, 0 if no bucket is hit
+ int32_t percentileBits(int64_t *data, int32_t offset, int32_t length,
+                        double p) {
+   if ((p <= 0.0) || (p > 1.0)) {
+     return -1;
+   }
+
+   // One bucket per FixedBitSizes code (32 in total). Each bucket counts
+   // the values whose closest bit width is encoded by that code.
+   std::vector<int32_t> buckets(32);
+   const int32_t end = offset + length;
+   for (int32_t pos = offset; pos < end; ++pos) {
+     const int32_t code = encodeBitWidth(findClosestNumBits(data[pos]));
+     buckets[code] += 1;
+   }
+
+   // How many of the widest values may exceed the reported width.
+   int32_t budget = (int32_t)(length * (1.0 - p));
+
+   // Walk from the widest bucket down; the bucket that overdraws the budget
+   // is the percentile's bucket.
+   int32_t code = static_cast<int32_t>(buckets.size());
+   while (--code >= 0) {
+     budget -= buckets[code];
+     if (budget < 0) {
+       return decodeBitWidth(code);
+     }
+   }
+   return 0;
+ }
+
+ // True when left - right fits in int64_t; computed without signed overflow.
+ bool isSafeSubtract(int64_t left, int64_t right) {
+   const uint64_t l = left, r = right;  // unsigned: wraps instead of UB
+   return (left ^ right) >= 0 || (left ^ static_cast<int64_t>(l - r)) >= 0;
+ }
+
+ // Picks the RLE v2 sub-encoding for the buffered literals and stores it in
+ // `encoding`, together with the statistics the matching write*Values()
+ // method reads (zzBits100p, min, isFixedDelta, fixedDelta, bitsDeltaMax,
+ // adjDeltas, brBits95p/brBits100p and the patch blob).
+ //
+ // Decision order:
+ //   1. numLiterals <= MIN_REPEAT                      -> DIRECT
+ //   2. max - min overflows                            -> DIRECT
+ //   3. all values equal, or constant delta            -> DELTA (fixed)
+ //   4. monotonic with a non-zero first delta          -> DELTA
+ //   5. zigzag 100th/90th percentile widths differ by more than one bit,
+ //      base-reduced 100th/95th widths differ, and the base value fits the
+ //      PATCHED_BASE header                             -> PATCHED_BASE
+ //   6. otherwise                                      -> DIRECT
+ void determineEncoding() {
+   // we need to compute zigzag values for DIRECT encoding if we decide to
+   // break early for delta overflows or for shorter runs
+   computeZigZagLiterals();
+
+   zzBits100p = percentileBits(zigzagLiterals.data(), 0, numLiterals, 1.0);
+
+   // not a big win for shorter runs to determine encoding
+   if (numLiterals <= MIN_REPEAT) {
+     encoding = EncodingType::DIRECT;
+     return;
+   }
+
+   // DELTA encoding check
+
+   // for identifying monotonic sequences
+   bool isIncreasing = true;
+   bool isDecreasing = true;
+   this->isFixedDelta = true;
+
+   this->min = literals[0];
+   int64_t max = literals[0];
+   int64_t initialDelta = literals[1] - literals[0];
+   int64_t currDelta = initialDelta;
+   int64_t deltaMax = initialDelta;
+   // slot 0 keeps the signed first delta; later slots keep absolute deltas
+   this->adjDeltas[0] = initialDelta;
+
+   for (int32_t i = 1; i < numLiterals; i++) {
+     int64_t l1 = literals[i];
+     int64_t l0 = literals[i - 1];
+     currDelta = l1 - l0;
+     min = std::min(min, l1);
+     max = std::max(max, l1);
+
+     isIncreasing &= (l0 <= l1);
+     isDecreasing &= (l0 >= l1);
+
+     isFixedDelta &= (currDelta == initialDelta);
+     if (i > 1) {
+       adjDeltas[i - 1] = std::abs(currDelta);
+       deltaMax = std::max(deltaMax, adjDeltas[i - 1]);
+     }
+   }
+
+   // its faster to exit under delta overflow condition without checking for
+   // PATCHED_BASE condition as encoding using DIRECT is faster and has less
+   // overhead than PATCHED_BASE
+   if (!isSafeSubtract(max, min)) {
+     encoding = EncodingType::DIRECT;
+     return;
+   }
+
+   // invariant - subtracting any number from any other in the literals after
+   // this point won't overflow
+
+   // if min is equal to max then the delta is 0, this condition happens for
+   // fixed values run >10 which cannot be encoded with SHORT_REPEAT
+   if (min == max) {
+     assert(isFixedDelta == true);
+     assert(currDelta == 0);
+     fixedDelta = 0;
+     encoding = EncodingType::DELTA;
+     return;
+   }
+
+   if (isFixedDelta) {
+     assert(currDelta == initialDelta);
+     encoding = EncodingType::DELTA;
+     fixedDelta = currDelta;
+     return;
+   }
+
+   // if initialDelta is 0 then we cannot delta encode as we cannot identify
+   // the sign of deltas (increasing or decreasing)
+   if (initialDelta != 0) {
+     // stores the number of bits required for packing delta blob in
+     // delta encoding
+     bitsDeltaMax = findClosestNumBits(deltaMax);
+
+     // monotonic condition
+     if (isIncreasing || isDecreasing) {
+       encoding = EncodingType::DELTA;
+       return;
+     }
+   }
+
+   // PATCHED_BASE encoding check
+
+   // percentile values are computed for the zigzag encoded values. if the
+   // number of bit requirement between 90th and 100th percentile varies
+   // beyond a threshold then we need to patch the values. if the variation
+   // is not significant then we can use direct encoding
+
+   zzBits90p = percentileBits(zigzagLiterals.data(), 0, numLiterals, 0.9);
+   int32_t diffBitsLH = zzBits100p - zzBits90p;
+
+   // if the difference between 90th percentile and 100th percentile fixed
+   // bits is <= 1 the patch length would be negligible: use DIRECT
+   if (diffBitsLH <= 1) {
+     encoding = EncodingType::DIRECT;
+     return;
+   }
+
+   // patching is done only on base reduced values.
+   // remove base from literals
+   for (int32_t i = 0; i < numLiterals; i++) {
+     baseRedLiterals[i] = literals[i] - min;
+   }
+
+   // 95th percentile width is used to determine max allowed value
+   // after which patching will be done
+   brBits95p = percentileBits(baseRedLiterals.data(), 0, numLiterals, 0.95);
+
+   // 100th percentile is used to compute the max patch width
+   brBits100p = percentileBits(baseRedLiterals.data(), 0, numLiterals, 1.0);
+
+   // The PATCHED_BASE header has 3 bits for the byte count of the base, so
+   // the base (|min| plus one sign bit) must fit in 8 bytes, i.e.
+   // |min| < 2^56. findClosestNumBits() rounds anything wider than 56 bits
+   // up to 64, which would make writePatchedBaseValues() need 9 bytes and
+   // overflow that header field. Written as two comparisons so that no
+   // negation of min is needed.
+   const int64_t baseValueLimit = 1LL << 56;
+   const bool baseFitsHeader =
+       (min > -baseValueLimit) && (min < baseValueLimit);
+
+   // after base reducing the values, if the difference in bits between
+   // 95th percentile and 100th percentile value is zero then there
+   // is no point in patching the values, in which case we will
+   // fallback to DIRECT encoding.
+   // The decision to use patched base was based on zigzag values, but the
+   // actual patching is done on base reduced literals.
+   if ((brBits100p - brBits95p) != 0 && baseFitsHeader) {
+     encoding = EncodingType::PATCHED_BASE;
+     preparePatchedBlob();
+     return;
+   }
+
+   encoding = EncodingType::DIRECT;
+ }
+
+ // Fills zigzagLiterals[0 .. numLiterals): signed streams are zigzag encoded,
+ // unsigned streams are copied through unchanged.
+ void computeZigZagLiterals() {
+   if (isSigned) {
+     for (int32_t pos = 0; pos < numLiterals; ++pos) {
+       zigzagLiterals[pos] = zigzagEncode(literals[pos]);
+     }
+   } else {
+     for (int32_t pos = 0; pos < numLiterals; ++pos) {
+       zigzagLiterals[pos] = literals[pos];
+     }
+   }
+ }
+
+ // Builds the patch blob for PATCHED_BASE encoding.
+ //
+ // Every base-reduced literal wider than brBits95p bits is truncated to its
+ // low brBits95p bits; the stripped high bits (the patch) and the distance
+ // to the previously patched position (the gap) are packed into one entry
+ // of gapVsPatchList as (gap << patchWidth) | patch.
+ // Updates patchWidth, patchGapWidth, patchLength, gapVsPatchList,
+ // baseRedLiterals and, when the patch would need 64 bits, brBits95p.
+ void preparePatchedBlob() {
+   // mask will be max value beyond which patch will be generated
+   int64_t mask = (1LL << brBits95p) - 1;
+
+   // since we are considering only 95 percentile, the size of gap and
+   // patch array can contain only be 5% values
+   patchLength = (int32_t)std::ceil((numLiterals * 0.05));
+
+   std::vector<int32_t> gapList(patchLength);
+   std::vector<int64_t> patchList(patchLength);
+
+   // #bit for patch
+   patchWidth = brBits100p - brBits95p;
+   patchWidth = getClosestFixedBits(patchWidth);
+
+   // if patch bit requirement is 64 then it will not possible to pack
+   // gap and patch together in a long. To make sure gap and patch can be
+   // packed together adjust the patch width
+   if (patchWidth == 64) {
+     patchWidth = 56;
+     brBits95p = 8;
+     mask = (1LL << brBits95p) - 1;
+   }
+
+   int32_t gapIdx = 0;
+   int32_t patchIdx = 0;
+   int32_t prev = 0;
+   int32_t gap = 0;
+   int32_t maxGap = 0;
+
+   for (int32_t i = 0; i < numLiterals; i++) {
+     // if value is above mask then create the patch and record the gap
+     if (baseRedLiterals[i] > mask) {
+       gap = i - prev;
+       if (gap > maxGap) {
+         maxGap = gap;
+       }
+
+       // gaps are relative, so store the previous patched value index
+       prev = i;
+       gapList[gapIdx++] = gap;
+
+       // extract the most significant bits that are over mask bits
+       int64_t patch = ((uint64_t)baseRedLiterals[i] >> brBits95p);
+       patchList[patchIdx++] = patch;
+
+       // strip off the MSB to enable safe bit packing
+       baseRedLiterals[i] &= mask;
+     }
+   }
+
+   // adjust the patch length to number of entries in gap list
+   patchLength = gapIdx;
+
+   // if the element to be patched is the first and only element then
+   // max gap will be 0, but to store the gap as 0 we need atleast 1 bit
+   if (maxGap == 0 && patchLength != 0) {
+     patchGapWidth = 1;
+   } else {
+     patchGapWidth = findClosestNumBits(maxGap);
+   }
+
+   // special case: if the patch gap width is greater than 256, then
+   // we need 9 bits to encode the gap width. But we only have 3 bits in
+   // header to record the gap width. To deal with this case, we will save
+   // two entries in patch list in the following way
+   // 256 gap width => 0 for patch value
+   // actual gap - 256 => actual patch value
+   // We will do the same for gap width = 511. If the element to be patched is
+   // the last element in the scope then gap width will be 511. In this case we
+   // will have 3 entries in the patch list in the following way
+   // 255 gap width => 0 for patch value
+   // 255 gap width => 0 for patch value
+   // 1 gap width => actual patch value
+   if (patchGapWidth > 8) {
+     patchGapWidth = 8;
+     // for gap = 511, we need two additional entries in patch list
+     if (maxGap == 511) {
+       patchLength += 2;
+     } else {
+       patchLength += 1;
+     }
+   }
+
+   // create gap vs patch list
+   gapIdx = 0;
+   patchIdx = 0;
+   gapVsPatchList.resize(patchLength);
+   for (int32_t i = 0; i < patchLength; i++) {
+     int64_t g = gapList[gapIdx++];
+     int64_t p = patchList[patchIdx++];
+     while (g > 255) {
+       // filler entry: maximum gap with a zero patch. The literal must be
+       // 64 bits wide (LL) because patchWidth can be as large as 56 and
+       // `long` is only 32 bits on some platforms.
+       gapVsPatchList[i++] = (255LL << patchWidth);
+       g -= 255;
+     }
+
+     // store patch value in LSBs and gap in MSBs
+     gapVsPatchList[i] = (g << patchWidth) | p;
+   }
+ }
+
+ // Emits the buffered literals with the writer that matches `encoding` and
+ // then resets the per-run state. Does nothing when no literal is buffered.
+ void writeValues() {
+   if (numLiterals == 0) return;
+
+   switch (encoding) {
+     case EncodingType::SHORT_REPEAT:
+       writeShortRepeatValues();
+       break;
+     case EncodingType::DIRECT:
+       writeDirectValues();
+       break;
+     case EncodingType::PATCHED_BASE:
+       writePatchedBaseValues();
+       break;
+     default:
+       // every other encoding value is written as DELTA
+       writeDeltaValues();
+       break;
+   }
+
+   // clear all the variables
+   clear();
+ }
+
+ // Rounds a bit width up to the closest aligned width: 1, 2, 4, 8 or a
+ // multiple of 8 up to 64. Widths outside 0..56 (including negative ones)
+ // map to 64.
+ // @param n requested bit width
+ // @return closest aligned bit width
+ int32_t getClosestAlignedFixedBits(int32_t n) {
+   if (n < 0 || n > 56) return 64;
+   if (n <= 1) return 1;
+   if (n <= 2) return 2;
+   if (n <= 4) return 4;
+   if (n <= 8) return 8;
+   // 9..56: round up to a whole number of bytes
+   return ((n + 7) / 8) * 8;
+ }
+
+ // Maps a bit width to the closest available fixed bit width that is not
+ // smaller: 1..24 are available as is, 25..32 round up to the next even
+ // width, 33..56 round up to a multiple of 8, everything else (including
+ // negative input) becomes 64. A width of 0 is reported as 1.
+ // @param n requested bit width
+ // @return closest valid fixed bit width
+ int32_t getClosestFixedBits(int32_t n) {
+   if (n < 0 || n > 56) return 64;
+   if (n == 0) return 1;
+   if (n <= 24) return n;
+   if (n <= 32) return n + (n & 1);  // 26, 28, 30, 32
+   return ((n + 7) / 8) * 8;         // 40, 48, 56
+ }
+
+ // Normalizes n with getClosestFixedBits() and returns the ordinal used to
+ // store that width: widths 1..24 encode as width - 1, the wider ones use
+ // the matching FixedBitSizes::FBS enumerator.
+ // @param n Fixed bit width to encode
+ // @return Encoded fixed bit width
+ int32_t encodeBitWidth(int32_t n) {
+   const int32_t width = getClosestFixedBits(n);
+
+   if (width >= 1 && width <= 24) return width - 1;
+
+   // getClosestFixedBits() only yields the widths listed below past 24
+   switch (width) {
+     case 26:
+       return FixedBitSizes::FBS::TWENTYSIX;
+     case 28:
+       return FixedBitSizes::FBS::TWENTYEIGHT;
+     case 30:
+       return FixedBitSizes::FBS::THIRTY;
+     case 32:
+       return FixedBitSizes::FBS::THIRTYTWO;
+     case 40:
+       return FixedBitSizes::FBS::FORTY;
+     case 48:
+       return FixedBitSizes::FBS::FORTYEIGHT;
+     case 56:
+       return FixedBitSizes::FBS::FIFTYSIX;
+     default:
+       return FixedBitSizes::FBS::SIXTYFOUR;
+   }
+ }
+
+ // Positions the current encoding in the two most significant bits of a
+ // header byte (encoding shifted left by 6).
+ // @return opcode
+ int32_t getOpcode() {
+   const int32_t opcode = encoding;
+   return opcode << 6;
+ }
+
+ // Writes the buffered literals using DELTA encoding:
+ //   2 header bytes (opcode | encoded delta width | 9-bit run length - 1),
+ //   the first literal as a varint, the first delta as a signed varint and,
+ //   unless the delta is fixed, the remaining absolute deltas bit packed
+ //   with `fb` bits each.
+ // Resets the run length counter it consumed (fixedRunLength or
+ // variableRunLength) to 0.
+ void writeDeltaValues() {
+ int32_t len = 0;
+ int32_t fb = bitsDeltaMax;
+ // encoded width stays 0 for fixed delta runs (no delta blob follows)
+ int32_t efb = 0;
+
+ if (alignedBitpacking) {
+ fb = getClosestAlignedFixedBits(fb);
+ }
+
+ if (isFixedDelta) {
+ // if fixed run length is greater than threshold then it will be fixed
+ // delta sequence with delta value 0 else fixed delta sequence with
+ // non-zero delta value
+ if (fixedRunLength > MIN_REPEAT) {
+ // ex. sequence: 2 2 2 2 2 2 2 2
+ len = fixedRunLength - 1;
+ fixedRunLength = 0;
+ } else {
+ // ex. sequence: 4 6 8 10 12 14 16
+ len = variableRunLength - 1;
+ variableRunLength = 0;
+ }
+ } else {
+ // fixed width 0 is used for long repeating values.
+ // sequences that require only 1 bit to encode will have an additional bit
+ if (fb == 1) {
+ fb = 2;
+ }
+ efb = encodeBitWidth(fb);
+ // bit 0 of the first header byte is taken by the 9th run length bit
+ efb = efb << 1;
+ len = variableRunLength - 1;
+ variableRunLength = 0;
+ }
+
+ // extract the 9th bit of run length
+ int32_t tailBits = (uint32_t)(len & 0x100) >> 8;
+
+ // create first byte of the header
+ int32_t headerFirstByte = getOpcode() | efb | tailBits;
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ int32_t headerSecondByte = len & 0xff;
+
+ // write header
+ output->writeByte(headerFirstByte);
+ output->writeByte(headerSecondByte);
+
+ // store the first literal (taken from `literals`, not from the zigzag
+ // array): writeInt64 zigzag encodes it for signed streams, unsigned
+ // streams write it as is
+ if (isSigned) {
+ writeInt64(output.get(), literals[0]);
+ } else {
+ writeUInt64(output.get(), literals[0]);
+ }
+
+ if (isFixedDelta) {
+ // if delta is fixed then we don't need to store delta blob
+ writeInt64(output.get(), fixedDelta);
+ } else {
+ // store the first value as delta value using zigzag encoding
+ writeInt64(output.get(), adjDeltas[0]);
+
+ // adjacent delta values are bit packed. The length of adjDeltas array is
+ // always one less than the number of literals (delta difference for n
+ // elements is n-1). We have already written one element, write the
+ // remaining numLiterals - 2 elements here
+
+ writeInts(adjDeltas.data(), 1, numLiterals - 2, fb, output.get());
+ }
+ }
+
+ // Counts the bits up to and including the highest set bit of `value`
+ // (treated as unsigned) and rounds the count to the closest available
+ // fixed bit width.
+ // @param value
+ // @return bits required to store value
+ int32_t findClosestNumBits(int64_t value) {
+   int32_t bits = 0;
+   for (uint64_t rest = (uint64_t)value; rest != 0; rest >>= 1) {
+     ++bits;
+   }
+   return getClosestFixedBits(bits);
+ }
+
+ // Writes the buffered literals using PATCHED_BASE encoding:
+ //   4 header bytes, the base value (`min`) in big endian order with its
+ //   sign stored in the most significant bit, the base reduced literals bit
+ //   packed with brBits95p bits, then the gap/patch list.
+ // Modifies `min` in place (sign toggled, sign bit set) and resets
+ // variableRunLength to 0.
+ void writePatchedBaseValues() {
+ // NOTE: Aligned bit packing cannot be applied for PATCHED_BASE encoding
+ // because patch is applied to MSB bits. For example: If fixed bit width of
+ // base value is 7 bits and if patch is 3 bits, the actual value is
+ // constructed by shifting the patch to left by 7 positions.
+ // actual_value = patch << 7 | base_value
+ // So, if we align base_value then actual_value can not be reconstructed.
+
+ // write the number of fixed bits required in next 5 bits
+ int32_t fb = brBits95p;
+ int32_t efb = encodeBitWidth(fb) << 1;
+
+ // adjust variable run length, they are one off
+ variableRunLength -= 1;
+
+ // extract the 9th bit of run length
+ int32_t tailBits = (uint32_t)(variableRunLength & 0x100) >> 8;
+
+ // create first byte of the header
+ int32_t headerFirstByte = getOpcode() | efb | tailBits;
+
+ // second byte of the header stores the remaining 8 bits of runlength
+ int32_t headerSecondByte = variableRunLength & 0xff;
+
+ // if the min value is negative toggle the sign
+ bool isNegative = min < 0 ? true : false;
+ if (isNegative) {
+ min = -min;
+ }
+
+ // find the number of bytes required for base and shift it by 5 bits
+ // to accommodate patch width. The additional bit is used to store the sign
+ // of the base value.
+ // NOTE(review): assumes |min| needs at most 56 bits; for wider values
+ // findClosestNumBits returns 64, baseBytes becomes 9 and
+ // (baseBytes - 1) << 5 no longer fits the 3-bit field - confirm the caller
+ // guarantees this.
+ int32_t baseWidth = findClosestNumBits(min) + 1;
+ int32_t baseBytes =
+ baseWidth % 8 == 0 ? baseWidth / 8 : (baseWidth / 8) + 1;
+ int32_t bb = (baseBytes - 1) << 5;
+
+ // if the base value is negative then set MSB to 1
+ if (isNegative) {
+ min |= (1LL << ((baseBytes * 8) - 1));
+ }
+
+ // third byte contains 3 bits for number of bytes occupied by base
+ // and 5 bits for patchWidth
+ int32_t headerThirdByte = bb | encodeBitWidth(patchWidth);
+
+ // fourth byte contains 3 bits for patch gap width and 5 bits for
+ // patch length
+ int32_t headerFourthByte = (patchGapWidth - 1) << 5 | patchLength;
+
+ // write header
+ output->writeByte(headerFirstByte);
+ output->writeByte(headerSecondByte);
+ output->writeByte(headerThirdByte);
+ output->writeByte(headerFourthByte);
+
+ // write the base value using fixed bytes in big endian order
+ for (int32_t i = baseBytes - 1; i >= 0; i--) {
+ int8_t b = (int8_t)(((uint64_t)min >> (i * 8)) & 0xff);
+ output->writeByte(b);
+ }
+
+ // base reduced literals are bit packed
+ int32_t closestFixedBits = getClosestFixedBits(fb);
+
+ writeInts(baseRedLiterals.data(), 0, numLiterals, closestFixedBits,
+ output.get());
+
+ // write patch list; each entry holds the gap in the bits above the patch
+ closestFixedBits = getClosestFixedBits(patchGapWidth + patchWidth);
+
+ writeInts(gapVsPatchList.data(), 0, gapVsPatchList.size(), closestFixedBits,
+ output.get());
+
+ // reset run length
+ variableRunLength = 0;
+ }
+
+ // Writes the buffered literals using DIRECT encoding: a two byte header
+ // (opcode | encoded bit width | 9-bit run length - 1) followed by the
+ // zigzag literals bit packed with the 100th percentile width.
+ // Resets variableRunLength to 0.
+ void writeDirectValues() {
+   // width of every packed value, optionally rounded to an aligned width
+   int32_t bitWidth = zzBits100p;
+   if (alignedBitpacking) bitWidth = getClosestAlignedFixedBits(bitWidth);
+
+   // encoded width sits in bits 1..5 of the first header byte
+   const int32_t encodedWidth = encodeBitWidth(bitWidth) << 1;
+
+   // the header stores the run length minus one
+   variableRunLength -= 1;
+
+   // bit 0 of the first byte is the 9th bit of the run length, the second
+   // byte carries its low 8 bits
+   const int32_t ninthBit = (uint32_t)(variableRunLength & 0x100) >> 8;
+   const int32_t firstHeaderByte = getOpcode() | encodedWidth | ninthBit;
+   const int32_t secondHeaderByte = variableRunLength & 0xff;
+
+   output->writeByte(firstHeaderByte);
+   output->writeByte(secondHeaderByte);
+
+   // payload: bit packed zigzag literals
+   writeInts(zigzagLiterals.data(), 0, numLiterals, bitWidth, output.get());
+
+   variableRunLength = 0;
+ }
+
+ // Writes a SHORT_REPEAT run: one header byte (opcode | value byte count - 1
+ // | run length - MIN_REPEAT) followed by the repeated value in big endian
+ // byte order. Resets fixedRunLength to 0.
+ void writeShortRepeatValues() {
+   // the repeated value; zigzag encoded for signed streams
+   int64_t repeated = literals[0];
+   if (isSigned) {
+     repeated = zigzagEncode(literals[0]);
+   }
+
+   // bytes needed for the value: bit count rounded up to whole bytes
+   const int32_t valueBits = findClosestNumBits(repeated);
+   const int32_t valueBytes = (valueBits % 8 == 0)
+                                  ? (uint32_t)valueBits >> 3
+                                  : ((uint32_t)valueBits >> 3) + 1;
+
+   // run length is stored relative to MIN_REPEAT
+   fixedRunLength -= MIN_REPEAT;
+
+   // header: opcode in the top 2 bits, byte count - 1 next, run length last
+   int32_t header = getOpcode();
+   header |= ((valueBytes - 1) << 3);
+   header |= fixedRunLength;
+   output->writeByte(header);
+
+   // most significant byte first
+   int32_t byteIdx = valueBytes;
+   while (byteIdx-- > 0) {
+     const int32_t oneByte =
+         (int32_t)(((uint64_t)repeated >> (byteIdx * 8)) & 0xff);
+     output->writeByte(oneByte);
+   }
+
+   fixedRunLength = 0;
+ }
+
+ // Resets the per-run state after a run has been written. The run length
+ // counters and the literal buffers themselves are left untouched here.
+ void clear() {
+   // buffered run
+   numLiterals = 0;
+   encoding = EncodingType::UNKNOWN;
+
+   // delta statistics
+   prevDelta = 0;
+   fixedDelta = 0;
+   bitsDeltaMax = 0;
+   isFixedDelta = true;
+
+   // percentile bit widths
+   zzBits90p = 0;
+   zzBits100p = 0;
+   brBits95p = 0;
+   brBits100p = 0;
+
+   // patched base state
+   patchGapWidth = 0;
+   patchLength = 0;
+   patchWidth = 0;
+   gapVsPatchList.clear();
+   min = 0;
+ }
+
+ // Appends `val` to the literal buffer and restarts both run length
+ // counters at 1.
+ void initializeLiterals(int64_t val) {
+   literals[numLiterals] = val;
+   numLiterals += 1;
+   fixedRunLength = variableRunLength = 1;
+ }
+
+ private:
+ // stream that receives the headers and the packed values
+ std::unique_ptr<SeekableOutputStream> output;
+ // when true, literals are zigzag encoded before they are packed
+ const bool isSigned = false;
+
+ // NOTE(review): presumably the maximum number of literals per run - confirm
+ const int32_t MAX_SCOPE = 512;
+ // NOTE(review): presumably the longest run SHORT_REPEAT can hold - confirm
+ const int32_t MAX_SHORT_REPEAT_LENGTH = 10;
+ int64_t prevDelta = 0;
+ // length of the current run of repeated / non repeated values; consumed
+ // and reset by the write*Values() methods
+ int32_t fixedRunLength = 0;
+ int32_t variableRunLength = 0;
+ // values buffered for the current run; numLiterals of them are valid
+ std::vector<int64_t> literals;
+ EncodingType encoding = EncodingType::UNKNOWN;
+ int32_t numLiterals = 0;
+
+ // scratch buffers filled by determineEncoding(): zigzag literals,
+ // literals minus `min`, and the deltas between adjacent literals
+ std::vector<int64_t> zigzagLiterals;
+ std::vector<int64_t> baseRedLiterals;
+ std::vector<int64_t> adjDeltas;
+ int64_t fixedDelta = 0;
+ // percentile bit widths of the zigzag (zz) and base reduced (br) literals
+ int32_t zzBits90p = 0;
+ int32_t zzBits100p = 0;
+ int32_t brBits95p = 0;
+ int32_t brBits100p = 0;
+ // bits needed to pack the largest delta
+ int32_t bitsDeltaMax = 0;
+ // PATCHED_BASE state produced by preparePatchedBlob()
+ int32_t patchWidth = 0;
+ int32_t patchGapWidth = 0;
+ int32_t patchLength = 0;
+ std::vector<int64_t> gapVsPatchList;
+ // smallest buffered literal; used as the base of PATCHED_BASE
+ int64_t min = 0;
+ bool isFixedDelta = false;
+ // when true, DIRECT and DELTA round their bit width to an aligned width
+ bool alignedBitpacking = false;
+};
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_V2_H_
diff --git a/depends/storage/src/storage/format/orc/rle.cc b/depends/storage/src/storage/format/orc/rle.cc
new file mode 100644
index 0000000..20dcd95
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/rle.cc
@@ -0,0 +1,139 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/rle.h"
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/rle-v0.h"
+#include "storage/format/orc/rle-v1.h"
+#include "storage/format/orc/rle-v2.h"
+
+namespace orc {
+
+// Defined out of line on purpose: the destructor must be non-inline.
+RleDecoder::~RleDecoder() = default;
+
+// Creates the RLE decoder for the given RLE version and integer type.
+// `type` selects the element type of the decoder (LONG -> int64_t,
+// INT -> int32_t, SHORT -> int16_t); any other type or version is reported
+// through LOG_ERROR.
+// NOTE(review): relies on LOG_ERROR not returning (presumably it throws);
+// otherwise control would fall through into the next case and finally off
+// the end of the function without a return value - confirm.
+// NOTE(review): the V1 decoders use uint64_t as second template argument for
+// every type while the V2 decoders use the matching unsigned width - confirm
+// this is intended.
+std::unique_ptr<RleDecoder> createRleDecoder(
+ std::unique_ptr<SeekableInputStream> input, bool isSigned,
+ RleVersion version, dbcommon::MemoryPool& pool, // NOLINT
+ ORCTypeKind type) {
+ switch (static_cast<int64_t>(version)) {
+ case RleVersion_1:
+ // We don't have std::make_unique() yet.
+ switch (type) {
+ case LONG:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV1<int64_t, uint64_t>(std::move(input), isSigned));
+ case INT:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV1<int32_t, uint64_t>(std::move(input), isSigned));
+ case SHORT:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV1<int16_t, uint64_t>(std::move(input), isSigned));
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+ case RleVersion_2:
+ // only the V2 decoders take the memory pool
+ switch (type) {
+ case LONG:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV2<int64_t, uint64_t>(std::move(input), isSigned,
+ pool));
+ case INT:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV2<int32_t, uint32_t>(std::move(input), isSigned,
+ pool));
+ case SHORT:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV2<int16_t, uint16_t>(std::move(input), isSigned,
+ pool));
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+ case RleVersion_0:
+ // the V0 decoders ignore isSigned and the memory pool
+ switch (type) {
+ case LONG:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV0<int64_t>(std::move(input)));
+ case INT:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV0<int32_t>(std::move(input)));
+ case SHORT:
+ return std::unique_ptr<RleDecoder>(
+ new RleDecoderV0<int16_t>(std::move(input)));
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+}
+
+// Creates the RLE coder for the given RLE version and integer type. Every
+// coder gets its own block compressor created from `kind`.
+// `type` selects the element type of the coder (LONG -> int64_t,
+// INT -> int32_t, SHORT -> int16_t); any other type or version is reported
+// through LOG_ERROR. `isSigned` is ignored by the V0 coders and
+// `alignedBitpacking` is only passed to the V2 coders.
+// NOTE(review): relies on LOG_ERROR not returning (presumably it throws);
+// otherwise control would fall through into the next case and finally off
+// the end of the function without a return value - confirm.
+std::unique_ptr<RleCoder> createRleCoder(bool isSigned, RleVersion version,
+ ORCTypeKind type, CompressionKind kind,
+ bool alignedBitpacking) { // NOLINT
+ switch (static_cast<int64_t>(version)) {
+ case RleVersion_1:
+ // We don't have std::make_unique() yet.
+ switch (type) {
+ case LONG:
+ return std::unique_ptr<RleCoder>(
+ new RleCoderV1<int64_t>(createBlockCompressor(kind), isSigned));
+ case INT:
+ return std::unique_ptr<RleCoder>(
+ new RleCoderV1<int32_t>(createBlockCompressor(kind), isSigned));
+ case SHORT:
+ return std::unique_ptr<RleCoder>(
+ new RleCoderV1<int16_t>(createBlockCompressor(kind), isSigned));
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+ case RleVersion_2:
+ switch (type) {
+ case LONG:
+ return std::unique_ptr<RleCoder>(new RleCoderV2<int64_t>(
+ createBlockCompressor(kind), isSigned, alignedBitpacking));
+ case INT:
+ return std::unique_ptr<RleCoder>(new RleCoderV2<int32_t>(
+ createBlockCompressor(kind), isSigned, alignedBitpacking));
+ case SHORT:
+ return std::unique_ptr<RleCoder>(new RleCoderV2<int16_t>(
+ createBlockCompressor(kind), isSigned, alignedBitpacking));
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+ case RleVersion_0:
+ switch (type) {
+ case LONG:
+ return std::unique_ptr<RleCoder>(
+ new RleCoderV0<int64_t>(createBlockCompressor(kind)));
+ case INT:
+ return std::unique_ptr<RleCoder>(
+ new RleCoderV0<int32_t>(createBlockCompressor(kind)));
+ case SHORT:
+ return std::unique_ptr<RleCoder>(
+ new RleCoderV0<int16_t>(createBlockCompressor(kind)));
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Not implemented yet");
+ }
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/rle.h b/depends/storage/src/storage/format/orc/rle.h
new file mode 100644
index 0000000..f9be04c
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/rle.h
@@ -0,0 +1,596 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_H_
+
+#include <memory>
+#include <vector>
+
+#include "storage/format/orc/seekable-input-stream.h"
+#include "storage/format/orc/seekable-output-stream.h"
+
+namespace orc {
+
+// Decodes a zigzag encoded value: even inputs map to value / 2, odd inputs
+// to -(value + 1) / 2 (0 -> 0, 1 -> -1, 2 -> 1, 3 -> -2, ...).
+inline int64_t unZigZag(uint64_t value) {
+  const uint64_t magnitude = value >> 1;
+  // all ones when the low bit is set, zero otherwise
+  const uint64_t signMask = -(value & 1);
+  return magnitude ^ signMask;
+}
+
+// Abstract interface of the run length decoders returned by
+// createRleDecoder().
+class RleDecoder {
+ public:
+ // must be non-inline!
+ virtual ~RleDecoder();
+
+ // Seek to a particular spot.
+ // @param the position provider describing where to seek to
+ virtual void seek(PositionProvider &) = 0;
+
+ // Skip over a given number of values.
+ // @param numValues the number of values to skip
+ virtual void skip(uint64_t numValues) = 0;
+
+ // Read a number of values into the batch.
+ // @param data the array to read into
+ // @param numValues the number of values to read
+ // @param notNull If the pointer is null, all values are read. If the
+ // pointer is not null, positions that are false are skipped.
+ virtual void next(void *data, uint64_t numValues, const char *notNull) = 0;
+};
+
+// RLE format versions understood by createRleDecoder() / createRleCoder().
+// Note the declaration order: RleVersion_1 is 0, RleVersion_2 is 1 and
+// RleVersion_0 is 2.
+enum RleVersion { RleVersion_1, RleVersion_2, RleVersion_0 };
+
+// Create an RLE decoder.
+// @param input The input stream to read from
+// @param isSigned True if the number sequence is signed
+// @param version Version of RLE decoding to do
+// @param pool Memory pool to use for allocation
+// @param type Integer type of the decoded values; defaults to LONG
+// @return The RLE decoder
+std::unique_ptr<RleDecoder> createRleDecoder(
+ std::unique_ptr<SeekableInputStream> input, bool isSigned,
+ RleVersion version, dbcommon::MemoryPool &pool, // NOLINT
+ ORCTypeKind type = LONG);
+
+class RleCoder {
+ public:
+ // Sizes the scratch write buffer to BUFFER_SIZE entries.
+ RleCoder() { this->writeBuffer.resize(BUFFER_SIZE); }
+ // Virtual so that coders can be destroyed through a RleCoder pointer.
+ virtual ~RleCoder() {}
+
+ // Write a number of values out.
+ // @param data The array to write
+ // @param numValues The number of values to write
+ // @param notNull If the pointer is null, all values are written. If the
+ // pointer is not null, positions that are false are skipped.
+ virtual void write(void *data, uint64_t numValues, const char *notNull) = 0;
+
+ // Flush the buffer to the given output stream
+ // @param os The output stream
+ // @return Void
+ virtual void flushToStream(OutputStream *os) = 0;
+
+ // Get stream size. This function just calls the
+ // getStreamSize() function of the underlying stream.
+ // So this size should be obtained after flushToStream.
+ // Otherwise there might be some buffers in RleCoders that have not
+ // been flushed to the underlying stream.
+ // @return The stream size.
+ virtual uint64_t getStreamSize() = 0;
+
+ // Get the estimated space for the data that have been written to
+ // this coder.
+ // @return The estimated space
+ virtual uint64_t getEstimatedSpaceNeeded() = 0;
+
+ // Reset this RleCoder; everything is reset
+ // @return Void
+ virtual void reset() = 0;
+
+ protected:
+ // Writes a signed value: the value is zigzag encoded first and the result
+ // is written as a varint, see
+ // https://developers.google.com/protocol-buffers/docs/encoding#varints
+ // @param os The output stream.
+ // @param value The signed 64-bit integer to write out
+ // @return Void
+ void writeInt64(orc::SeekableOutputStream *os, int64_t value) {
+   const uint64_t zigzagged = zigzagEncode(value);
+   writeUInt64(os, zigzagged);
+ }
+
+ // Write out the value as a varint (little endian: least significant 7-bit
+ // group first, high bit set on every byte except the last)
+ // https://developers.google.com/protocol-buffers/docs/encoding#varints
+ // @param os The output stream
+ // @param value The unsigned 64-bit integer to write out
+ // @return Void
+ void writeUInt64(orc::SeekableOutputStream *os, uint64_t value) {
+   // emit continuation bytes while more than 7 bits remain
+   while ((value & ~0x7f) != 0) {
+     os->writeByte((int8_t)(0x80 | (value & 0x7f)));
+     value >>= 7;
+   }
+   // final byte, continuation bit clear
+   os->writeByte((int8_t)value);
+ }
+
+ // Bitpack and write the input values to underlying output stream.
+ // Widths 1, 2, 4 and the byte multiples 8..64 are handled by the unrolled
+ // packers; every other width goes through the generic bit-by-bit path,
+ // which packs values MSB first and pads the last byte with zero bits.
+ // Invalid arguments (null input, negative offset, len < 1, bitSize < 1)
+ // are reported through LOG_ERROR.
+ // @param input - values to write
+ // @param offset - offset
+ // @param len - length
+ // @param bitSize - bit width
+ // @param output - output stream
+ // @return Void
+ void writeInts(int64_t *input, int32_t offset, int32_t len, int32_t bitSize,
+                SeekableOutputStream *output) {
+   if (input == nullptr || offset < 0 || len < 1 || bitSize < 1) {
+     LOG_ERROR(ERRCODE_INVALID_PARAMETER_VALUE, "invalid parameter value");
+   }
+
+   switch (bitSize) {
+     case 1:
+       unrolledBitPack1(input, offset, len, output);
+       return;
+     case 2:
+       unrolledBitPack2(input, offset, len, output);
+       return;
+     case 4:
+       unrolledBitPack4(input, offset, len, output);
+       return;
+     case 8:
+       unrolledBitPack8(input, offset, len, output);
+       return;
+     case 16:
+       unrolledBitPack16(input, offset, len, output);
+       return;
+     case 24:
+       unrolledBitPack24(input, offset, len, output);
+       return;
+     case 32:
+       unrolledBitPack32(input, offset, len, output);
+       return;
+     case 40:
+       unrolledBitPack40(input, offset, len, output);
+       return;
+     case 48:
+       unrolledBitPack48(input, offset, len, output);
+       return;
+     case 56:
+       unrolledBitPack56(input, offset, len, output);
+       return;
+     case 64:
+       unrolledBitPack64(input, offset, len, output);
+       return;
+     default:
+       break;
+   }
+
+   // generic path: `current` is the byte being assembled, `bitsLeft` the
+   // number of still unused low bits in it
+   int32_t bitsLeft = 8;
+   int8_t current = 0;
+   for (int32_t i = offset; i < (offset + len); i++) {
+     int64_t value = input[i];
+     int32_t bitsToWrite = bitSize;
+     while (bitsToWrite > bitsLeft) {
+       // add the bits to the bottom of the current word
+       current |= ((uint64_t)value) >> (bitsToWrite - bitsLeft);
+       // subtract out the bits we just added
+       bitsToWrite -= bitsLeft;
+       // zero out the bits above bitsToWrite
+       value &= (1LL << bitsToWrite) - 1;
+       // same single byte emitter as the other two write sites below
+       output->writeByte(current);
+       current = 0;
+       bitsLeft = 8;
+     }
+     bitsLeft -= bitsToWrite;
+     current |= value << bitsLeft;
+     if (bitsLeft == 0) {
+       output->writeByte(current);
+       current = 0;
+       bitsLeft = 8;
+     }
+   }
+
+   // flush the partially filled last byte
+   if (bitsLeft != 8) {
+     output->writeByte(current);
+   }
+ }
+
+ // Bitpack and write the input values (only 1 bit per input value)
+ // to underlying output stream. Eight values form one byte, first value in
+ // the most significant bit; a final partial byte is padded with zero bits.
+ // @param input - values to write
+ // @param offset - offset
+ // @param len - length
+ // @param output - output stream
+ // @return Void
+ void unrolledBitPack1(int64_t *input, int32_t offset, int32_t len,
+                       SeekableOutputStream *output) {
+   const int32_t kPerByte = 8;
+   const int32_t tail = len % kPerByte;
+   const int32_t stop = offset + len;
+   const int32_t fullStop = stop - tail;
+
+   // complete bytes
+   for (int32_t base = offset; base < fullStop; base += kPerByte) {
+     int64_t packed = 0;
+     for (int32_t k = 0; k < kPerByte; ++k) {
+       packed |= (input[base + k] & 1) << (7 - k);
+     }
+     output->writeByte((int32_t)packed);
+   }
+
+   // leftover values share one last byte
+   if (tail > 0) {
+     int64_t packed = 0;
+     int32_t shift = 7;
+     for (int32_t pos = fullStop; pos < stop; ++pos, --shift) {
+       packed |= (input[pos] & 1) << shift;
+     }
+     output->writeByte((int32_t)packed);
+   }
+ }
+
+ // Bitpack and write the input values with 2 bits per value. Four values
+ // form one byte, first value in the top two bits; a final partial byte is
+ // padded with zero bits.
+ // @param input - values to write
+ // @param offset - offset
+ // @param len - length
+ // @param output - output stream
+ // @return Void
+ void unrolledBitPack2(int64_t *input, int32_t offset, int32_t len,
+                       SeekableOutputStream *output) {
+   const int32_t kPerByte = 4;
+   const int32_t tail = len % kPerByte;
+   const int32_t stop = offset + len;
+   const int32_t fullStop = stop - tail;
+
+   // complete bytes
+   for (int32_t base = offset; base < fullStop; base += kPerByte) {
+     int64_t packed = 0;
+     for (int32_t k = 0; k < kPerByte; ++k) {
+       packed |= (input[base + k] & 3) << (6 - 2 * k);
+     }
+     output->writeByte(static_cast<int32_t>(packed));
+   }
+
+   // leftover values share one last byte
+   if (tail > 0) {
+     int64_t packed = 0;
+     int32_t shift = 6;
+     for (int32_t pos = fullStop; pos < stop; ++pos, shift -= 2) {
+       packed |= (input[pos] & 3) << shift;
+     }
+     output->writeByte(static_cast<int32_t>(packed));
+   }
+ }
+
+ // Bitpack and write the input values with 4 bits per value. Two values
+ // form one byte, first value in the high nibble; an odd last value is
+ // written in the high nibble of a final byte.
+ // @param input - values to write
+ // @param offset - offset
+ // @param len - length
+ // @param output - output stream
+ // @return Void
+ void unrolledBitPack4(int64_t *input, int32_t offset, int32_t len,
+                       SeekableOutputStream *output) {
+   const int32_t kPerByte = 2;
+   const int32_t tail = len % kPerByte;
+   const int32_t stop = offset + len;
+   const int32_t fullStop = stop - tail;
+
+   // complete bytes
+   for (int32_t base = offset; base < fullStop; base += kPerByte) {
+     const int64_t packed =
+         ((input[base] & 15) << 4) | (input[base + 1] & 15);
+     output->writeByte((int32_t)packed);
+   }
+
+   // leftover value goes into one last byte
+   if (tail > 0) {
+     int64_t packed = 0;
+     int32_t shift = 4;
+     for (int32_t pos = fullStop; pos < stop; ++pos, shift -= 4) {
+       packed |= (input[pos] & 15) << shift;
+     }
+     output->writeByte((int32_t)packed);
+   }
+ }
+
+ void unrolledBitPack8(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 1);
+ }
+
+ void unrolledBitPack16(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 2);
+ }
+
+ // Writes `len` values of `input`, starting at `offset`, using 3 bytes per
+ // value; delegates to unrolledBitPackBytes.
+ void unrolledBitPack24(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 3);
+ }
+
+ // Writes `len` values of `input`, starting at `offset`, using 4 bytes per
+ // value; delegates to unrolledBitPackBytes.
+ void unrolledBitPack32(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 4);
+ }
+
+ // Writes `len` values of `input`, starting at `offset`, using 5 bytes per
+ // value; delegates to unrolledBitPackBytes.
+ void unrolledBitPack40(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 5);
+ }
+
+ // Writes `len` values of `input`, starting at `offset`, using 6 bytes per
+ // value; delegates to unrolledBitPackBytes.
+ void unrolledBitPack48(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 6);
+ }
+
+ // Writes `len` values of `input`, starting at `offset`, using 7 bytes per
+ // value; delegates to unrolledBitPackBytes.
+ void unrolledBitPack56(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 7);
+ }
+
+ // Writes `len` values of `input`, starting at `offset`, using 8 bytes per
+ // value; delegates to unrolledBitPackBytes.
+ void unrolledBitPack64(int64_t *input, int32_t offset, int32_t len,
+ SeekableOutputStream *output) {
+ unrolledBitPackBytes(input, offset, len, output, 8);
+ }
+
+  // Writes `len` values as fixed-width big-endian integers of `numBytes`
+  // bytes each. Values are emitted in groups of eight through writeLongBE;
+  // any remaining values go through writeRemainingLongs.
+  // @param input Values to write
+  // @param offset Index of the first value to write
+  // @param len Number of values to write
+  // @param output Stream receiving the bytes
+  // @param numBytes Width of each written value in bytes
+  void unrolledBitPackBytes(int64_t *input, int32_t offset, int32_t len,
+                            SeekableOutputStream *output, int32_t numBytes) {
+    const int32_t kGroup = 8;
+    const int32_t tail = len % kGroup;
+    const int32_t fullStop = offset + len - tail;
+
+    int32_t pos = offset;
+    while (pos < fullStop) {
+      writeLongBE(output, input, pos, kGroup, numBytes);
+      pos += kGroup;
+    }
+
+    if (tail > 0) {
+      writeRemainingLongs(output, pos, input, tail, numBytes);
+    }
+  }
+
+  // Serialises `remainder` values, starting at input[offset], into
+  // writeBuffer as big-endian integers of `numBytes` bytes each and then
+  // writes remainder * numBytes bytes of the buffer to `output`.
+  // A `numBytes` outside 1..8 stores nothing in the buffer before the write.
+  // @param output Stream receiving the bytes
+  // @param offset Index of the first value to write
+  // @param input Values to write
+  // @param remainder Number of values to write
+  // @param numBytes Width of each written value in bytes
+  void writeRemainingLongs(SeekableOutputStream *output, int32_t offset,
+                           int64_t *input, int32_t remainder,
+                           int32_t numBytes) {
+    const int32_t count = remainder;
+
+    switch (numBytes) {
+      case 1:
+        for (int32_t k = 0; k < count; ++k) {
+          writeBuffer[k] = static_cast<uint8_t>(input[offset + k] & 255);
+        }
+        break;
+      case 2:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE2(output, input[offset + k], k * 2);
+        }
+        break;
+      case 3:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE3(output, input[offset + k], k * 3);
+        }
+        break;
+      case 4:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE4(output, input[offset + k], k * 4);
+        }
+        break;
+      case 5:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE5(output, input[offset + k], k * 5);
+        }
+        break;
+      case 6:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE6(output, input[offset + k], k * 6);
+        }
+        break;
+      case 7:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE7(output, input[offset + k], k * 7);
+        }
+        break;
+      case 8:
+        for (int32_t k = 0; k < count; ++k) {
+          writeLongBE8(output, input[offset + k], k * 8);
+        }
+        break;
+      default:
+        break;
+    }
+
+    const int32_t byteCount = count * numBytes;
+    output->write(reinterpret_cast<const char *>(writeBuffer.data()),
+                  byteCount);
+  }
+
+  // Serialises the eight values input[offset .. offset + 7] into writeBuffer
+  // as big-endian integers of `numBytes` bytes each, then writes
+  // numHops * numBytes bytes of the buffer to `output`.
+  // Eight values are always serialised regardless of `numHops`; a `numBytes`
+  // outside 1..8 stores nothing in the buffer before the write.
+  // @param output Stream receiving the bytes
+  // @param input Values to write
+  // @param offset Index of the first value to write
+  // @param numHops Number of values whose bytes are written out
+  // @param numBytes Width of each written value in bytes
+  void writeLongBE(SeekableOutputStream *output, int64_t *input, int32_t offset,
+                   int32_t numHops, int32_t numBytes) {
+    const int32_t kLanes = 8;
+
+    switch (numBytes) {
+      case 1:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeBuffer[lane] = static_cast<uint8_t>(input[offset + lane] & 255);
+        }
+        break;
+      case 2:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE2(output, input[offset + lane], lane * 2);
+        }
+        break;
+      case 3:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE3(output, input[offset + lane], lane * 3);
+        }
+        break;
+      case 4:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE4(output, input[offset + lane], lane * 4);
+        }
+        break;
+      case 5:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE5(output, input[offset + lane], lane * 5);
+        }
+        break;
+      case 6:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE6(output, input[offset + lane], lane * 6);
+        }
+        break;
+      case 7:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE7(output, input[offset + lane], lane * 7);
+        }
+        break;
+      case 8:
+        for (int32_t lane = 0; lane < kLanes; ++lane) {
+          writeLongBE8(output, input[offset + lane], lane * 8);
+        }
+        break;
+      default:
+        break;
+    }
+
+    const int32_t byteCount = numHops * numBytes;
+    output->write(reinterpret_cast<const char *>(writeBuffer.data()),
+                  byteCount);
+  }
+
+  // Stores the low 2 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE2(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits);
+  }
+
+  // Stores the low 3 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE3(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 16);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 2] = static_cast<uint8_t>(bits);
+  }
+
+  // Stores the low 4 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE4(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 24);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits >> 16);
+    writeBuffer[wbOffset + 2] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 3] = static_cast<uint8_t>(bits);
+  }
+
+  // Stores the low 5 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE5(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 32);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits >> 24);
+    writeBuffer[wbOffset + 2] = static_cast<uint8_t>(bits >> 16);
+    writeBuffer[wbOffset + 3] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 4] = static_cast<uint8_t>(bits);
+  }
+
+  // Stores the low 6 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE6(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 40);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits >> 32);
+    writeBuffer[wbOffset + 2] = static_cast<uint8_t>(bits >> 24);
+    writeBuffer[wbOffset + 3] = static_cast<uint8_t>(bits >> 16);
+    writeBuffer[wbOffset + 4] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 5] = static_cast<uint8_t>(bits);
+  }
+
+  // Stores the low 7 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE7(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 48);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits >> 40);
+    writeBuffer[wbOffset + 2] = static_cast<uint8_t>(bits >> 32);
+    writeBuffer[wbOffset + 3] = static_cast<uint8_t>(bits >> 24);
+    writeBuffer[wbOffset + 4] = static_cast<uint8_t>(bits >> 16);
+    writeBuffer[wbOffset + 5] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 6] = static_cast<uint8_t>(bits);
+  }
+
+  // Stores all 8 bytes of `val` big-endian at writeBuffer[wbOffset..].
+  // `output` is not used here; the caller flushes the buffer.
+  void writeLongBE8(SeekableOutputStream *output, int64_t val,
+                    int32_t wbOffset) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    writeBuffer[wbOffset] = static_cast<uint8_t>(bits >> 56);
+    writeBuffer[wbOffset + 1] = static_cast<uint8_t>(bits >> 48);
+    writeBuffer[wbOffset + 2] = static_cast<uint8_t>(bits >> 40);
+    writeBuffer[wbOffset + 3] = static_cast<uint8_t>(bits >> 32);
+    writeBuffer[wbOffset + 4] = static_cast<uint8_t>(bits >> 24);
+    writeBuffer[wbOffset + 5] = static_cast<uint8_t>(bits >> 16);
+    writeBuffer[wbOffset + 6] = static_cast<uint8_t>(bits >> 8);
+    writeBuffer[wbOffset + 7] = static_cast<uint8_t>(bits);
+  }
+
+  // Zigzag-encodes a signed value so that values of small magnitude map to
+  // small unsigned numbers: 0 -> 0, -1 -> 1, 1 -> 2, -2 -> 3, ...
+  // The arithmetic is done on the unsigned representation because
+  // left-shifting a negative signed integer is undefined before C++20.
+  // @param val The signed value to encode
+  // @return The zigzag encoded value
+  uint64_t zigzagEncode(int64_t val) {
+    const uint64_t bits = static_cast<uint64_t>(val);
+    // All ones when val is negative, zero otherwise.
+    const uint64_t signMask = static_cast<uint64_t>(0) - (bits >> 63);
+    return (bits << 1) ^ signMask;
+  }
+
+ private:
+ const int32_t BUFFER_SIZE = 64;
+ std::vector<uint8_t> writeBuffer;
+};
+
+// Create an RLE coder.
+// @param isSigned true if the number sequence is signed
+// @param version Which RLE version the coder should implement
+// @param type The ORC type kind the coder is created for
+// @param kind The compression method
+// @param alignedBitpacking Whether to use aligned bitpacking (default false)
+// @return The RLE coder, owned by the caller
+std::unique_ptr<RleCoder> createRleCoder(
+ bool isSigned, RleVersion version, ORCTypeKind type, CompressionKind kind,
+ bool alignedBitpacking = false); // NOLINT
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_RLE_H_
diff --git a/depends/storage/src/storage/format/orc/seekable-input-stream.cc b/depends/storage/src/storage/format/orc/seekable-input-stream.cc
new file mode 100644
index 0000000..91fae56
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/seekable-input-stream.cc
@@ -0,0 +1,624 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <lz4.h>
+
+#include <algorithm>
+#include <iomanip>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/lzo-decompressor.h"
+#include "storage/format/orc/seekable-input-stream.h"
+
+namespace orc {
+
+// Dumps `length` bytes of `buffer` to `out` as hex, 24 bytes per row. Each
+// row starts with the 7-digit zero-padded hex offset of its first byte,
+// followed by the bytes as space-separated two-digit hex values.
+// The stream is switched back to decimal output before returning.
+void printBuffer(std::ostream& out, const char* buffer,  // NOLINT
+                 uint64_t length) {
+  const uint64_t kBytesPerRow = 24;
+  out << std::hex;
+  for (uint64_t row = 0; row < (length + kBytesPerRow - 1) / kBytesPerRow;
+       ++row) {
+    const uint64_t rowStart = row * kBytesPerRow;
+    out << std::setfill('0') << std::setw(7) << rowStart;
+    for (uint64_t col = 0; col < kBytesPerRow && rowStart + col < length;
+         ++col) {
+      // Mask to 0..255 so a negative char is not sign-extended.
+      const uint64_t byteValue =
+          static_cast<uint64_t>(0xff & buffer[rowStart + col]);
+      out << " " << std::setfill('0') << std::setw(2) << byteValue;
+    }
+    out << "\n";
+  }
+  out << std::dec;
+}
+
+// Starts the provider at the first element of `posns`. Only an iterator is
+// kept, so the list has to stay alive and unmodified while next() is used.
+PositionProvider::PositionProvider(const std::list<uint64_t>& posns)
+    : position(posns.begin()) {}
+
+// Returns the current position and advances to the following one.
+// There is no end-of-list check; the caller must not read past the list.
+uint64_t PositionProvider::next() { return *position++; }
+
+// Out-of-line destructor; there is nothing to release at this level.
+SeekableInputStream::~SeekableInputStream() {
+ // PASS
+}
+
+// Nothing to release: this destructor does not free the wrapped array.
+SeekableArrayInputStream::~SeekableArrayInputStream() {
+ // PASS
+}
+
+// Wraps `size` bytes starting at `values` without copying them.
+// @param values Start of the byte array to read from
+// @param size Number of bytes available
+// @param blkSize Maximum bytes handed out per Next(); 0 means "everything"
+SeekableArrayInputStream::SeekableArrayInputStream(const unsigned char* values,
+                                                   uint64_t size,
+                                                   uint64_t blkSize)
+    : data(reinterpret_cast<const char*>(values)) {
+  length = size;
+  position = 0;
+  blockSize = (blkSize != 0) ? blkSize : length;
+}
+
+// Wraps `size` bytes starting at `values` without copying them.
+// @param values Start of the character array to read from
+// @param size Number of bytes available
+// @param blkSize Maximum bytes handed out per Next(); 0 means "everything"
+SeekableArrayInputStream::SeekableArrayInputStream(const char* values,
+                                                   uint64_t size,
+                                                   uint64_t blkSize)
+    : data(values) {
+  length = size;
+  position = 0;
+  blockSize = (blkSize != 0) ? blkSize : length;
+}
+
+// Hands out the next chunk of the array, at most blockSize bytes.
+// @param buffer Receives a pointer into the wrapped array
+// @param size Receives the chunk length; 0 when the array is exhausted
+// @return true if a non-empty chunk was returned
+bool SeekableArrayInputStream::Next(const void** buffer, int* size) {
+  const uint64_t chunk = std::min(length - position, blockSize);
+  if (chunk == 0) {
+    *size = 0;
+    return false;
+  }
+  *buffer = data + position;
+  *size = static_cast<int>(chunk);
+  position += chunk;
+  return true;
+}
+
+// Moves the read position back by `count` bytes. Negative counts are
+// ignored; a count larger than blockSize or than the current position is
+// reported through LOG_ERROR and leaves the position untouched.
+void SeekableArrayInputStream::BackUp(int count) {
+  if (count < 0) {
+    return;
+  }
+  const uint64_t distance = static_cast<uint64_t>(count);
+  const bool fits = distance <= blockSize && distance <= position;
+  if (!fits) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Can't backup that much!");
+  } else {
+    position -= distance;
+  }
+}
+
+// Advances the read position by `count` bytes.
+// @return true if the whole distance was skipped; false for a negative
+//         count or when the skip would pass the end, in which case the
+//         position is clamped to the end of the array
+bool SeekableArrayInputStream::Skip(int count) {
+  if (count < 0) {
+    return false;
+  }
+  const uint64_t distance = static_cast<uint64_t>(count);
+  if (distance + position > length) {
+    position = length;
+    return false;
+  }
+  position += distance;
+  return true;
+}
+
+// Returns the current read position, i.e. the offset of the next byte that
+// Next() would hand out.
+google::protobuf::int64 SeekableArrayInputStream::ByteCount() const {
+ return static_cast<google::protobuf::int64>(position);
+}
+
+// Jumps to the next position supplied by `seekPosition`.
+// NOTE(review): the new position is not checked against `length`; a value
+// past the end would make `length - position` in Next() wrap around —
+// confirm callers only pass positions inside the array.
+void SeekableArrayInputStream::seek(PositionProvider& seekPosition) {
+ position = seekPosition.next();
+}
+
+// Describes the stream as "SeekableArrayInputStream <position> of <length>".
+std::string SeekableArrayInputStream::getName() const {
+  std::ostringstream description;
+  description << "SeekableArrayInputStream ";
+  description << position;
+  description << " of ";
+  description << length;
+  return description.str();
+}
+
+// Picks the block size for a stream of `length` bytes: the requested size,
+// or 256 KiB when `request` is 0, capped at `length`.
+static uint64_t computeBlock(uint64_t request, uint64_t length) {
+  const uint64_t kDefaultBlock = 256 * 1024;
+  const uint64_t wanted = (request != 0) ? request : kDefaultBlock;
+  return (wanted < length) ? wanted : length;
+}
+
+// Creates a stream over the `byteCount` bytes of `stream` that start at
+// `offset`.
+// The block size is computeBlock(length, length), which always equals
+// `length`, and the buffer is allocated with `length` bytes, so a single
+// Next() call reads the whole range.
+// NOTE(review): `_blockSize` is accepted but never used — confirm that
+// reading the entire range in one block is intended.
+// NOTE(review): `blockSize` is initialised from the member `length`, which
+// presumably is declared before `blockSize` in the class — confirm.
+SeekableFileInputStream::SeekableFileInputStream(InputStream* stream,
+ uint64_t offset,
+ uint64_t byteCount,
+ dbcommon::MemoryPool& pool,
+ uint64_t _blockSize)
+ : memoryPool(pool),
+ input(stream),
+ start(offset),
+ length(byteCount),
+ blockSize(computeBlock(length, length)) {
+ position = 0;
+ buffer.reset(new DataBuffer<char>(pool, length));
+ pushBack = 0;
+}
+
+// Creates a stream over all of `input`, from offset 0 to input->getLength().
+// The block size is the default chosen by computeBlock(0, length), i.e.
+// 256 KiB capped at the stream length; the buffer is allocated with
+// `length` bytes.
+SeekableFileInputStream::SeekableFileInputStream(InputStream* input,
+ dbcommon::MemoryPool& pool)
+ : memoryPool(pool),
+ input(input),
+ start(0),
+ length(input->getLength()),
+ blockSize(computeBlock(0, length)) {
+ position = 0;
+ buffer.reset(new DataBuffer<char>(pool, length));
+ pushBack = 0;
+}
+
+// No explicit cleanup is performed here.
+SeekableFileInputStream::~SeekableFileInputStream() {
+ // PASS
+}
+
+// Returns the next chunk of the file range.
+// If bytes were pushed back with BackUp(), the tail of the current buffer is
+// served again; otherwise up to blockSize bytes are read from the file.
+// @param data Receives a pointer to the chunk (left untouched at the end)
+// @param size Receives the chunk length; 0 at the end of the range
+// @return true if a non-empty chunk was returned
+bool SeekableFileInputStream::Next(const void** data, int* size) {
+  uint64_t served = 0;
+  if (pushBack == 0) {
+    // Nothing pushed back: fetch the next block from the underlying file.
+    served = std::min(length - position, blockSize);
+    buffer->resize(served);
+    if (served > 0) {
+      input->read(buffer->data(), served, start + position);
+      *data = static_cast<void*>(buffer->data());
+    }
+  } else {
+    // Re-serve the tail of the current buffer that BackUp() handed back.
+    *data = buffer->data() + (buffer->size() - pushBack);
+    served = pushBack;
+  }
+  position += served;
+  pushBack = 0;
+  *size = static_cast<int>(served);
+  return served != 0;
+}
+
+// Pushes the last `signedCount` bytes returned by Next() back so that the
+// following Next() serves them again.
+// Reports an error through LOG_ERROR for a negative count, for a second
+// BackUp() without an intervening Next(), and for a count larger than
+// blockSize or than the current position.
+// NOTE(review): the statements after each check assume LOG_ERROR does not
+// return (presumably it throws) — confirm.
+void SeekableFileInputStream::BackUp(int signedCount) {
+ if (signedCount < 0) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "can't backup negative distances");
+ }
+ uint64_t count = static_cast<uint64_t>(signedCount);
+ if (pushBack > 0) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "can't backup unless we just called Next");
+ }
+ if (count > blockSize || count > position) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "can't backup that far");
+ }
+ // Remember how many bytes to re-serve and rewind the logical position.
+ pushBack = static_cast<uint64_t>(count);
+ position -= pushBack;
+}
+
+// Advances the read position by `signedCount` bytes, clamped to the end of
+// the range, and discards any pushed-back bytes.
+// @return false for a negative count or when the end of the range was
+//         reached; true while bytes remain to be read
+bool SeekableFileInputStream::Skip(int signedCount) {
+  if (signedCount < 0) {
+    return false;
+  }
+  const uint64_t target = position + static_cast<uint64_t>(signedCount);
+  position = std::min(target, length);
+  pushBack = 0;
+  return position < length;
+}
+
+// Returns the current read position relative to the start of the range.
+int64_t SeekableFileInputStream::ByteCount() const {
+ return static_cast<int64_t>(position);
+}
+
+// Jumps to the next position supplied by `location` and drops any
+// pushed-back bytes. A position past the end is clamped to `length` and
+// reported through LOG_ERROR.
+void SeekableFileInputStream::seek(PositionProvider& location) {
+ position = location.next();
+ if (position > length) {
+ position = length;
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "seek too far");
+ }
+ pushBack = 0;
+}
+
+// Describes the stream as "<input name> from <start> for <length>".
+std::string SeekableFileInputStream::getName() const {
+  std::ostringstream description;
+  description << input->getName();
+  description << " from " << start;
+  description << " for " << length;
+  return description.str();
+}
+
+// Forwards every argument unchanged to the SeekableFileInputStream
+// constructor; this class only differs in how Next() reads the data.
+SeekableFileBloomFilterInputStream::SeekableFileBloomFilterInputStream(
+ InputStream* stream, uint64_t offset, uint64_t byteCount,
+ dbcommon::MemoryPool& pool, uint64_t _blockSize)
+ : SeekableFileInputStream(stream, offset, byteCount, pool, _blockSize) {}
+
+// No explicit cleanup is performed here.
+SeekableFileBloomFilterInputStream::~SeekableFileBloomFilterInputStream() {}
+
+// Same contract as SeekableFileInputStream::Next, except that fresh data is
+// fetched with InputStream::readBloomFilter instead of InputStream::read.
+// @param data Receives a pointer to the chunk (left untouched at the end)
+// @param size Receives the chunk length; 0 at the end of the range
+// @return true if a non-empty chunk was returned
+bool SeekableFileBloomFilterInputStream::Next(const void** data, int* size) {
+  uint64_t served = 0;
+  if (pushBack == 0) {
+    // Nothing pushed back: fetch the next block from the underlying file.
+    served = std::min(length - position, blockSize);
+    buffer->resize(served);
+    if (served > 0) {
+      input->readBloomFilter(buffer->data(), served, start + position);
+      *data = static_cast<void*>(buffer->data());
+    }
+  } else {
+    // Re-serve the tail of the current buffer that BackUp() handed back.
+    *data = buffer->data() + (buffer->size() - pushBack);
+    served = pushBack;
+  }
+  position += served;
+  pushBack = 0;
+  *size = static_cast<int>(served);
+  return served != 0;
+}
+
+// Takes ownership of `inStream` and prepares a zlib inflater that writes
+// into an internal buffer of `_blockSize` bytes.
+// @param inStream Compressed input; ownership moves to this object
+// @param _blockSize Size of the decompression output buffer
+// @param pool Memory pool used for the output buffer
+ZlibDecompressionStream::ZlibDecompressionStream(
+ std::unique_ptr<SeekableInputStream> inStream, size_t _blockSize,
+ dbcommon::MemoryPool& pool)
+ : memoryPool(pool), blockSize(_blockSize), buffer(pool, _blockSize) {
+ input.reset(inStream.release());
+ zstream.next_in = Z_NULL;
+ zstream.avail_in = 0;
+ zstream.zalloc = Z_NULL;
+ zstream.zfree = Z_NULL;
+ zstream.opaque = Z_NULL;
+ zstream.next_out = reinterpret_cast<Bytef*>(buffer.data());
+ zstream.avail_out = static_cast<uInt>(blockSize);
+ // Negative window bits select a raw deflate stream (no zlib/gzip header).
+ int64_t result = inflateInit2(&zstream, -15);
+ // NOTE(review): the error cases have no `break`; this presumably relies on
+ // LOG_ERROR not returning — confirm.
+ switch (result) {
+ case Z_OK:
+ break;
+ case Z_MEM_ERROR:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Memory error from inflateInit2");
+ case Z_VERSION_ERROR:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Version error from inflateInit2");
+ case Z_STREAM_ERROR:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Stream error from inflateInit2");
+ default:
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown error from inflateInit2");
+ }
+ // Start with nothing buffered and the next read expecting a chunk header.
+ outputBuffer = nullptr;
+ outputBufferLength = 0;
+ remainingLength = 0;
+ state = DECOMPRESS_HEADER;
+ inputBuffer = nullptr;
+ inputBufferEnd = nullptr;
+ bytesReturned = 0;
+}
+
+// Releases the zlib inflater. A failure is only printed to standard output
+// because throwing from a destructor is not an option.
+ZlibDecompressionStream::~ZlibDecompressionStream() {
+ int64_t result = inflateEnd(&zstream);
+ if (result != Z_OK) {
+ // really can't throw in destructors
+ std::cout << "Error in ~ZlibDecompressionStream() " << result << "\n";
+ }
+}
+
+// Returns the next run of decompressed bytes.
+// Bytes pushed back with BackUp() are served first. Otherwise a chunk header
+// is read when needed, and the chunk is either passed through unchanged
+// (DECOMPRESS_ORIGINAL) or inflated into the internal buffer
+// (DECOMPRESS_START).
+// @param data Receives a pointer to the returned bytes
+// @param size Receives the number of returned bytes
+// @return false once the end of the stream has been reached, true otherwise
+bool ZlibDecompressionStream::Next(const void** data, int* size) {
+  // if the user pushed back, return them the partial buffer
+  if (outputBufferLength) {
+    *data = outputBuffer;
+    *size = static_cast<int>(outputBufferLength);
+    outputBuffer += outputBufferLength;
+    // BackUp() subtracted these bytes from bytesReturned, so count them
+    // again now that they are handed out; BlockDecompressionStream::Next
+    // does the same.
+    bytesReturned += *size;
+    outputBufferLength = 0;
+    return true;
+  }
+  if (state == DECOMPRESS_HEADER || remainingLength == 0) {
+    readHeader();
+  }
+  if (state == DECOMPRESS_EOF) {
+    return false;
+  }
+  if (inputBuffer == inputBufferEnd) {
+    readBuffer(true);
+  }
+  // Never consume past the end of the current chunk.
+  size_t availSize = std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
+                              remainingLength);
+  if (state == DECOMPRESS_ORIGINAL) {
+    // Stored chunk: hand out the input bytes directly.
+    *data = inputBuffer;
+    *size = static_cast<int>(availSize);
+    outputBuffer = inputBuffer + availSize;
+    outputBufferLength = 0;
+  } else if (state == DECOMPRESS_START) {
+    zstream.next_in = reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+    zstream.avail_in = static_cast<uInt>(availSize);
+    outputBuffer = buffer.data();
+    zstream.next_out =
+        reinterpret_cast<Bytef*>(const_cast<char*>(outputBuffer));
+    zstream.avail_out = static_cast<uInt>(blockSize);
+    if (inflateReset(&zstream) != Z_OK) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Bad inflateReset in "
+                "ZlibDecompressionStream::Next");
+    }
+    int64_t result;
+    do {
+      // Finish when the rest of the chunk is already available, otherwise
+      // inflate what is there and fetch more input.
+      result = inflate(&zstream,
+                       availSize == remainingLength ? Z_FINISH : Z_SYNC_FLUSH);
+      switch (result) {
+        case Z_OK:
+          remainingLength -= availSize;
+          inputBuffer += availSize;
+          readBuffer(true);
+          availSize =
+              std::min(static_cast<size_t>(inputBufferEnd - inputBuffer),
+                       remainingLength);
+          zstream.next_in =
+              reinterpret_cast<Bytef*>(const_cast<char*>(inputBuffer));
+          zstream.avail_in = static_cast<uInt>(availSize);
+          break;
+        case Z_STREAM_END:
+          break;
+        case Z_BUF_ERROR:
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                    "Buffer error in "
+                    "ZlibDecompressionStream::Next");
+        case Z_DATA_ERROR:
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                    "Data error in "
+                    "ZlibDecompressionStream::Next");
+        case Z_STREAM_ERROR:
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                    "Stream error in "
+                    "ZlibDecompressionStream::Next");
+        default:
+          LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                    "Unknown error in "
+                    "ZlibDecompressionStream::Next");
+      }
+    } while (result != Z_STREAM_END);
+    *size = static_cast<int>(blockSize - zstream.avail_out);
+    *data = outputBuffer;
+    outputBufferLength = 0;
+    outputBuffer += *size;
+  } else {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "Unknown compression state in "
+              "ZlibDecompressionStream::Next");
+  }
+  inputBuffer += availSize;
+  remainingLength -= availSize;
+  bytesReturned += *size;
+  return true;
+}
+
+// Pushes the last `count` bytes returned by Next() back so that the
+// following Next() serves them again, and lowers bytesReturned accordingly.
+// Reports an error through LOG_ERROR when no Next() preceded the call or
+// bytes are already pushed back.
+// NOTE(review): `count` is not checked against the size of the last chunk —
+// presumably callers never back up further than Next() returned; confirm.
+void ZlibDecompressionStream::BackUp(int count) {
+ if (outputBuffer == nullptr || outputBufferLength != 0) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "Backup without previous Next in "
+ "ZlibDecompressionStream");
+ }
+ outputBuffer -= static_cast<size_t>(count);
+ outputBufferLength = static_cast<size_t>(count);
+ bytesReturned -= count;
+}
+
+// Skips `count` decompressed bytes by reading and discarding them.
+// bytesReturned is not adjusted here: Next() adds every byte it returns and
+// BackUp() subtracts the surplus of the last chunk, so the net change is
+// already `count`. Adding `count` up front counted the bytes twice.
+// @param count Number of decompressed bytes to skip
+// @return false if the stream ended before `count` bytes were skipped
+bool ZlibDecompressionStream::Skip(int count) {
+  // this is a stupid implementation for now.
+  // should skip entire blocks without decompressing
+  while (count > 0) {
+    const void* ptr;
+    int len;
+    if (!Next(&ptr, &len)) {
+      return false;
+    }
+    if (len > count) {
+      // Give back the part of the chunk that lies beyond the skip target.
+      BackUp(len - count);
+      count = 0;
+    } else {
+      count -= len;
+    }
+  }
+  return true;
+}
+
+// Returns the running byte counter maintained by Next(), BackUp(), Skip()
+// and seek().
+int64_t ZlibDecompressionStream::ByteCount() const { return bytesReturned; }
+
+// Repositions the stream. The first value of `position` is consumed by the
+// underlying compressed stream; the next one is the number of decompressed
+// bytes to skip from there.
+void ZlibDecompressionStream::seek(PositionProvider& position) {
+  // Forget everything buffered from the previous location (pushed-back
+  // output, unread input, bytes left in the current chunk). Otherwise the
+  // next Next() would keep serving stale data instead of reading a chunk
+  // header at the new position. These are the values the constructor sets.
+  state = DECOMPRESS_HEADER;
+  outputBuffer = nullptr;
+  outputBufferLength = 0;
+  remainingLength = 0;
+  inputBuffer = nullptr;
+  inputBufferEnd = nullptr;
+
+  input->seek(position);
+  bytesReturned = input->ByteCount();
+  if (!Skip(static_cast<int>(position.next()))) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "Bad skip in ZlibDecompressionStream::seek");
+  }
+}
+
+// Describes the stream as "zlib(<name of the wrapped stream>)".
+std::string ZlibDecompressionStream::getName() const {
+  std::ostringstream description;
+  description << "zlib(";
+  description << input->getName();
+  description << ")";
+  return description.str();
+}
+
+// Takes ownership of `inStream` and allocates an input and an output buffer
+// of `bufferSize` bytes each. The stream starts with nothing buffered and
+// the next read expecting a chunk header.
+// @param inStream Compressed input; ownership moves to this object
+// @param bufferSize Size of both internal buffers
+// @param pool Memory pool used for the buffers
+BlockDecompressionStream::BlockDecompressionStream(
+ std::unique_ptr<SeekableInputStream> inStream, size_t bufferSize,
+ dbcommon::MemoryPool& pool)
+ : memoryPool(pool),
+ inputBuffer(pool, bufferSize),
+ outputBuffer(pool, bufferSize),
+ state(DECOMPRESS_HEADER),
+ outputBufferPtr(0),
+ outputBufferLength(0),
+ remainingLength(0),
+ inputBufferPtr(0),
+ inputBufferPtrEnd(0),
+ bytesReturned(0) {
+ input.reset(inStream.release());
+}
+
+// Returns the next run of decompressed bytes.
+// Bytes pushed back with BackUp() are served first. Otherwise a chunk header
+// is read when needed, and the chunk is either passed through unchanged
+// (DECOMPRESS_ORIGINAL) or gathered into one contiguous buffer and handed to
+// decompress() (DECOMPRESS_START).
+// @param data Receives a pointer to the returned bytes
+// @param size Receives the number of returned bytes
+// @return false once the end of the stream has been reached, true otherwise
+// NOTE(review): for any state other than ORIGINAL or START neither `*data`
+// nor `*size` is assigned before `*size` is added to bytesReturned —
+// presumably readHeader() never leaves another state here; confirm.
+bool BlockDecompressionStream::Next(const void** data, int* size) {
+ // if the user pushed back, return them the partial buffer
+ if (outputBufferLength) {
+ *data = outputBufferPtr;
+ *size = static_cast<int>(outputBufferLength);
+ outputBufferPtr += outputBufferLength;
+ bytesReturned += outputBufferLength;
+ outputBufferLength = 0;
+ return true;
+ }
+ if (state == DECOMPRESS_HEADER || remainingLength == 0) {
+ readHeader();
+ }
+ if (state == DECOMPRESS_EOF) {
+ return false;
+ }
+ if (inputBufferPtr == inputBufferPtrEnd) {
+ readBuffer(true);
+ }
+
+ // Never consume past the end of the current chunk.
+ size_t availSize = std::min(
+ static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr), remainingLength);
+ if (state == DECOMPRESS_ORIGINAL) {
+ // Stored chunk: hand out the input bytes directly.
+ *data = inputBufferPtr;
+ *size = static_cast<int>(availSize);
+ outputBufferPtr = inputBufferPtr + availSize;
+ outputBufferLength = 0;
+ inputBufferPtr += availSize;
+ remainingLength -= availSize;
+ } else if (state == DECOMPRESS_START) {
+ // Get contiguous bytes of compressed block.
+ const char* compressed = inputBufferPtr;
+ if (remainingLength == availSize) {
+ // The whole chunk is already contiguous in the current input window.
+ inputBufferPtr += availSize;
+ } else {
+ // Did not read enough from input.
+ if (inputBuffer.capacity() < remainingLength) {
+ inputBuffer.resize(remainingLength);
+ }
+ ::memcpy(inputBuffer.data(), inputBufferPtr, availSize);
+ inputBufferPtr += availSize;
+ compressed = inputBuffer.data();
+
+ // Keep pulling input until the whole chunk has been copied.
+ for (size_t pos = availSize; pos < remainingLength;) {
+ readBuffer(true);
+ size_t avail =
+ std::min(static_cast<size_t>(inputBufferPtrEnd - inputBufferPtr),
+ remainingLength - pos);
+ ::memcpy(inputBuffer.data() + pos, inputBufferPtr, avail);
+ pos += avail;
+ inputBufferPtr += avail;
+ }
+ }
+
+ outputBufferLength =
+ decompress(compressed, remainingLength, outputBuffer.data(),
+ outputBuffer.capacity());
+
+ // The chunk is fully consumed; the next call starts with a new header.
+ remainingLength = 0;
+ state = DECOMPRESS_HEADER;
+ *data = outputBuffer.data();
+ *size = static_cast<int>(outputBufferLength);
+ outputBufferPtr = outputBuffer.data() + outputBufferLength;
+ outputBufferLength = 0;
+ }
+
+ bytesReturned += *size;
+ return true;
+}
+
+// Pushes the last `count` bytes returned by Next() back so that the
+// following Next() serves them again, and lowers bytesReturned accordingly.
+// Reports an error through LOG_ERROR when no Next() preceded the call or
+// bytes are already pushed back.
+// NOTE(review): `count` is not checked against the size of the last chunk —
+// presumably callers never back up further than Next() returned; confirm.
+void BlockDecompressionStream::BackUp(int count) {
+ if (outputBufferPtr == nullptr || outputBufferLength != 0) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Backup without previous Next in %s",
+ getName().c_str());
+ }
+ outputBufferPtr -= static_cast<size_t>(count);
+ outputBufferLength = static_cast<size_t>(count);
+ bytesReturned -= count;
+}
+
+// Skips `count` decompressed bytes by reading and discarding them.
+// bytesReturned is not adjusted here: Next() adds every byte it returns and
+// BackUp() subtracts the surplus of the last chunk, so the net change is
+// already `count`. Adding `count` up front counted the bytes twice.
+// @param count Number of decompressed bytes to skip
+// @return false if the stream ended before `count` bytes were skipped
+bool BlockDecompressionStream::Skip(int count) {
+  // this is a stupid implementation for now.
+  // should skip entire blocks without decompressing
+  while (count > 0) {
+    const void* ptr;
+    int len;
+    if (!Next(&ptr, &len)) {
+      return false;
+    }
+    if (len > count) {
+      // Give back the part of the chunk that lies beyond the skip target.
+      BackUp(len - count);
+      count = 0;
+    } else {
+      count -= len;
+    }
+  }
+  return true;
+}
+
+// Returns the running byte counter maintained by Next(), BackUp() and
+// Skip().
+int64_t BlockDecompressionStream::ByteCount() const { return bytesReturned; }
+
+// Repositions the stream. The first value of `position` is consumed by the
+// underlying compressed stream; the next one is the number of decompressed
+// bytes to skip from there.
+void BlockDecompressionStream::seek(PositionProvider& position) {
+  // Forget everything buffered from the previous location (pushed-back
+  // output, unread input, bytes left in the current chunk). Otherwise the
+  // next Next() would keep serving stale data instead of reading a chunk
+  // header at the new position. These are the values the constructor sets.
+  state = DECOMPRESS_HEADER;
+  outputBufferPtr = nullptr;
+  outputBufferLength = 0;
+  remainingLength = 0;
+  inputBufferPtr = nullptr;
+  inputBufferPtrEnd = nullptr;
+
+  input->seek(position);
+  if (!Skip(static_cast<int>(position.next()))) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Bad skip in %s", getName().c_str());
+  }
+}
+
+// Decompresses one Snappy block.
+// @param input Start of the compressed block
+// @param length Size of the compressed block in bytes
+// @param output Destination buffer
+// @param maxOutputLength Capacity of `output` in bytes
+// @return Number of decompressed bytes written to `output`
+// Reports an error through LOG_ERROR when the block is corrupt or its
+// decompressed size exceeds `maxOutputLength`.
+uint64_t SnappyDecompressionStream::decompress(const char* input,
+                                               uint64_t length, char* output,
+                                               size_t maxOutputLength) {
+  size_t uncompressedSize;
+  const bool headerOk =
+      snappy::GetUncompressedLength(input, length, &uncompressedSize);
+  if (!headerOk) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "SnappyDecompressionStream choked on corrupt input");
+  }
+
+  // Refuse blocks that would overflow the destination buffer.
+  if (maxOutputLength < uncompressedSize) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Snappy length exceeds block size");
+  }
+
+  const bool bodyOk = snappy::RawUncompress(input, length, output);
+  if (!bodyOk) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "SnappyDecompressionStream choked on corrupt input");
+  }
+  return uncompressedSize;
+}
+
+// Decompresses one LZO block by delegating to lzoDecompress with the
+// [input, input + length) and [output, output + maxOutputLength) ranges.
+// @return The value returned by lzoDecompress (presumably the number of
+//         decompressed bytes — confirm against lzo-decompressor.h)
+uint64_t LzoDecompressionStream::decompress(const char* input, uint64_t length,
+ char* output,
+ size_t maxOutputLength) {
+ return lzoDecompress(input, input + length, output, output + maxOutputLength);
+}
+
+// Decompresses one LZ4 block with LZ4_decompress_safe.
+// @param input Start of the compressed block
+// @param length Size of the compressed block in bytes
+// @param output Destination buffer
+// @param maxOutputLength Capacity of `output` in bytes
+// @return Number of decompressed bytes written to `output`
+// Reports an error through LOG_ERROR when LZ4 returns a negative result.
+uint64_t Lz4DecompressionStream::decompress(const char* input, uint64_t length,
+                                            char* output,
+                                            size_t maxOutputLength) {
+  const int compressedSize = static_cast<int>(length);
+  const int capacity = static_cast<int>(maxOutputLength);
+  const int produced =
+      LZ4_decompress_safe(input, output, compressedSize, capacity);
+  if (produced < 0) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s - failed to decompress",
+              getName().c_str());
+  }
+  return static_cast<uint64_t>(produced);
+}
+
+// Wraps `input` in the decompression stream matching `kind`.
+// @param kind Compression codec of the data
+// @param input Compressed stream; ownership moves to the returned stream
+// @param blockSize Buffer size passed to the decompression stream
+// @param pool Memory pool passed to the decompression stream
+// @return `input` itself for CompressionKind_NONE, otherwise a new
+//         decompression stream that owns `input`
+// An unknown codec is reported through LOG_ERROR.
+std::unique_ptr<SeekableInputStream> createDecompressor(
+    CompressionKind kind, std::unique_ptr<SeekableInputStream> input,
+    uint64_t blockSize, dbcommon::MemoryPool& pool) {  // NOLINT
+  switch (static_cast<int64_t>(kind)) {
+    case CompressionKind_NONE:
+      return std::move(input);
+    case CompressionKind_ZLIB:
+      return std::unique_ptr<SeekableInputStream>(
+          new ZlibDecompressionStream(std::move(input), blockSize, pool));
+    case CompressionKind_SNAPPY:
+      return std::unique_ptr<SeekableInputStream>(
+          new SnappyDecompressionStream(std::move(input), blockSize, pool));
+    case CompressionKind_LZO:
+      return std::unique_ptr<SeekableInputStream>(
+          new LzoDecompressionStream(std::move(input), blockSize, pool));
+    case CompressionKind_LZ4:
+      return std::unique_ptr<SeekableInputStream>(
+          new Lz4DecompressionStream(std::move(input), blockSize, pool));
+    default:
+      // %lu expects an unsigned long; passing the enum itself is a format
+      // argument type mismatch.
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown compression codec %lu",
+                static_cast<unsigned long>(kind));  // NOLINT
+  }
+  // Only reached if LOG_ERROR returns; avoids falling off the end of a
+  // non-void function.
+  return std::unique_ptr<SeekableInputStream>();
+}
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/seekable-input-stream.h b/depends/storage/src/storage/format/orc/seekable-input-stream.h
new file mode 100644
index 0000000..7697d11
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/seekable-input-stream.h
@@ -0,0 +1,378 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_SEEKABLE_INPUT_STREAM_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_SEEKABLE_INPUT_STREAM_H_
+
+#include <google/protobuf/io/zero_copy_stream.h>
+#include <snappy.h>
+
+#include <algorithm>
+#include <fstream>
+#include <iomanip>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <sstream>
+#include <string>
+#include <vector>
+
+#include "storage/format/orc/input-stream.h"
+#include "storage/format/orc/orc-proto-definition.h"
+#include "zlib.h" //NOLINT
+
+namespace orc {
+
+// Declared here, defined in the .cc file.
+// @param out destination stream
+// @param buffer start of the bytes to print
+// @param length number of bytes at `buffer`
+// NOTE(review): presumably a debugging dump of the buffer contents; the
+// output format is not visible from this header - confirm in the .cc file.
+void printBuffer(std::ostream& out, const char* buffer, // NOLINT
+ uint64_t length);
+
+// Cursor over a caller-supplied list of stream positions; it is the argument
+// type of SeekableInputStream::seek().
+// Only an iterator into the list is stored, so the list passed to the
+// constructor has to stay alive and unmodified while the provider is in use.
+class PositionProvider {
+ private:
+ // current location inside the caller's list
+ std::list<uint64_t>::const_iterator position;
+
+ public:
+ // @param positions the list of positions to iterate over (not copied)
+ explicit PositionProvider(const std::list<uint64_t>& positions);
+ // @return a position from the list
+ // NOTE(review): presumably returns the current entry and advances - the
+ // definition lives in the .cc file, confirm there.
+ uint64_t next();
+};
+
+// States of the chunk-oriented decompression streams in this header.
+// The numeric values are the ones the compiler assigned implicitly before.
+enum DecompressState {
+  // the next bytes of the input are a chunk header
+  DECOMPRESS_HEADER = 0,
+  // a header announcing a compressed chunk has just been read
+  DECOMPRESS_START = 1,
+  // NOTE(review): presumably "in the middle of a compressed chunk" - confirm
+  DECOMPRESS_CONTINUE = 2,
+  // the header's low bit was set: the chunk is stored uncompressed
+  DECOMPRESS_ORIGINAL = 3,
+  // the underlying input has no more data
+  DECOMPRESS_EOF = 4
+};
+
+// A subclass of Google's ZeroCopyInputStream that supports seek.
+// By extending Google's class, we get the ability to pass it directly
+// to the protobuf readers.
+// Concrete streams implement the ZeroCopyInputStream methods plus the two
+// pure virtual methods declared here.
+class SeekableInputStream : public google::protobuf::io::ZeroCopyInputStream {
+ public:
+ virtual ~SeekableInputStream();
+ // Reposition the stream using the position(s) supplied by `position`.
+ virtual void seek(PositionProvider& position) = 0; // NOLINT
+ // Name of the stream; the decompression streams below embed it in their
+ // own names and error messages.
+ virtual std::string getName() const = 0;
+};
+
+// A seekable input stream backed by a memory range supplied by the caller.
+// NOTE(review): the members hold a raw pointer, so the range is presumably
+// not copied and must outlive the stream - confirm in the .cc file.
+class SeekableArrayInputStream : public SeekableInputStream {
+ private:
+ // start of the memory range
+ const char* data;
+ // size of the memory range in bytes
+ uint64_t length;
+ // current read offset inside the range
+ uint64_t position;
+ // NOTE(review): presumably the largest piece handed out by one Next() call
+ uint64_t blockSize;
+
+ public:
+ // @param list start of the memory range
+ // @param length number of bytes in the range
+ // @param block_size optional block size; defaults to 0
+ SeekableArrayInputStream(const unsigned char* list, uint64_t length,
+ uint64_t block_size = 0);
+ // Same as above for a range of plain chars.
+ SeekableArrayInputStream(const char* list, uint64_t length,
+ uint64_t block_size = 0);
+ virtual ~SeekableArrayInputStream();
+ // google::protobuf::io::ZeroCopyInputStream interface
+ bool Next(const void** data, int* size) override;
+ void BackUp(int count) override;
+ bool Skip(int count) override;
+ google::protobuf::int64 ByteCount() const override;
+ // SeekableInputStream interface
+ void seek(PositionProvider& position) override;
+ std::string getName() const override;
+};
+
+// A seekable input stream that reads a byte range of an InputStream.
+class SeekableFileInputStream : public SeekableInputStream {
+ protected:
+ // pool passed in at construction
+ dbcommon::MemoryPool& memoryPool;
+ // the source; held as a raw pointer, so it is not owned by this object
+ InputStream* const input;
+ // first byte of the range inside `input`
+ const uint64_t start;
+ // number of bytes in the range
+ const uint64_t length;
+ // NOTE(review): presumably the size of each read from `input` - confirm
+ const uint64_t blockSize;
+ // holds the bytes most recently read
+ std::unique_ptr<DataBuffer<char> > buffer;
+ // current offset within the range
+ uint64_t position;
+ // NOTE(review): presumably bytes returned through BackUp() that the next
+ // Next() call must hand out again - confirm in the .cc file
+ uint64_t pushBack;
+
+ public:
+ // @param input the stream to read from
+ // @param offset first byte of the range
+ // @param byteCount length of the range
+ // @param pool memory pool kept in `memoryPool`
+ // @param blockSize optional block size; defaults to 0
+ SeekableFileInputStream(InputStream* input, uint64_t offset,
+ uint64_t byteCount,
+ dbcommon::MemoryPool& pool, // NOLINT
+ uint64_t blockSize = 0);
+ // Overload without an explicit range.
+ // NOTE(review): presumably covers the whole of `input` - confirm.
+ explicit SeekableFileInputStream(InputStream* input,
+ dbcommon::MemoryPool& pool); // NOLINT
+ virtual ~SeekableFileInputStream();
+
+ // google::protobuf::io::ZeroCopyInputStream interface
+ bool Next(const void** data, int* size) override;
+ void BackUp(int count) override;
+ bool Skip(int count) override;
+ int64_t ByteCount() const override;
+ // SeekableInputStream interface
+ void seek(PositionProvider& position) override;
+ std::string getName() const override;
+};
+
+// A SeekableFileInputStream with its own Next() implementation; the
+// constructor takes the same arguments as the base class.
+// NOTE(review): judging by the name it is used for reading bloom filter
+// streams; how Next() differs from the base is defined in the .cc file.
+class SeekableFileBloomFilterInputStream : public SeekableFileInputStream {
+ public:
+ // @param input the stream to read from
+ // @param offset first byte of the range
+ // @param byteCount length of the range
+ // @param pool memory pool
+ // @param blockSize optional block size; defaults to 0
+ SeekableFileBloomFilterInputStream(InputStream* input, uint64_t offset,
+ uint64_t byteCount,
+ dbcommon::MemoryPool& pool, // NOLINT
+ uint64_t blockSize = 0);
+ virtual ~SeekableFileBloomFilterInputStream();
+
+ // Replaces SeekableFileInputStream::Next().
+ bool Next(const void** data, int* size) override;
+};
+
+// A seekable stream that reads chunks from an underlying stream and uses a
+// zlib z_stream to decompress them. Each chunk starts with a 3-byte header
+// that readHeader() decodes.
+class ZlibDecompressionStream : public SeekableInputStream {
+ public:
+ // @param inStream the underlying stream; ownership is taken
+ // @param blockSize block size (kept in `blockSize`)
+ // @param pool memory pool (kept in `memoryPool`)
+ ZlibDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+ size_t blockSize,
+ dbcommon::MemoryPool& pool); // NOLINT
+ virtual ~ZlibDecompressionStream();
+ // google::protobuf::io::ZeroCopyInputStream interface
+ bool Next(const void** data, int* size) override;
+ void BackUp(int count) override;
+ bool Skip(int count) override;
+ int64_t ByteCount() const override;
+ // SeekableInputStream interface
+ void seek(PositionProvider& position) override;
+ std::string getName() const override;
+
+ private:
+ // Fetch the next buffer from the underlying stream into
+ // [inputBuffer, inputBufferEnd).
+ // When the underlying stream is exhausted the state becomes DECOMPRESS_EOF
+ // and both pointers are cleared; with failOnEof set this is also reported
+ // through LOG_ERROR.
+ void readBuffer(bool failOnEof) {
+ int length;
+ if (!input->Next(reinterpret_cast<const void**>(&inputBuffer), &length)) {
+ if (failOnEof) {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "Read past EOF in "
+ "ZlibDecompressionStream::readBuffer");
+ }
+ state = DECOMPRESS_EOF;
+ inputBuffer = nullptr;
+ inputBufferEnd = nullptr;
+ } else {
+ inputBufferEnd = inputBuffer + length;
+ }
+ }
+
+ // Return the next input byte (as an unsigned value), refilling the input
+ // buffer when it is exhausted. Returns 0 if the refill hit EOF.
+ uint32_t readByte(bool failOnEof) {
+ if (inputBuffer == inputBufferEnd) {
+ readBuffer(failOnEof);
+ if (state == DECOMPRESS_EOF) {
+ return 0;
+ }
+ }
+ return static_cast<unsigned char>(*(inputBuffer++));
+ }
+
+ // Decode the 3-byte little-endian chunk header.
+ // Bit 0 set means the chunk is stored uncompressed (DECOMPRESS_ORIGINAL),
+ // otherwise it is compressed (DECOMPRESS_START); the remaining bits are the
+ // chunk length, stored in remainingLength.
+ // EOF is tolerated on the first byte only; it leaves remainingLength at 0.
+ void readHeader() {
+ uint32_t header = readByte(false);
+ if (state != DECOMPRESS_EOF) {
+ header |= readByte(true) << 8;
+ header |= readByte(true) << 16;
+ if (header & 1) {
+ state = DECOMPRESS_ORIGINAL;
+ } else {
+ state = DECOMPRESS_START;
+ }
+ remainingLength = header >> 1;
+ } else {
+ remainingLength = 0;
+ }
+ }
+
+ dbcommon::MemoryPool& memoryPool;
+ const size_t blockSize;
+ // the underlying stream that supplies the chunks
+ std::unique_ptr<SeekableInputStream> input;
+ // zlib inflate state
+ z_stream zstream;
+ // NOTE(review): presumably receives the inflated bytes - confirm in the .cc
+ DataBuffer<char> buffer;
+
+ // the current state
+ DecompressState state;
+
+ // the start of the current buffer
+ // This pointer is not owned by us. It is either owned by zstream or
+ // the underlying stream.
+ const char* outputBuffer;
+ // the size of the current buffer
+ size_t outputBufferLength;
+ // the size of the current chunk
+ size_t remainingLength;
+
+ // the last buffer returned from the input
+ const char* inputBuffer;
+ const char* inputBufferEnd;
+
+ // roughly the number of bytes returned
+ off_t bytesReturned;
+};
+
+// Base class for codecs that decompress one chunk at a time.
+// The chunk framing (3-byte header, see readHeader()) is handled here;
+// subclasses supply the codec through decompress() and a name through
+// getName().
+class BlockDecompressionStream : public SeekableInputStream {
+ public:
+  // @param inStream the underlying stream; ownership is taken
+  // @param blockSize block size used for the internal buffers
+  // @param pool memory pool used for the internal buffers
+  BlockDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                           size_t blockSize,
+                           dbcommon::MemoryPool& pool);  // NOLINT
+
+  virtual ~BlockDecompressionStream() {}
+  // google::protobuf::io::ZeroCopyInputStream interface
+  bool Next(const void** data, int* size) override;
+  void BackUp(int count) override;
+  bool Skip(int count) override;
+  int64_t ByteCount() const override;
+  // SeekableInputStream interface; getName() is left to the codec subclass
+  void seek(PositionProvider& position) override;
+  std::string getName() const override = 0;
+
+ protected:
+  // Decompress one chunk.
+  // @param input first compressed byte
+  // @param length number of compressed bytes
+  // @param output destination buffer
+  // @param maxOutputLength capacity of the destination buffer
+  // @return the codec's result (the subclasses return the decompressed size)
+  virtual uint64_t decompress(const char* input, uint64_t length, char* output,
+                              size_t maxOutputLength) = 0;
+
+  // Name of the wrapped stream, for use in the subclasses' getName().
+  std::string getStreamName() const { return input->getName(); }
+
+ private:
+  // Fetch the next buffer from the underlying stream into
+  // [inputBufferPtr, inputBufferPtrEnd).
+  // When the underlying stream is exhausted the state becomes DECOMPRESS_EOF
+  // and both pointers are cleared; with failOnEof set this is also reported
+  // through LOG_ERROR.
+  void readBuffer(bool failOnEof) {
+    int length;
+    if (!input->Next(reinterpret_cast<const void**>(&inputBufferPtr),
+                     &length)) {
+      if (failOnEof) {
+        // The message used to contain a literal "getName()" in addition to
+        // the formatted stream name.
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s read past EOF",
+                  getName().c_str());
+      }
+      state = DECOMPRESS_EOF;
+      inputBufferPtr = nullptr;
+      inputBufferPtrEnd = nullptr;
+    } else {
+      inputBufferPtrEnd = inputBufferPtr + length;
+    }
+  }
+
+  // Return the next input byte (as an unsigned value), refilling the input
+  // buffer when it is exhausted. Returns 0 if the refill hit EOF.
+  uint32_t readByte(bool failOnEof) {
+    if (inputBufferPtr == inputBufferPtrEnd) {
+      readBuffer(failOnEof);
+      if (state == DECOMPRESS_EOF) {
+        return 0;
+      }
+    }
+    return static_cast<unsigned char>(*(inputBufferPtr++));
+  }
+
+  // Decode the 3-byte little-endian chunk header.
+  // Bit 0 set means the chunk is stored uncompressed (DECOMPRESS_ORIGINAL),
+  // otherwise it is compressed (DECOMPRESS_START); the remaining bits are the
+  // chunk length, stored in remainingLength.
+  // EOF is tolerated on the first byte only; it leaves remainingLength at 0.
+  void readHeader() {
+    uint32_t header = readByte(false);
+    if (state != DECOMPRESS_EOF) {
+      header |= readByte(true) << 8;
+      header |= readByte(true) << 16;
+      if (header & 1) {
+        state = DECOMPRESS_ORIGINAL;
+      } else {
+        state = DECOMPRESS_START;
+      }
+      remainingLength = header >> 1;
+    } else {
+      remainingLength = 0;
+    }
+  }
+
+  // the underlying stream that supplies the chunks
+  std::unique_ptr<SeekableInputStream> input;
+  dbcommon::MemoryPool& memoryPool;
+
+  // may need to stitch together multiple input buffers
+  // to give the codec a contiguous block
+  DataBuffer<char> inputBuffer;
+
+  // uncompressed output
+  DataBuffer<char> outputBuffer;
+
+  // the current state
+  DecompressState state;
+
+  // the start of the current output buffer
+  const char* outputBufferPtr;
+  // the size of the current output buffer
+  size_t outputBufferLength;
+
+  // the size of the current chunk
+  size_t remainingLength;
+
+  // the last buffer returned from the input
+  const char* inputBufferPtr;
+  const char* inputBufferPtrEnd;
+
+  // bytes returned by this stream
+  off_t bytesReturned;
+};
+
+// Block decompression stream for snappy-compressed chunks.
+class SnappyDecompressionStream : public BlockDecompressionStream {
+ public:
+  // All arguments are forwarded to BlockDecompressionStream.
+  SnappyDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                            size_t blockSize,
+                            dbcommon::MemoryPool& pool)  // NOLINT
+      : BlockDecompressionStream(std::move(inStream), blockSize, pool) {}
+
+  // @return "snappy(<name of the wrapped stream>)"
+  std::string getName() const override {
+    return "snappy(" + getStreamName() + ")";
+  }
+
+ protected:
+  // Codec hook called by the base class; defined in the .cc file.
+  uint64_t decompress(const char* input, uint64_t length, char* output,
+                      size_t maxOutputLength) override;
+};
+
+// Block decompression stream for LZO-compressed chunks.
+class LzoDecompressionStream : public BlockDecompressionStream {
+ public:
+  // All arguments are forwarded to BlockDecompressionStream.
+  LzoDecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                         size_t blockSize,
+                         dbcommon::MemoryPool& pool)  // NOLINT
+      : BlockDecompressionStream(std::move(inStream), blockSize, pool) {}
+
+  // @return "lzo(<name of the wrapped stream>)"
+  std::string getName() const override {
+    return "lzo(" + getStreamName() + ")";
+  }
+
+ protected:
+  // Codec hook called by the base class; defined in the .cc file.
+  uint64_t decompress(const char* input, uint64_t length, char* output,
+                      size_t maxOutputLength) override;
+};
+
+// Block decompression stream for LZ4-compressed chunks.
+class Lz4DecompressionStream : public BlockDecompressionStream {
+ public:
+  // All arguments are forwarded to BlockDecompressionStream.
+  Lz4DecompressionStream(std::unique_ptr<SeekableInputStream> inStream,
+                         size_t blockSize,
+                         dbcommon::MemoryPool& pool)  // NOLINT
+      : BlockDecompressionStream(std::move(inStream), blockSize, pool) {}
+
+  // @return "lz4(<name of the wrapped stream>)"
+  std::string getName() const override {
+    return "lz4(" + getStreamName() + ")";
+  }
+
+ protected:
+  // Codec hook called by the base class; defined in the .cc file.
+  uint64_t decompress(const char* input, uint64_t length, char* output,
+                      size_t maxOutputLength) override;
+};
+
+// Create a decompressor for the given compression kind.
+// @param kind the compression codec the data was written with
+// @param input the input stream that is the underlying source; ownership is
+//        transferred to the returned stream
+// @param bufferSize the maximum size of the buffer
+// @param pool the memory pool
+// @return a stream that yields the decompressed bytes of `input`
+std::unique_ptr<SeekableInputStream> createDecompressor(
+ CompressionKind kind, std::unique_ptr<SeekableInputStream> input,
+ uint64_t bufferSize, dbcommon::MemoryPool& pool); // NOLINT
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_SEEKABLE_INPUT_STREAM_H_
diff --git a/depends/storage/src/storage/format/orc/seekable-output-stream.cc b/depends/storage/src/storage/format/orc/seekable-output-stream.cc
new file mode 100644
index 0000000..d90921e
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/seekable-output-stream.cc
@@ -0,0 +1,46 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/seekable-output-stream.h"
+
+namespace orc {
+
+// Definition of the static block-size member, initialised from
+// ORC_COMPRESSION_BLOCK_SIZE. It is not const, so it can be reassigned at
+// run time.
+uint64_t SeekableOutputStream::COMPRESS_BLOCK_SIZE = ORC_COMPRESSION_BLOCK_SIZE;
+
+// Create the in-memory output stream for the given compression kind.
+// @param kind the compression codec to write with
+// @return a SnappyCompressionStream, LZ4CompressionStream or (for
+//         CompressionKind_NONE) a BufferedStream; an EMPTY pointer for every
+//         other kind, so callers must check the result before using it
+std::unique_ptr<SeekableOutputStream> createBlockCompressor(
+    orc::CompressionKind kind) {
+  std::unique_ptr<SeekableOutputStream> stream;
+  switch (kind) {
+    case orc::CompressionKind::CompressionKind_SNAPPY:
+      stream.reset(new SnappyCompressionStream());
+      break;
+    case orc::CompressionKind::CompressionKind_LZ4:
+      stream.reset(new LZ4CompressionStream());
+      break;
+    case orc::CompressionKind::CompressionKind_NONE:
+      stream.reset(new BufferedStream());
+      break;
+    default:
+      // unsupported codec: leave the pointer empty
+      break;
+  }
+
+  // Returning the local by name lets the compiler elide or move it;
+  // wrapping it in std::move() would only inhibit copy elision.
+  return stream;
+}
+
+} // end of namespace orc
diff --git a/depends/storage/src/storage/format/orc/seekable-output-stream.h b/depends/storage/src/storage/format/orc/seekable-output-stream.h
new file mode 100644
index 0000000..be86cd5
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/seekable-output-stream.h
@@ -0,0 +1,261 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_SEEKABLE_OUTPUT_STREAM_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_SEEKABLE_OUTPUT_STREAM_H_
+
+#include <google/protobuf/io/zero_copy_stream.h>
+
+#include <snappy.h>
+#include <vector>
+
+#include "dbcommon/log/logger.h"
+#include "dbcommon/utils/byte-buffer.h"
+#include "dbcommon/utils/comp/lz4-compressor.h"
+#include "dbcommon/utils/comp/snappy-compressor.h"
+#include "storage/format/orc/orc-proto-definition.h"
+#include "storage/format/orc/output-stream.h"
+
+namespace orc {
+
+// Collects stream offsets in the order in which they are reported.
+class PositionRecorder {
+ private:
+  std::vector<uint64_t> positions;
+
+ public:
+  // Append one offset to the recorded list.
+  void addPosition(uint64_t offset) { positions.emplace_back(offset); }
+
+  // @return pointer to the recorded offsets; owned by this recorder
+  const std::vector<uint64_t> *getPositions() const {
+    const std::vector<uint64_t> *recorded = &positions;
+    return recorded;
+  }
+};
+
+// corresponding to PositionedOutputStream in JAVA code.
+// This is the base class for all memory based stream.
+class SeekableOutputStream {
+ public:
+  // Block size used by the compressing subclasses.
+  static uint64_t COMPRESS_BLOCK_SIZE;
+
+  SeekableOutputStream() : plainBuffer(true) {}
+
+  virtual ~SeekableOutputStream() {}
+
+  // Report the current size of the plain buffer to the recorder.
+  // @param recorder the object that receives the position; must not be null
+  void recordPosition(PositionRecorder *recorder) {
+    recorder->addPosition(plainBuffer.size());
+  }
+
+  // Size really occupied by this stream: the plain size here, the size
+  // after compression in compressing subclasses.
+  // @return number of bytes
+  virtual uint64_t getStreamSize() { return plainBuffer.size(); }
+
+  // @return number of bytes currently buffered by this stream
+  virtual uint64_t getEstimatedSpaceNeeded() { return plainBuffer.size(); }
+
+  // Append `len` bytes starting at `buffer`.
+  // @param buffer the input bytes
+  // @param len the number of bytes to append
+  virtual void write(const char *buffer, uint64_t len) {
+    plainBuffer.append(buffer, len);
+  }
+
+  // Append one value of type T.
+  // @param value the value to append
+  template <class T>
+  void write(const T value) {
+    plainBuffer.append<T>(value);
+  }
+
+  // Append a single byte.
+  // @param value the byte to append
+  virtual void writeByte(int8_t value) { plainBuffer.append<int8_t>(value); }
+
+  // Write the buffered bytes to the given output stream.
+  // @param os the destination stream
+  virtual void flushToStream(OutputStream *os) {
+    // An all-null input leaves the buffer empty; nothing is written then.
+    if (!(plainBuffer.size() > 0)) {
+      return;
+    }
+    os->write(plainBuffer.data(), plainBuffer.size());
+  }
+
+  // Drop everything buffered so far.
+  virtual void reset() { plainBuffer.resize(0); }
+
+ protected:
+  // bytes written so far, before any compression
+  dbcommon::ByteBuffer plainBuffer;
+};
+
+// Uncompressed stream: uses the SeekableOutputStream behaviour unchanged.
+class BufferedStream : public SeekableOutputStream {
+ public:
+  BufferedStream() = default;
+  ~BufferedStream() override = default;
+};
+
+// Output stream that compresses its content in blocks of COMPRESS_BLOCK_SIZE
+// bytes using the `compressor` installed by a subclass.
+// Each block is emitted as a 3-byte little-endian header followed by the
+// payload: header = (payload length << 1) | isOriginal, where isOriginal is 1
+// when the block is stored uncompressed because compression did not shrink it.
+class BlockCompressionStream : public SeekableOutputStream {
+ public:
+ BlockCompressionStream() : compressedBuffer(true) {}
+
+ virtual ~BlockCompressionStream() {}
+
+ // Size of the already compressed data only; bytes still waiting in
+ // plainBuffer are not included.
+ uint64_t getStreamSize() override { return compressedBuffer.size(); }
+
+ // Pending plain bytes plus already compressed bytes.
+ uint64_t getEstimatedSpaceNeeded() override {
+ return plainBuffer.size() + compressedBuffer.size();
+ }
+
+ // Append bytes and compress all complete blocks once at least one block
+ // worth of data has accumulated.
+ void write(const char *buffer, uint64_t len) override {
+ plainBuffer.append(buffer, len);
+ if (plainBuffer.size() >= COMPRESS_BLOCK_SIZE) {
+ compress(false);
+ }
+ }
+
+ // Same as above for a single value of type T. This template is not virtual
+ // and hides the base-class template of the same name.
+ template <class T>
+ void write(const T value) {
+ plainBuffer.append<T>(value);
+ if (plainBuffer.size() >= COMPRESS_BLOCK_SIZE) {
+ compress(false);
+ }
+ }
+
+ // Same as above for a single byte.
+ void writeByte(int8_t value) override {
+ plainBuffer.append<int8_t>((int8_t)(value));
+ if (plainBuffer.size() >= COMPRESS_BLOCK_SIZE) {
+ compress(false);
+ }
+ }
+
+ // Compress whatever is still pending (including a final partial block) and
+ // write all compressed data to `os`.
+ void flushToStream(OutputStream *os) override {
+ compress(true);
+ if (compressedBuffer.size() > 0) {
+ os->write(compressedBuffer.data(), compressedBuffer.size());
+ }
+ }
+
+ // Drop both the pending plain bytes and the compressed bytes.
+ void reset() override {
+ plainBuffer.resize(0);
+ compressedBuffer.resize(0);
+ }
+
+ private:
+ // Move data from plainBuffer to compressedBuffer block by block.
+ // @param compressLastBlock when false only complete blocks are consumed and
+ // the remainder (size % COMPRESS_BLOCK_SIZE) stays in plainBuffer; when
+ // true everything is consumed, the last block possibly being shorter.
+ void compress(bool compressLastBlock) {
+ uint64_t bufSize = plainBuffer.size();
+ uint64_t oldBufSize = bufSize;
+ char *oldData = plainBuffer.data();
+ char *data = plainBuffer.data();
+
+ // number of trailing bytes that must be left uncompressed
+ uint64_t stopSize = compressLastBlock ? 0 : (bufSize % COMPRESS_BLOCK_SIZE);
+ while (bufSize > stopSize) {
+ // reserve some space
+ size_t clen = 0;
+ // size of the block handled in this iteration
+ uint64_t olen =
+ (bufSize >= COMPRESS_BLOCK_SIZE ? COMPRESS_BLOCK_SIZE : bufSize);
+ uint64_t maxSz = compressor->maxCompressedLength(olen);
+ // room for the 3-byte header plus the worst-case payload
+ compressedBuffer.reserve(compressedBuffer.size() + sizeof(char) * 3 +
+ maxSz);
+
+ // allocate header, after compressedBuffer.reserve() function,
+ // lenPtr can change, so we can only put lenPtr recording after reserve()
+ char *lenPtr = compressedBuffer.tail();
+ compressedBuffer.append<char>(0);
+ compressedBuffer.append<char>(0);
+ compressedBuffer.append<char>(0);
+
+ // compress
+ // snappy::RawCompress(data, olen, compressedBuffer.tail(), &clen);
+ // the payload is written directly behind the header placeholder
+ clen = compressor->compress(data, olen, compressedBuffer.tail(), maxSz);
+
+ uint64_t ulen = clen << 1;
+ if (clen < olen) {
+ // compression helped: keep the compressed payload already in place
+ compressedBuffer.resize(compressedBuffer.size() + clen);
+ // LOG_INFO("Has compression: ulen %llu clen %zu olen %llu", ulen, clen,
+ // olen);
+ } else {
+ // compression did not help: overwrite the payload with the raw bytes
+ memcpy(compressedBuffer.tail(), data, olen);
+ compressedBuffer.resize(compressedBuffer.size() + olen);
+
+ ulen = olen << 1;
+ ulen |= 1; // set the last bit to 1
+ // LOG_INFO("NO compression: ulen %llu clen %zu olen %llu", ulen, clen,
+ // olen);
+ }
+
+ // set the header
+ lenPtr[0] = ulen & 0xFF;
+ lenPtr[1] = (ulen >> 8) & 0xFF;
+ lenPtr[2] = (ulen >> 16) & 0xFF;
+
+ // LOG_INFO("lenPtr %d %d %d", lenPtr[0], lenPtr[1], lenPtr[2]);
+
+ bufSize -= olen;
+ data += olen;
+ }
+
+ // move the last block to the beginning of plainBuffer;
+ // (this path is only reached after at least one complete block was
+ // consumed and less than one block remains, so the ranges do not overlap)
+ if (bufSize < oldBufSize && bufSize > 0) {
+ assert(bufSize == (plainBuffer.tail() - data));
+ memcpy(oldData, data, bufSize);
+ }
+
+ plainBuffer.resize(bufSize);
+ }
+
+ protected:
+ // codec used by compress(); installed by the subclass constructors
+ std::unique_ptr<dbcommon::Compressor> compressor;
+
+ private:
+ // headers and payloads of all blocks compressed so far
+ dbcommon::ByteBuffer compressedBuffer;
+};
+
+// Block compression stream that compresses with dbcommon::SnappyCompressor.
+class SnappyCompressionStream : public BlockCompressionStream {
+ public:
+  SnappyCompressionStream() {
+    compressor = std::unique_ptr<dbcommon::Compressor>(
+        new dbcommon::SnappyCompressor());
+  }
+  ~SnappyCompressionStream() override = default;
+};
+
+// Block compression stream that compresses with dbcommon::LZ4Compressor.
+class LZ4CompressionStream : public BlockCompressionStream {
+ public:
+  LZ4CompressionStream() {
+    compressor =
+        std::unique_ptr<dbcommon::Compressor>(new dbcommon::LZ4Compressor());
+  }
+  ~LZ4CompressionStream() override = default;
+};
+
+// Placeholder: adds nothing to SeekableOutputStream, so data written through
+// this class is buffered without any compression.
+class ZlibCompressionStream : public SeekableOutputStream {
+ public:
+  ZlibCompressionStream() = default;
+  ~ZlibCompressionStream() override = default;
+};
+
+// Create the in-memory output stream used for the given compression kind.
+// @param kind the compression codec to write with
+// @return the stream for `kind`
+// NOTE(review): which kinds are supported, and what is returned for the
+// others, is decided in the .cc file - check the result before using it.
+std::unique_ptr<SeekableOutputStream> createBlockCompressor(
+ orc::CompressionKind kind);
+} // end of namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_SEEKABLE_OUTPUT_STREAM_H_
diff --git a/depends/storage/src/storage/format/orc/string-dictionary.cc b/depends/storage/src/storage/format/orc/string-dictionary.cc
new file mode 100644
index 0000000..6d9d0af
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/string-dictionary.cc
@@ -0,0 +1,60 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/string-dictionary.h"
+
+#include <cassert>
+
+namespace orc {
+
+// Look up the string [buffer, buffer + len) and add it if it is new.
+// @param buffer start of the string bytes
+// @param len number of bytes
+// @return the id of the string; a new string gets the next consecutive id
+//         and increases the byte estimate by len + 8
+uint32_t StringDictionary::add(const char *buffer, uint64_t len) {
+  std::string key;
+  key.append(buffer, len);
+
+  const StringDictionaryMap::const_iterator known = myMap.find(key);
+  if (known != myMap.end()) {
+    return known->second;
+  }
+
+  const uint32_t assigned = ++id;
+  myMap[key] = assigned;
+  bytes += len + 8;
+  return assigned;
+}
+
+// Export the dictionary in key order.
+// @param vals receives a pointer to the bytes of every key (owned by the map)
+// @param lens receives the length of every key
+// @param dumpOrder receives, indexed by id, the position of that id's key in
+//        `vals` / `lens`
+// All three vectors are resized to the number of entries; none may be null.
+void StringDictionary::dump(std::vector<const char *> *vals,
+                            std::vector<uint64_t> *lens,
+                            std::vector<uint32_t> *dumpOrder) const {
+  assert(vals != nullptr && lens != nullptr && dumpOrder != nullptr);
+  const int32_t entryCount = myMap.size();
+  vals->resize(entryCount);
+  lens->resize(entryCount);
+  dumpOrder->resize(entryCount);
+
+  int32_t slot = 0;
+  for (const auto &entry : myMap) {
+    (*vals)[slot] = entry.first.data();
+    (*lens)[slot] = entry.first.length();
+    (*dumpOrder)[entry.second] = slot;
+    ++slot;
+  }
+}
+
+// @return the number of distinct strings in the dictionary
+uint32_t StringDictionary::size() const {
+  return static_cast<uint32_t>(myMap.size());
+}
+
+// @return the running byte estimate maintained by add()
+uint32_t StringDictionary::sizeInBytes() const {
+  const uint32_t estimate = bytes;
+  return estimate;
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/string-dictionary.h b/depends/storage/src/storage/format/orc/string-dictionary.h
new file mode 100644
index 0000000..ffdbc35
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/string-dictionary.h
@@ -0,0 +1,62 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_STRING_DICTIONARY_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_STRING_DICTIONARY_H_
+
+#include <map>
+#include <string>
+#include <vector>
+
+namespace orc {
+
+// Ordered map from dictionary string to its id.
+using StringDictionaryMap = std::map<std::string, uint32_t>;
+
+// Dictionary that assigns consecutive ids (starting at 0) to distinct strings.
+class StringDictionary {
+ public:
+  StringDictionary() { reset(); }
+  virtual ~StringDictionary() {}
+
+  // Add a string and return its id.
+  uint32_t add(const char *buffer, uint64_t len);
+
+  // Export keys, key lengths and the id-to-position mapping.
+  void dump(std::vector<const char *> *vals, std::vector<uint64_t> *lens,
+            std::vector<uint32_t> *dumpOrder) const;
+
+  // Number of entries.
+  uint32_t size() const;
+
+  // Running byte estimate.
+  uint32_t sizeInBytes() const;
+
+  // Forget all entries and restart the id sequence.
+  void clear() { reset(); }
+
+ private:
+  // Return to the empty state.
+  void reset() {
+    myMap.clear();
+    // wraps to the largest uint32_t so that the first pre-increment yields 0
+    id = static_cast<uint32_t>(-1);
+    bytes = 0;
+  }
+
+  StringDictionaryMap myMap;
+  // id handed out most recently
+  uint32_t id;
+  // running byte estimate
+  uint32_t bytes;
+};
+
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_STRING_DICTIONARY_H_
diff --git a/depends/storage/src/storage/format/orc/timezone.cc b/depends/storage/src/storage/format/orc/timezone.cc
new file mode 100644
index 0000000..35bcc41
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/timezone.cc
@@ -0,0 +1,458 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <fcntl.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <sys/stat.h>
+#include <time.h>
+#include <unistd.h>
+
+#include <cerrno>
+#include <iostream>
+#include <map>
+#include <sstream>
+
+#include "dbcommon/log/logger.h"
+#include "storage/format/orc/timezone.h"
+
+namespace orc {
+
+// Locate the last position whose value does not exceed the target.
+// The array is expected to be sorted in increasing order.
+// @return -1 if the array is empty or target < array[0]; otherwise an index i
+//         with array[i] <= target such that i is the last index or
+//         target < array[i+1]
+int64_t binarySearch(const std::vector<int64_t>& array, int64_t target) {
+  if (array.empty()) {
+    return -1;
+  }
+  uint64_t lo = 0;
+  uint64_t hi = array.size() - 1;
+  uint64_t probe = (lo + hi) / 2;
+  while (lo < hi && array[probe] != target) {
+    if (array[probe] < target) {
+      lo = probe + 1;
+    } else {
+      // never step below index 0 with the unsigned bound
+      hi = (probe == 0) ? 0 : probe - 1;
+    }
+    probe = (lo + hi) / 2;
+  }
+  const int64_t found = static_cast<int64_t>(probe);
+  return (target < array[probe]) ? found - 1 : found;
+}
+
+// Out-of-line destructor; nothing to release.
+FutureRule::~FutureRule() = default;
+
+// Build a FutureRule from its textual form.
+// @param ruleString the rule text
+// @return a FutureRuleImpl; constructing the FutureRuleParser on it is what
+//         performs the parsing
+std::unique_ptr<FutureRule> parseFutureRule(const std::string& ruleString) {
+  std::unique_ptr<FutureRule> rule(new FutureRuleImpl());
+  FutureRuleImpl* const target = dynamic_cast<FutureRuleImpl*>(rule.get());
+  FutureRuleParser parser(ruleString, target);
+  return rule;
+}
+
+// Out-of-line destructor; nothing to release.
+FutureRuleImpl::~FutureRuleImpl() = default;
+
+// @return true when a non-empty rule string is stored
+bool FutureRuleImpl::isDefined() const { return !ruleString.empty(); }
+
+// Select the timezone variant that applies at the given time.
+// @param clk the time to look up
+// @return `standard` when the rule has no DST; otherwise `standard` or `dst`,
+//         chosen from the position of the time (reduced modulo
+//         SECONDS_PER_400_YEARS) within `offsets`
+const TimezoneVariant& FutureRuleImpl::getVariant(int64_t clk) const {
+  if (!hasDst) {
+    return standard;
+  }
+  int64_t wrapped = clk % SECONDS_PER_400_YEARS;
+  if (wrapped < 0) {
+    // keep the remainder non-negative for times before the reference point
+    wrapped += SECONDS_PER_400_YEARS;
+  }
+  const int64_t slot = binarySearch(offsets, wrapped);
+  const bool evenSlot = (slot % 2 == 0);
+  return (startInStd == evenSlot) ? standard : dst;
+}
+
+// Write a description of the rule to `out`; writes nothing for an undefined
+// rule. The DST lines are only written when the rule has DST.
+void FutureRuleImpl::print(std::ostream* out) const {
+  if (!isDefined()) {
+    return;
+  }
+  std::ostream& sink = *out;
+  sink << " Future rule: " << ruleString << "\n";
+  sink << " standard " << standard.toString() << "\n";
+  if (!hasDst) {
+    return;
+  }
+  sink << " dst " << dst.toString() << "\n";
+  sink << " start " << start.toString() << "\n";
+  sink << " end " << end.toString() << "\n";
+}
+
+// Out-of-line destructor; nothing to release.
+VersionParser::~VersionParser() = default;
+
+// Decode a 32-bit big-endian unsigned integer.
+// @param ptr points to at least 4 readable bytes
+// @return the decoded value
+static uint32_t decode32(const unsigned char* ptr) {
+  // Widen each byte to uint32_t BEFORE shifting: an unsigned char is promoted
+  // to (signed) int, and shifting a byte >= 0x80 left by 24 overflows it.
+  return (static_cast<uint32_t>(ptr[0]) << 24) |
+         (static_cast<uint32_t>(ptr[1]) << 16) |
+         (static_cast<uint32_t>(ptr[2]) << 8) | static_cast<uint32_t>(ptr[3]);
+}
+
+// Parser for version 1 zone data: 4-byte times, no future rule string.
+class Version1Parser : public VersionParser {
+ public:
+  virtual ~Version1Parser();
+
+  uint64_t getVersion() const override { return 1; }
+
+  // Width of one stored time in bytes.
+  uint64_t getTimeSize() const override { return 4; }
+
+  // Read the big-endian 32-bit time at `ptr` and sign extend it.
+  int64_t parseTime(const unsigned char* ptr) const override {
+    const uint32_t raw = decode32(ptr);
+    return static_cast<int32_t>(raw);
+  }
+
+  // Version 1 data carries no future rule; always the empty string.
+  std::string parseFutureString(const unsigned char*, uint64_t,
+                                uint64_t) const override {
+    return std::string("");
+  }
+};
+
+// Out-of-line destructor; nothing to release.
+Version1Parser::~Version1Parser() = default;
+
+// Parser for version 2 zone data: 8-byte times plus a future rule string.
+class Version2Parser : public VersionParser {
+ public:
+  virtual ~Version2Parser();
+
+  uint64_t getVersion() const override { return 2; }
+
+  // Get the number of bytes
+  uint64_t getTimeSize() const override { return 8; }
+
+  // Parse the big-endian 64-bit time at the given location.
+  int64_t parseTime(const unsigned char* ptr) const override {
+    // Assemble the value as unsigned and convert once at the end: shifting a
+    // signed 64-bit value whose bit 31 is set left by 32 overflows, and that
+    // is exactly the case for negative (pre-1970) times.
+    const uint64_t high = static_cast<uint64_t>(decode32(ptr)) << 32;
+    const uint64_t low = decode32(ptr + 4);
+    return static_cast<int64_t>(high | low);
+  }
+
+  // Extract the future rule text, dropping the first and the last byte of the
+  // [offset, offset + length) range.
+  // @return the text, or an empty string when the range is too short to
+  //         contain both delimiter bytes
+  std::string parseFutureString(const unsigned char* ptr, uint64_t offset,
+                                uint64_t length) const override {
+    if (length < 2) {
+      // length - 2 would wrap around to a huge unsigned count
+      return std::string();
+    }
+    return std::string(reinterpret_cast<const char*>(ptr) + offset + 1,
+                       length - 2);
+  }
+};
+
+Version2Parser::~Version2Parser() {
+  // PASS
+}
+
+// Cache of parsed timezones keyed by absolute filename; filled by
+// getTimezoneByFilename(). The raw pointers are never deleted in the code
+// shown here.
+// NOTE(review): no locking is visible around this map - confirm whether
+// callers can reach it from more than one thread.
+static std::map<std::string, Timezone*> timezoneCache;
+
+// Out-of-line destructor; nothing to release.
+Timezone::~Timezone() = default;
+
+// Parse the zone data in `buffer` and compute the ORC epoch for the zone.
+// @param _filename name the data came from (kept for reporting)
+// @param buffer the raw bytes of the zone file
+TimezoneImpl::TimezoneImpl(const std::string& _filename,
+                           const std::vector<unsigned char> buffer)
+    : filename(_filename) {
+  // data() is well defined for an empty vector, unlike &buffer[0]
+  parseZoneFile(buffer.data(), 0, buffer.size(), Version1Parser());
+  // Build the literal for the ORC epoch
+  // 2015 Jan 1 00:00:00
+  // Value-initialise so that the fields not assigned below (tm_wday, tm_yday
+  // and any platform extensions) are not left indeterminate.
+  tm epochStruct = tm();
+  epochStruct.tm_sec = 0;
+  epochStruct.tm_min = 0;
+  epochStruct.tm_hour = 0;
+  epochStruct.tm_mday = 1;
+  epochStruct.tm_mon = 0;
+  epochStruct.tm_year = 2015 - 1900;
+  epochStruct.tm_isdst = 0;
+  time_t utcEpoch = timegm(&epochStruct);
+  // shift the UTC instant by the zone's offset at that instant
+  epoch = utcEpoch - getVariant(utcEpoch).gmtOffset;
+}
+
+// @return the directory holding the timezone files: the value of the TZDIR
+//         environment variable if it is set, DEFAULT_TZDIR otherwise
+const char* getTimezoneDirectory() {
+  const char* const fromEnv = getenv("TZDIR");
+  return fromEnv ? fromEnv : DEFAULT_TZDIR;
+}
+
+// Get a timezone by absolute filename.
+// Results are cached.
+// @param filename absolute path of the timezone file
+// @return the parsed timezone; the object lives in the cache
+// Failures to open, stat, read or close the file, a non-regular file and a
+// file that ends before its reported size are reported through LOG_ERROR.
+const Timezone& getTimezoneByFilename(const std::string& filename) {
+  std::map<std::string, Timezone*>::iterator itr = timezoneCache.find(filename);
+  if (itr != timezoneCache.end()) {
+    return *(itr->second);
+  }
+  int in = open(filename.c_str(), O_RDONLY);
+  if (in == -1) {
+    std::stringstream buffer;
+    buffer << "failed to open " << filename << " - " << strerror(errno);
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+  }
+  // Closes the descriptor on every exit path that does not reach the explicit
+  // close() below, so that an error raised while the file is open does not
+  // leak it.
+  struct FdGuard {
+    int fd;
+    ~FdGuard() {
+      if (fd != -1) {
+        close(fd);
+      }
+    }
+  } guard = {in};
+  struct stat fileInfo;
+  if (fstat(in, &fileInfo) == -1) {
+    std::stringstream buffer;
+    buffer << "failed to stat " << filename << " - " << strerror(errno);
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+  }
+  if ((fileInfo.st_mode & S_IFMT) != S_IFREG) {
+    std::stringstream buffer;
+    buffer << "non-file in tzfile reader " << filename;
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+  }
+  size_t size = static_cast<size_t>(fileInfo.st_size);
+  std::vector<unsigned char> buffer(size);
+  size_t posn = 0;
+  while (posn < size) {
+    ssize_t ret = read(in, &buffer[posn], size - posn);
+    if (ret == -1) {
+      if (errno == EINTR) {
+        // interrupted before any data was transferred: just retry
+        continue;
+      }
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Failure to read timezone file %s - %s",
+                filename.c_str(), strerror(errno));
+      break;
+    }
+    if (ret == 0) {
+      // End of file before `size` bytes arrived (the file shrank after the
+      // fstat). Without this check the loop would never terminate.
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "Failure to read timezone file %s - unexpected end of file",
+                filename.c_str());
+      break;
+    }
+    posn += static_cast<size_t>(ret);
+  }
+  // From here on the descriptor is closed explicitly so that a close failure
+  // can be reported; disarm the guard first to avoid a double close.
+  guard.fd = -1;
+  if (close(in) == -1) {
+    std::stringstream err;
+    err << "failed to close " << filename << " - " << strerror(errno);
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", err.str().c_str());
+  }
+  Timezone* result = new TimezoneImpl(filename, buffer);
+  timezoneCache[filename] = result;
+  return *result;
+}
+
+// Get the local timezone.
+// Loads the file named by LOCAL_TIMEZONE through getTimezoneByFilename.
+const Timezone& getLocalTimezone() {
+ return getTimezoneByFilename(LOCAL_TIMEZONE);
+}
+
+// Look up a timezone by its zone name (eg. America/Los_Angeles).
+// The name is appended to getTimezoneDirectory() with a '/' separator and
+// the resulting path is loaded through getTimezoneByFilename.
+const Timezone& getTimezoneByName(const std::string& zone) {
+  const std::string zonePath =
+      std::string(getTimezoneDirectory()) + "/" + zone;
+  return getTimezoneByFilename(zonePath);
+}
+
+// Parse a set of bytes as a timezone file as if they came from filename.
+// Builds a new TimezoneImpl from the bytes; the caller owns the result.
+// This function does not consult or update the timezone cache.
+std::unique_ptr<Timezone> getTimezone(const std::string& filename,
+ const std::vector<unsigned char>& b) {
+ return std::unique_ptr<Timezone>(new TimezoneImpl(filename, b));
+}
+
+// Destructor; the body is intentionally empty.
+TimezoneImpl::~TimezoneImpl() {
+ // PASS
+}
+
+// Decode the variant records of one zone file section into `variants`.
+// Each record is 6 bytes: a 4-byte GMT offset (read with decode32), a
+// 1-byte DST flag and a 1-byte start index into the name table.
+//
+// ptr is the start of the file buffer; variantOffset and nameOffset are
+// offsets from ptr; nameCount is the size in bytes of the name table.
+// `variants` is indexed directly, so it must already hold variantCount
+// entries. A name index outside the name table is reported via LOG_ERROR.
+void TimezoneImpl::parseTimeVariants(const unsigned char* ptr,
+                                     uint64_t variantOffset,
+                                     uint64_t variantCount, uint64_t nameOffset,
+                                     uint64_t nameCount) {
+  for (uint64_t variant = 0; variant < variantCount; ++variant) {
+    const unsigned char* record = ptr + variantOffset + 6 * variant;
+    variants[variant].gmtOffset = static_cast<int32_t>(decode32(record));
+    variants[variant].isDst = record[4];
+    const uint64_t nameStart = record[5];
+    if (nameStart >= nameCount) {
+      std::stringstream buffer;
+      buffer << "name out of range in variant " << variant << " - " << nameStart
+             << " >= " << nameCount;
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+    }
+    // Names are NUL-terminated in a well-formed file, but never scan past
+    // the end of the name table if the terminator is missing.
+    const char* name =
+        reinterpret_cast<const char*>(ptr) + nameOffset + nameStart;
+    uint64_t nameLength = 0;
+    while (nameStart + nameLength < nameCount && name[nameLength] != '\0') {
+      ++nameLength;
+    }
+    variants[variant].name.assign(name, static_cast<size_t>(nameLength));
+  }
+}
+
+//
+// Parse the zone file to get the bits we need.
+// There are two versions of the timezone file:
+//
+// Version 1(version = 0x00):
+// Magic(version)
+// Header
+// TransitionTimes(4 byte)
+// TransitionRules
+// Rules
+// LeapSeconds(4 byte)
+// IsStd
+// IsGmt
+//
+// Version2:
+// Version1(0x32) = a version 1 copy of the data for old clients
+// Magic(0x32)
+// Header
+// TransitionTimes(8 byte)
+// TransitionRules
+// Rules
+// LeapSeconds(8 byte)
+// IsStd
+// IsGmt
+// FutureString
+//
+// ptr is the start of the file buffer, sectionOffset the offset of the
+// section to parse, fileLength the size of the buffer and versionParser
+// supplies the per-version time size, time decoding and future string.
+// Fills in version, variants, transitions, currentVariant, ancientVariant,
+// futureRule and lastTransition. Malformed input is reported via LOG_ERROR.
+void TimezoneImpl::parseZoneFile(const unsigned char* ptr,
+ uint64_t sectionOffset, uint64_t fileLength,
+ const VersionParser& versionParser) {
+ // the six 4-byte header counts start 20 bytes after the magic
+ const uint64_t magicOffset = sectionOffset + 0;
+ const uint64_t headerOffset = magicOffset + 20;
+
+ // check for validity before we start parsing
+ if (fileLength < headerOffset + 6 * 4 ||
+ strncmp(reinterpret_cast<const char*>(ptr) + magicOffset, "TZif", 4) !=
+ 0) {
+ std::stringstream buffer;
+ buffer << "non-tzfile " << filename;
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+ }
+
+ const uint64_t isGmtCount = decode32(ptr + headerOffset + 0);
+ const uint64_t isStdCount = decode32(ptr + headerOffset + 4);
+ const uint64_t leapCount = decode32(ptr + headerOffset + 8);
+ const uint64_t timeCount = decode32(ptr + headerOffset + 12);
+ const uint64_t variantCount = decode32(ptr + headerOffset + 16);
+ const uint64_t nameCount = decode32(ptr + headerOffset + 20);
+
+ // Layout after the header: transition times, one variant index byte per
+ // transition, 6-byte variant records, the name table, leap second
+ // records, then the isStd and isGmt flags.
+ const uint64_t timeOffset = headerOffset + 24;
+ const uint64_t timeVariantOffset =
+ timeOffset + versionParser.getTimeSize() * timeCount;
+ const uint64_t variantOffset = timeVariantOffset + timeCount;
+ const uint64_t nameOffset = variantOffset + variantCount * 6;
+ // sectionLength is the offset (from ptr) of the first byte after this
+ // section, not the size of the section alone
+ const uint64_t sectionLength = nameOffset + nameCount +
+ (versionParser.getTimeSize() + 4) * leapCount +
+ isGmtCount + isStdCount;
+
+ if (sectionLength > fileLength) {
+ std::stringstream buffer;
+ buffer << "tzfile too short " << filename << " needs " << sectionLength
+ << " and has " << fileLength;
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+ }
+
+ // if it is version 2, skip over the old layout and read the new one.
+ // The byte after the magic is the version; the recursion happens only for
+ // the first section (sectionOffset == 0), so it is at most one level deep.
+ if (sectionOffset == 0 && ptr[magicOffset + 4] != 0) {
+ parseZoneFile(ptr, sectionLength, fileLength, Version2Parser());
+ return;
+ }
+ version = versionParser.getVersion();
+ variants.resize(variantCount);
+ transitions.resize(timeCount);
+ currentVariant.resize(timeCount);
+ parseTimeVariants(ptr, variantOffset, variantCount, nameOffset, nameCount);
+ bool foundAncient = false;
+ for (uint64_t t = 0; t < timeCount; ++t) {
+ transitions[t] = versionParser.parseTime(ptr + timeOffset +
+ t * versionParser.getTimeSize());
+ currentVariant[t] = ptr[timeVariantOffset + t];
+ if (currentVariant[t] >= variantCount) {
+ std::stringstream buffer;
+ buffer << "tzfile rule out of range " << filename << " references rule "
+ << currentVariant[t] << " of " << variantCount;
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+ }
+ // find the oldest standard time and use that as the ancient value
+ // (the variant of the first transition that is not daylight savings)
+ if (!foundAncient && !variants[currentVariant[t]].isDst) {
+ foundAncient = true;
+ ancientVariant = currentVariant[t];
+ }
+ }
+ // no standard-time transition at all: fall back to variant 0
+ if (!foundAncient) {
+ ancientVariant = 0;
+ }
+ // whatever follows this section in the buffer is handed to the version
+ // parser as the future string
+ futureRule = parseFutureRule(versionParser.parseFutureString(
+ ptr, sectionLength, fileLength - sectionLength));
+
+ // find the lower bound for applying the future rule
+ // (getVariant uses the future rule only for clk > lastTransition, so
+ // INT64_MAX disables it and INT64_MIN applies it to almost every time)
+ if (futureRule->isDefined()) {
+ if (timeCount > 0) {
+ lastTransition = transitions[timeCount - 1];
+ } else {
+ lastTransition = INT64_MIN;
+ }
+ } else {
+ lastTransition = INT64_MAX;
+ }
+}
+
+// Return the variant in effect at time clk.
+// Times after lastTransition are answered by the future rule. Earlier times
+// are looked up in the transition table with binarySearch; a negative
+// search result selects ancientVariant.
+const TimezoneVariant& TimezoneImpl::getVariant(int64_t clk) const {
+  if (clk > lastTransition) {
+    return futureRule->getVariant(clk);
+  }
+  const int64_t found = binarySearch(transitions, clk);
+  const uint64_t variantIndex =
+      (found < 0) ? ancientVariant
+                  : currentVariant[static_cast<size_t>(found)];
+  return variants[variantIndex];
+}
+
+// Write a human-readable dump of this timezone to `out`: the filename, the
+// file version, the future rule, every variant and every explicit
+// transition. Transition times are formatted with gmtime_r/strftime when
+// time_t is at least 8 bytes wide and the conversion succeeds; otherwise
+// "null" is printed in place of the formatted time.
+void TimezoneImpl::print(std::ostream& out) const {
+  out << "Timezone file: " << filename << "\n";
+  out << " Version: " << version << "\n";
+  futureRule->print(&out);
+  for (uint64_t r = 0; r < variants.size(); ++r) {
+    out << " Variant " << r << ": " << variants[r].toString() << "\n";
+  }
+  for (uint64_t t = 0; t < transitions.size(); ++t) {
+    tm timeStruct;
+    tm* result = nullptr;
+    char buffer[25];
+    if (sizeof(time_t) >= 8) {
+      time_t val = transitions[t];
+      result = gmtime_r(&val, &timeStruct);
+      if (result) {
+        strftime(buffer, sizeof(buffer), "%F %H:%M:%S", &timeStruct);
+      }
+    }
+    // Write to the caller's stream like the rest of the dump; these lines
+    // used to go to std::cout regardless of `out`.
+    out << " Transition: " << (result == nullptr ? "null" : buffer) << " ("
+        << transitions[t] << ") -> " << variants[currentVariant[t]].name
+        << "\n";
+  }
+}
+
+// Construct the error with the given message, which is passed on to
+// std::runtime_error.
+TimezoneError::TimezoneError(const std::string& what)
+ : std::runtime_error(what) {
+ // PASS
+}
+
+// Copy constructor; copies the std::runtime_error base.
+TimezoneError::TimezoneError(const TimezoneError& other)
+ : std::runtime_error(other) {
+ // PASS
+}
+
+// Destructor; the body is intentionally empty.
+TimezoneError::~TimezoneError() noexcept {
+ // PASS
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/timezone.h b/depends/storage/src/storage/format/orc/timezone.h
new file mode 100644
index 0000000..734a0cc
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/timezone.h
@@ -0,0 +1,502 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_TIMEZONE_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_TIMEZONE_H_
+
+// This file is for timezone routines.
+
+#include <stdint.h>
+
+#include <memory>
+#include <sstream>
+#include <stdexcept>
+#include <string>
+#include <vector>
+
+namespace orc {
+
+// default location of the timezone files
+const char DEFAULT_TZDIR[] = "/usr/share/zoneinfo";
+
+// location of a symlink to the local timezone
+const char LOCAL_TIMEZONE[] = "/etc/localtime";
+
+const int64_t MONTHS_PER_YEAR = 12;
+
+// The number of days in each month in non-leap and leap years.
+const int64_t DAYS_PER_MONTH[2][MONTHS_PER_YEAR] = {
+ {31, 28, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31},
+ {31, 29, 31, 30, 31, 30, 31, 31, 30, 31, 30, 31}};
+const int64_t SECONDS_PER_HOUR = 60 * 60;
+const int64_t SECONDS_PER_DAY = SECONDS_PER_HOUR * 24;
+const int64_t DAYS_PER_WEEK = 7;
+
+// Leap years and day of the week repeat every 400 years, which makes it
+// a good cycle length.
+const int64_t SECONDS_PER_400_YEARS =
+ SECONDS_PER_DAY * (365 * (300 + 3) + 366 * (100 - 3));
+
+// Leap year test: years divisible by 400 are leap years, other years
+// divisible by 100 are not, and the remaining years are leap years exactly
+// when they are divisible by 4.
+inline bool isLeap(int64_t year) {
+  if (year % 400 == 0) {
+    return true;
+  }
+  if (year % 100 == 0) {
+    return false;
+  }
+  return year % 4 == 0;
+}
+
+// A variant (eg. PST or PDT) of a timezone (eg. America/Los_Angeles).
+struct TimezoneVariant {
+  // offset from GMT, in seconds
+  int64_t gmtOffset;
+  // true if this variant is a daylight savings variant
+  bool isDst;
+  // short name of the variant (eg. PST)
+  std::string name;
+
+  // Describe the variant as "<name> <gmtOffset>", followed by " (dst)" for
+  // daylight savings variants. TimezoneImpl::print writes this text for
+  // each variant, so it must carry the actual values.
+  std::string toString() const {
+    std::stringstream buffer;
+    buffer << name << " " << gmtOffset;
+    if (isDst) {
+      buffer << " (dst)";
+    }
+    return buffer.str();
+  }
+};
+
+// A region that shares the same legal rules for wall clock time and
+// daylight savings transitions. They are typically named for the largest
+// city in the region (eg. America/Los_Angeles or America/Mexico_City).
+// This is an abstract interface; instances are obtained through
+// getLocalTimezone, getTimezoneByName or getTimezone.
+class Timezone {
+ public:
+ virtual ~Timezone();
+
+ // Get the variant for the given time (time_t).
+ virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
+
+ // Get the number of seconds between the ORC epoch in this timezone
+ // and Unix epoch.
+ // ORC epoch is 1 Jan 2015 00:00:00 local.
+ // Unix epoch is 1 Jan 1970 00:00:00 UTC.
+ virtual int64_t getEpoch() const = 0;
+
+ // Print the timezone to the stream.
+ virtual void print(std::ostream&) const = 0;
+
+ // Get the version of the zone file.
+ virtual uint64_t getVersion() const = 0;
+};
+
+// Get the local timezone.
+// Results are cached.
+const Timezone& getLocalTimezone();
+
+// Get a timezone by name (eg. America/Los_Angeles).
+// Results are cached.
+const Timezone& getTimezoneByName(const std::string& zone);
+
+// Parse a set of bytes as a timezone file as if they came from filename.
+std::unique_ptr<Timezone> getTimezone(const std::string& filename,
+ const std::vector<unsigned char>& b);
+
+// Exception type for timezone failures; derives from std::runtime_error.
+class TimezoneError : public std::runtime_error {
+ public:
+  // Build the error from a message.
+  explicit TimezoneError(const std::string& what);
+  // Copy constructor. It must not be `explicit`: an explicit copy
+  // constructor cannot be used for copy-initialization, which is what
+  // throwing an lvalue or catching by value performs.
+  TimezoneError(const TimezoneError&);
+  virtual ~TimezoneError() noexcept;
+};
+
+enum TransitionKind { TRANSITION_JULIAN, TRANSITION_DAY, TRANSITION_MONTH };
+
+// One daylight savings transition rule, as parsed from the future rule
+// string. Which fields are meaningful depends on `kind`:
+//   TRANSITION_JULIAN  day   (J<n>: 1-based, February 29 never counted)
+//   TRANSITION_DAY     day   (<n>: 0-based, February 29 counted)
+//   TRANSITION_MONTH   month, week, day (M<month>.<week>.<day of week>)
+// `time` is the time of day of the transition in seconds.
+struct Transition {
+  TransitionKind kind;
+  int64_t day;
+  int64_t week;
+  int64_t month;
+  int64_t time;
+
+  // Describe the rule and its time of day as text.
+  std::string toString() const {
+    std::stringstream buffer;
+    switch (kind) {
+      case TRANSITION_JULIAN:
+        buffer << "julian " << day;
+        break;
+      case TRANSITION_DAY:
+        buffer << "day " << day;
+        break;
+      case TRANSITION_MONTH:
+        buffer << "month " << month << " week " << week << " day " << day;
+        break;
+    }
+    buffer << " at " << (time / (60 * 60)) << ":" << ((time / 60) % 60) << ":"
+           << (time % 60);
+    return buffer.str();
+  }
+
+  // Get the transition time for the given year.
+  // @param year the year
+  // @return the number of seconds past local Jan 1 00:00:00 that the
+  // transition happens.
+  int64_t getTime(int64_t year) const {
+    int64_t result = time;
+    switch (kind) {
+      case TRANSITION_JULIAN:
+        // J<n> is 1-based and skips February 29: day 1 is Jan 1 (zero days
+        // past Jan 1), day 59 is Feb 28 and day 60 is Mar 1 in every year.
+        // From day 60 on, a leap year has one more real day before it.
+        result += SECONDS_PER_DAY * (day - 1);
+        if (day >= 60 && isLeap(year)) {
+          result += SECONDS_PER_DAY;
+        }
+        break;
+      case TRANSITION_DAY:
+        // <n> is already a 0-based day of the year with leap days counted
+        result += SECONDS_PER_DAY * day;
+        break;
+      case TRANSITION_MONTH: {
+        bool inLeap = isLeap(year);
+        // shift the calendar so that March is month 1 and January and
+        // February belong to the previous year
+        int64_t adjustedMonth = (month + 9) % 12 + 1;
+        int64_t adjustedYear = (month <= 2) ? (year - 1) : year;
+        int64_t adjustedCentury = adjustedYear / 100;
+        int64_t adjustedRemainder = adjustedYear % 100;
+
+        // day of the week of the first day of month
+        int64_t dayOfWeek = ((26 * adjustedMonth - 2) / 10 + 1 +
+                             adjustedRemainder + adjustedRemainder / 4 +
+                             adjustedCentury / 4 - 2 * adjustedCentury) %
+                            7;
+        if (dayOfWeek < 0) {
+          dayOfWeek += DAYS_PER_WEEK;
+        }
+
+        // 0-based offset from the first of the month to the first
+        // occurrence of the requested day of the week
+        int64_t d = day - dayOfWeek;
+        if (d < 0) {
+          d += DAYS_PER_WEEK;
+        }
+        // advance to the requested week, stopping at the last occurrence
+        // that still falls inside the month
+        for (int w = 1; w < week; ++w) {
+          if (d + DAYS_PER_WEEK >= DAYS_PER_MONTH[inLeap][month - 1]) {
+            break;
+          }
+          d += DAYS_PER_WEEK;
+        }
+        result += d * SECONDS_PER_DAY;
+
+        // Add in the time for the month
+        for (int m = 0; m < month - 1; ++m) {
+          result += DAYS_PER_MONTH[inLeap][m] * SECONDS_PER_DAY;
+        }
+        break;
+      }
+    }
+    return result;
+  }
+};
+
+// Represents the parsed POSIX timezone rule strings that are used to
+// describe the future transitions, because they can go arbitrarily far into
+// the future.
+// This is an abstract interface; parseFutureRule creates instances.
+class FutureRule {
+ public:
+ virtual ~FutureRule();
+ // Whether a rule is defined. TimezoneImpl::parseZoneFile uses this to
+ // decide whether times past the last transition use the future rule.
+ virtual bool isDefined() const = 0;
+ // Get the variant selected by the rule for the given time.
+ virtual const TimezoneVariant& getVariant(int64_t clk) const = 0;
+ // Print the rule to the given stream.
+ virtual void print(std::ostream* out) const = 0;
+};
+
+// Parse the POSIX TZ string.
+std::unique_ptr<FutureRule> parseFutureRule(const std::string& ruleString);
+
+// The rule used to find timezone variants arbitrarily far in the future.
+// It is built from a string that gives the standard name and offset and,
+// for timezones with daylight savings, the daylight variant name and offset
+// plus the rules for switching between the two.
+//
+// rule = <standard name><standard offset><daylight>?
+// name = string with no numbers or '+', '-', or ','
+// offset = [-+]?hh(:mm(:ss)?)?
+// daylight = <name><offset>,<start day>(/<offset>)?,<end day>(/<offset>)?
+// day = J<day without 2/29>|<day with 2/29>|M<month>.<week>.<day of week>
+class FutureRuleImpl : public FutureRule {
+  std::string ruleString;
+  TimezoneVariant standard;
+  bool hasDst;
+  TimezoneVariant dst;
+  Transition start;
+  Transition end;
+
+  // expanded time_t offsets of transitions
+  std::vector<int64_t> offsets;
+
+  // Is the epoch (1 Jan 1970 00:00) in standard time?
+  // This code assumes that the transition dates fall in the same order
+  // each year. Hopefully no timezone regions decide to move across the
+  // equator, which is about what it would take.
+  bool startInStd;
+
+  // Fill `offsets` and `startInStd` from the parsed rule.
+  // Without daylight savings there is a single entry. Otherwise entry 0 is
+  // the epoch and each of the 400 years starting at 1970 contributes two
+  // entries, in the order the transitions occur within a year.
+  void computeOffsets() {
+    if (hasDst) {
+      offsets.resize(400 * 2 + 1);
+      startInStd = start.getTime(1970) < end.getTime(1970);
+      // seconds from the epoch to Jan 1 00:00:00 of the current year
+      int64_t yearStart = 0;
+      for (int64_t year = 1970; year < 1970 + 400; ++year) {
+        const uint64_t slot = static_cast<uint64_t>(year - 1970) * 2 + 1;
+        const int64_t dstBegins =
+            yearStart + start.getTime(year) - standard.gmtOffset;
+        const int64_t dstEnds = yearStart + end.getTime(year) - dst.gmtOffset;
+        offsets[slot] = startInStd ? dstBegins : dstEnds;
+        offsets[slot + 1] = startInStd ? dstEnds : dstBegins;
+        yearStart += (isLeap(year) ? 366 : 365) * SECONDS_PER_DAY;
+      }
+    } else {
+      startInStd = true;
+      offsets.resize(1);
+    }
+    offsets[0] = 0;
+  }
+
+ public:
+  virtual ~FutureRuleImpl();
+  bool isDefined() const override;
+  const TimezoneVariant& getVariant(int64_t clk) const override;
+  void print(std::ostream* out) const override;
+
+  friend class FutureRuleParser;
+};
+
+// A parser for the future rule strings.
+// The constructor parses `str` into `rule`. An empty string only sets the
+// rule's ruleString. Malformed input is reported through throwError, which
+// raises the error with LOG_ERROR.
+class FutureRuleParser {
+ public:
+  FutureRuleParser(const std::string& str, FutureRuleImpl* rule)
+      : ruleString(str), length(str.size()), position(0), output(*rule) {
+    output.ruleString = str;
+    if (position != length) {
+      parseName(&(output.standard.name));
+      output.standard.gmtOffset = -parseOffset();
+      output.standard.isDst = false;
+      output.hasDst = position < length;
+      if (output.hasDst) {
+        parseName(&(output.dst.name));
+        output.dst.isDst = true;
+        if (ruleString[position] != ',') {
+          output.dst.gmtOffset = -parseOffset();
+        } else {
+          // no explicit offset: daylight time is one hour ahead of standard
+          output.dst.gmtOffset = output.standard.gmtOffset + 60 * 60;
+        }
+        parseTransition(&(output.start));
+        parseTransition(&(output.end));
+      }
+      if (position != length) {
+        throwError("Extra text");
+      }
+      output.computeOffsets();
+    }
+  }
+
+ private:
+  const std::string& ruleString;
+  size_t length;
+  size_t position;
+  FutureRuleImpl& output;
+
+  // Report a parse error together with the current position and the rule.
+  void throwError(const char* msg) {
+    std::stringstream buffer;
+    buffer << msg << " at " << position << " in '" << ruleString << "'";
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "%s", buffer.str().c_str());
+  }
+
+  // isdigit() on a plain char is undefined for negative values, so always
+  // go through unsigned char.
+  static bool isDigitChar(char ch) {
+    return isdigit(static_cast<unsigned char>(ch)) != 0;
+  }
+
+  // Parse the names of the form:
+  // ([^-+0-9,]+|<[^>]+>)
+  // and set the output string.
+  void parseName(std::string* result) {
+    if (position == length) {
+      throwError("name required");
+    }
+    size_t start = position;
+    if (ruleString[position] == '<') {
+      while (position < length && ruleString[position] != '>') {
+        position += 1;
+      }
+      if (position == length) {
+        throwError("missing close '>'");
+      }
+      position += 1;
+    } else {
+      while (position < length) {
+        char ch = ruleString[position];
+        if (isDigitChar(ch) || ch == '-' || ch == '+' || ch == ',') {
+          break;
+        }
+        position += 1;
+      }
+    }
+    if (position == start) {
+      throwError("empty string not allowed");
+    }
+    *result = ruleString.substr(start, position - start);
+  }
+
+  // Parse an integer of the form [0-9]+ and return it.
+  // At least one digit is required; previously a non-digit in the middle
+  // of the string was silently read as 0.
+  int64_t parseNumber() {
+    const size_t begin = position;
+    int64_t result = 0;
+    while (position < length && isDigitChar(ruleString[position])) {
+      result = result * 10 + (ruleString[position] - '0');
+      position += 1;
+    }
+    if (position == begin) {
+      throwError("missing number");
+    }
+    return result;
+  }
+
+  // Parse the offsets of the form:
+  // [-+]?[0-9]+(:[0-9]+(:[0-9]+)?)?
+  // and convert it into a number of seconds.
+  int64_t parseOffset() {
+    int64_t scale = 3600;
+    bool isNegative = false;
+    if (position < length) {
+      char ch = ruleString[position];
+      isNegative = ch == '-';
+      if (ch == '-' || ch == '+') {
+        position += 1;
+      }
+    }
+    int64_t result = parseNumber() * scale;
+    // scale goes 3600 -> 60 -> 1, so at most two ':' groups are consumed
+    while (position < length && scale > 1 && ruleString[position] == ':') {
+      scale /= 60;
+      position += 1;
+      result += parseNumber() * scale;
+    }
+    if (isNegative) {
+      result = -result;
+    }
+    return result;
+  }
+
+  // Parse a transition of the following form:
+  // ,(J<number>|<number>|M<number>.<number>.<number>)(/<offset>)?
+  // For the M form the month must be 1..12, the week 1..5 and the day of
+  // the week 0..6; Transition::getTime indexes DAYS_PER_MONTH with
+  // month - 1, so an unchecked month would read outside that table.
+  void parseTransition(Transition* transition) {
+    if (length - position < 2 || ruleString[position] != ',') {
+      throwError("missing transition");
+    }
+    position += 1;
+    char ch = ruleString[position];
+    if (ch == 'J') {
+      transition->kind = TRANSITION_JULIAN;
+      position += 1;
+      transition->day = parseNumber();
+    } else if (ch == 'M') {
+      transition->kind = TRANSITION_MONTH;
+      position += 1;
+      transition->month = parseNumber();
+      if (transition->month < 1 || transition->month > MONTHS_PER_YEAR) {
+        throwError("month out of range");
+      }
+      if (position == length || ruleString[position] != '.') {
+        throwError("missing first .");
+      }
+      position += 1;
+      transition->week = parseNumber();
+      if (transition->week < 1 || transition->week > 5) {
+        throwError("week out of range");
+      }
+      if (position == length || ruleString[position] != '.') {
+        throwError("missing second .");
+      }
+      position += 1;
+      transition->day = parseNumber();
+      if (transition->day >= DAYS_PER_WEEK) {
+        throwError("day of week out of range");
+      }
+    } else {
+      transition->kind = TRANSITION_DAY;
+      transition->day = parseNumber();
+    }
+    if (position < length && ruleString[position] == '/') {
+      position += 1;
+      transition->time = parseOffset();
+    } else {
+      // default transition time is 02:00:00
+      transition->time = 2 * 60 * 60;
+    }
+  }
+};
+
+// Parse the POSIX TZ string.
+extern std::unique_ptr<FutureRule> parseFutureRule(
+ const std::string& ruleString);
+
+// An abstraction of the differences between versions.
+// TimezoneImpl::parseZoneFile uses it for the size and decoding of stored
+// times and for extracting the future string.
+class VersionParser {
+ public:
+ virtual ~VersionParser();
+
+ // Get the version number.
+ virtual uint64_t getVersion() const = 0;
+
+ // Get the number of bytes used by each stored time.
+ virtual uint64_t getTimeSize() const = 0;
+
+ // Parse the time at the given location.
+ virtual int64_t parseTime(const unsigned char* ptr) const = 0;
+
+ // Parse the future string
+ // ptr is the start of the file buffer; offset and length describe the
+ // bytes that follow the parsed section.
+ virtual std::string parseFutureString(const unsigned char* ptr,
+ uint64_t offset,
+ uint64_t length) const = 0;
+};
+
+// Timezone implementation backed by the contents of a zone file.
+class TimezoneImpl : public Timezone {
+ public:
+ // name is the filename the bytes are treated as coming from; bytes is the
+ // raw content of the zone file (taken by value).
+ TimezoneImpl(const std::string& name, const std::vector<unsigned char> bytes);
+ virtual ~TimezoneImpl();
+
+ // Get the variant for the given time (time_t).
+ const TimezoneVariant& getVariant(int64_t clk) const override;
+
+ // Print the timezone to the stream.
+ void print(std::ostream&) const override;
+
+ // Get the version of the zone file.
+ uint64_t getVersion() const override { return version; }
+
+ // Get the ORC epoch time in this timezone.
+ int64_t getEpoch() const override { return epoch; }
+
+ private:
+ // Decode variantCount variant records starting at variantOffset; names are
+ // looked up in the name table at nameOffset (nameCount bytes).
+ void parseTimeVariants(const unsigned char* ptr, uint64_t variantOffset,
+ uint64_t variantCount, uint64_t nameOffset,
+ uint64_t nameCount);
+ // Parse the section of the zone file that starts at sectionOffset.
+ void parseZoneFile(const unsigned char* ptr, uint64_t sectionOffset,
+ uint64_t fileLength, const VersionParser& version);
+ // filename
+ std::string filename;
+
+ // the version of the file
+ uint64_t version;
+
+ // the list of variants for this timezone
+ std::vector<TimezoneVariant> variants;
+
+ // the list of the times where the local rules change
+ std::vector<int64_t> transitions;
+
+ // the variant that starts at this transition.
+ std::vector<uint64_t> currentVariant;
+
+ // the variant before the first transition
+ uint64_t ancientVariant;
+
+ // the rule for future times
+ std::unique_ptr<FutureRule> futureRule;
+
+ // the last explicit transition after which we use the future rule
+ int64_t lastTransition;
+
+ // The ORC epoch time in this timezone.
+ int64_t epoch;
+};
+
+} // end of namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_TIMEZONE_H_
diff --git a/depends/storage/src/storage/format/orc/type-impl.cc b/depends/storage/src/storage/format/orc/type-impl.cc
new file mode 100644
index 0000000..37b9872
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/type-impl.cc
@@ -0,0 +1,507 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/type-impl.h"
+#include "storage/format/orc/vector.h"
+
+namespace orc {
+// Destructor; the body is intentionally empty.
+Type::~Type() {
+ // PASS
+}
+
+// Create a type of the given kind with no parent, no children and
+// maxLength, precision and scale all 0. The column ids start at -1, which
+// ensureIdAssigned treats as "not assigned yet".
+TypeImpl::TypeImpl(ORCTypeKind _kind) {
+ parent = nullptr;
+ columnId = -1;
+ maximumColumnId = -1;
+ kind = _kind;
+ maxLength = 0;
+ precision = 0;
+ scale = 0;
+ subtypeCount = 0;
+}
+
+// Create a type of the given kind with a maximum length; precision and
+// scale are 0. createCharType uses this constructor.
+TypeImpl::TypeImpl(ORCTypeKind _kind, uint64_t _maxLength) {
+ parent = nullptr;
+ columnId = -1;
+ maximumColumnId = -1;
+ kind = _kind;
+ maxLength = _maxLength;
+ precision = 0;
+ scale = 0;
+ subtypeCount = 0;
+}
+
+// Create a type of the given kind with a precision and scale; maxLength is
+// 0. createDecimalType uses this constructor.
+TypeImpl::TypeImpl(ORCTypeKind _kind, uint64_t _precision, uint64_t _scale) {
+ parent = nullptr;
+ columnId = -1;
+ maximumColumnId = -1;
+ kind = _kind;
+ maxLength = 0;
+ precision = _precision;
+ scale = _scale;
+ subtypeCount = 0;
+}
+
+// Assign column ids to this type and, depth first, to all of its children.
+// This type receives `root`; children receive the following ids in order.
+// maximumColumnId is set to the largest id used within this subtree.
+// Returns the next unused id.
+uint64_t TypeImpl::assignIds(uint64_t root) const {
+  columnId = static_cast<int64_t>(root);
+  uint64_t current = root + 1;
+  for (uint64_t i = 0; i < subtypeCount; ++i) {
+    // addChildType may store a null entry, so a child slot is not
+    // guaranteed to hold a TypeImpl; such slots consume no id.
+    TypeImpl* child = dynamic_cast<TypeImpl*>(subTypes[i]);
+    if (child != nullptr) {
+      current = child->assignIds(current);
+    }
+  }
+  maximumColumnId = static_cast<int64_t>(current) - 1;
+  return current;
+}
+
+// Destroy this type together with every child type stored in subTypes.
+TypeImpl::~TypeImpl() {
+  for (Type* child : subTypes) {
+    delete child;
+  }
+}
+
+// Make sure column ids have been assigned. When this type has no id yet
+// (columnId is -1), walk up to the root of the type tree and number the
+// whole tree starting from 0.
+void TypeImpl::ensureIdAssigned() const {
+  if (columnId != -1) {
+    return;
+  }
+  const TypeImpl* top = this;
+  for (; top->parent != nullptr; top = top->parent) {
+    // climb to the root
+  }
+  top->assignIds(0);
+}
+
+// Get this type's column id, assigning ids to the whole tree first if that
+// has not happened yet.
+uint64_t TypeImpl::getColumnId() const {
+ ensureIdAssigned();
+ return static_cast<uint64_t>(columnId);
+}
+
+// Get the largest column id in this type's subtree, assigning ids first if
+// that has not happened yet.
+uint64_t TypeImpl::getMaximumColumnId() const {
+ ensureIdAssigned();
+ return static_cast<uint64_t>(maximumColumnId);
+}
+
+// Get the kind of this type.
+ORCTypeKind TypeImpl::getKind() const { return kind; }
+
+// Get the number of child types.
+uint64_t TypeImpl::getSubtypeCount() const { return subtypeCount; }
+
+// Get the i-th child type. The index is not range checked and the entry
+// may be null.
+const Type* TypeImpl::getSubtype(uint64_t i) const { return subTypes[i]; }
+
+// Get the name of the i-th field. The index is not range checked.
+const std::string& TypeImpl::getFieldName(uint64_t i) const {
+ return fieldNames[i];
+}
+
+// Get the maximum length given at construction (0 if none was given).
+uint64_t TypeImpl::getMaximumLength() const { return maxLength; }
+
+// Get the precision given at construction (0 if none was given).
+uint64_t TypeImpl::getPrecision() const { return precision; }
+
+// Get the scale given at construction (0 if none was given).
+uint64_t TypeImpl::getScale() const { return scale; }
+
+// Set the column id and maximum column id directly; because columnId is no
+// longer -1 afterwards, ensureIdAssigned will not renumber this type.
+void TypeImpl::setIds(uint64_t _columnId, uint64_t _maxColumnId) {
+ columnId = static_cast<int64_t>(_columnId);
+ maximumColumnId = static_cast<int64_t>(_maxColumnId);
+}
+
+// Append a child type and take ownership of it.
+// A child that is a TypeImpl is stored and gets this type as its parent.
+// Anything else (a null pointer or a Type that is not a TypeImpl) is
+// recorded as a null entry, so the child count still increases by one.
+void TypeImpl::addChildType(std::unique_ptr<Type> childType) {
+  // Cast before giving up ownership: releasing first would leak a child
+  // that turns out not to be a TypeImpl.
+  TypeImpl* child = dynamic_cast<TypeImpl*>(childType.get());
+  subTypes.push_back(child);
+  if (child != nullptr) {
+    // subTypes owns the child from here on
+    childType.release();
+    child->parent = this;
+  }
+  subtypeCount += 1;
+}
+
+// Append a named field: the type is added as a child and the name is
+// appended to fieldNames. Returns this type so calls can be chained.
+Type* TypeImpl::addStructField(const std::string& fieldName,
+ std::unique_ptr<Type> fieldType) {
+ addChildType(std::move(fieldType));
+ fieldNames.push_back(fieldName);
+ return this;
+}
+
+// Append an unnamed child type. Returns this type so calls can be chained.
+Type* TypeImpl::addUnionChild(std::unique_ptr<Type> fieldType) {
+ addChildType(std::move(fieldType));
+ return this;
+}
+
+// Render this type as a type description string, for example "int",
+// "array<...>", "map<...,...>", "struct<name:type,...>",
+// "uniontype<...>", "decimal(p,s)", "varchar(n)" or "char(n)".
+// A child that is missing (a null entry, or an index past the end of
+// subTypes) is rendered as "void" for every compound kind. Kinds that are
+// not listed are reported through LOG_ERROR.
+std::string TypeImpl::toString() const {
+  // null- and bounds-safe rendering of the i-th child
+  auto childString = [this](size_t i) -> std::string {
+    if (i < subTypes.size() && subTypes[i] != nullptr) {
+      return subTypes[i]->toString();
+    }
+    return "void";
+  };
+
+  switch (static_cast<int64_t>(kind)) {
+    case BOOLEAN:
+      return "boolean";
+    case BYTE:
+      return "tinyint";
+    case SHORT:
+      return "smallint";
+    case INT:
+      return "int";
+    case LONG:
+      return "bigint";
+    case FLOAT:
+      return "float";
+    case DOUBLE:
+      return "double";
+    case STRING:
+      return "string";
+    case BINARY:
+      return "binary";
+    case TIMESTAMP:
+      return "timestamp";
+    case LIST:
+      return "array<" + childString(0) + ">";
+    case MAP:
+      return "map<" + childString(0) + "," + childString(1) + ">";
+    case STRUCT: {
+      std::string result = "struct<";
+      for (size_t i = 0; i < subTypes.size(); ++i) {
+        if (i != 0) {
+          result += ",";
+        }
+        // children added without a field name have no fieldNames entry
+        if (i < fieldNames.size()) {
+          result += fieldNames[i];
+        }
+        result += ":";
+        result += childString(i);
+      }
+      result += ">";
+      return result;
+    }
+    case UNION: {
+      std::string result = "uniontype<";
+      for (size_t i = 0; i < subTypes.size(); ++i) {
+        if (i != 0) {
+          result += ",";
+        }
+        result += childString(i);
+      }
+      result += ">";
+      return result;
+    }
+    case DECIMAL: {
+      std::stringstream result;
+      result << "decimal(" << precision << "," << scale << ")";
+      return result.str();
+    }
+    case DATE:
+      return "date";
+    case TIME:
+      return "time";
+    case VARCHAR: {
+      std::stringstream result;
+      result << "varchar(" << maxLength << ")";
+      return result.str();
+    }
+    case CHAR: {
+      std::stringstream result;
+      result << "char(" << maxLength << ")";
+      return result.str();
+    }
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type");
+  }
+}
+
+// Create an empty column vector batch that matches this type.
+// capacity and pool are passed to every batch that is created. Compound
+// kinds (STRUCT, LIST, MAP, UNION) create the batches of their children
+// recursively. DECIMAL uses Decimal128VectorBatch when the precision is 0
+// or larger than 18 and Decimal64VectorBatch otherwise. Kinds that are not
+// listed are reported through LOG_ERROR.
+//
+// Compound batches are held in a unique_ptr while their children are
+// created, so nothing leaks when a nested createRowBatch fails.
+std::unique_ptr<ColumnVectorBatch> TypeImpl::createRowBatch(
+    uint64_t capacity, dbcommon::MemoryPool& pool) const {
+  switch (kind) {
+    case BYTE:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new ByteVectorBatch(capacity, pool));
+
+    case INT:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new IntVectorBatch(capacity, pool));
+
+    case SHORT:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new ShortVectorBatch(capacity, pool));
+
+    case LONG:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new LongVectorBatch(capacity, pool));
+
+    case FLOAT:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new FloatVectorBatch(capacity, pool));
+
+    case DOUBLE:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new DoubleVectorBatch(capacity, pool));
+
+    case STRING:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new StringVectorBatch(capacity, pool));
+
+    case BINARY:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new BinaryVectorBatch(capacity, pool));
+
+    case CHAR:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new BlankPaddedCharVectorBatch(capacity, pool, maxLength));
+
+    case VARCHAR:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new VaryingCharVectorBatch(capacity, pool, maxLength));
+
+    case BOOLEAN:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new BooleanVectorBatch(capacity, pool));
+
+    case DATE:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new DateVectorBatch(capacity, pool));
+
+    case TIME:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new TimeVectorBatch(capacity, pool));
+
+    case TIMESTAMP:
+      return std::unique_ptr<ColumnVectorBatch>(
+          new TimestampVectorBatch(capacity, pool));
+
+    case STRUCT: {
+      std::unique_ptr<StructVectorBatch> result(
+          new StructVectorBatch(capacity, pool));
+      for (uint64_t i = 0; i < getSubtypeCount(); ++i) {
+        std::unique_ptr<ColumnVectorBatch> field =
+            getSubtype(i)->createRowBatch(capacity, pool);
+        // give up ownership only after the pointer has been stored
+        result->fields.push_back(field.get());
+        field.release();
+      }
+      return std::unique_ptr<ColumnVectorBatch>(result.release());
+    }
+
+    case LIST: {
+      std::unique_ptr<ListVectorBatch> result(
+          new ListVectorBatch(capacity, pool));
+      if (getSubtypeCount() > 0 && getSubtype(0) != nullptr) {
+        result->elements = getSubtype(0)->createRowBatch(capacity, pool);
+      }
+      return std::unique_ptr<ColumnVectorBatch>(result.release());
+    }
+
+    case MAP: {
+      std::unique_ptr<MapVectorBatch> result(
+          new MapVectorBatch(capacity, pool));
+      if (getSubtypeCount() > 0 && getSubtype(0) != nullptr) {
+        result->keys = getSubtype(0)->createRowBatch(capacity, pool);
+      }
+      if (getSubtypeCount() > 1 && getSubtype(1) != nullptr) {
+        result->elements = getSubtype(1)->createRowBatch(capacity, pool);
+      }
+      return std::unique_ptr<ColumnVectorBatch>(result.release());
+    }
+
+    case DECIMAL: {
+      if (getPrecision() == 0 || getPrecision() > 18) {
+        return std::unique_ptr<ColumnVectorBatch>(
+            new Decimal128VectorBatch(capacity, pool));
+      } else {
+        return std::unique_ptr<ColumnVectorBatch>(
+            new Decimal64VectorBatch(capacity, pool));
+      }
+    }
+
+    case UNION: {
+      std::unique_ptr<UnionVectorBatch> result(
+          new UnionVectorBatch(capacity, pool));
+      for (uint64_t i = 0; i < getSubtypeCount(); ++i) {
+        std::unique_ptr<ColumnVectorBatch> child =
+            getSubtype(i)->createRowBatch(capacity, pool);
+        // give up ownership only after the pointer has been stored
+        result->children.push_back(child.get());
+        child.release();
+      }
+      return std::unique_ptr<ColumnVectorBatch>(result.release());
+    }
+
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "not supported yet");
+  }
+}
+
+// Create a type of the given kind without children or length/precision.
+std::unique_ptr<Type> createPrimitiveType(ORCTypeKind kind) {
+  return std::unique_ptr<Type>(new TypeImpl(kind));
+}
+
+// Create a type of the given kind with a maximum length.
+std::unique_ptr<Type> createCharType(ORCTypeKind kind, uint64_t maxLength) {
+  return std::unique_ptr<Type>(new TypeImpl(kind, maxLength));
+}
+
+// Create a DECIMAL type with the given precision and scale.
+std::unique_ptr<Type> createDecimalType(uint64_t precision, uint64_t scale) {
+  return std::unique_ptr<Type>(new TypeImpl(DECIMAL, precision, scale));
+}
+
+// Create a STRUCT type without fields; add them with addStructField.
+std::unique_ptr<Type> createStructType() {
+  return std::unique_ptr<Type>(new TypeImpl(STRUCT));
+}
+
+// Create a LIST type with the given element type.
+// The new type is owned by a unique_ptr while the child is attached, so it
+// is released if attaching the child fails.
+std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements) {
+  std::unique_ptr<TypeImpl> result(new TypeImpl(LIST));
+  result->addChildType(std::move(elements));
+  return std::unique_ptr<Type>(result.release());
+}
+
+// Create a MAP type with the given key type (child 0) and value type
+// (child 1). The new type is owned by a unique_ptr while the children are
+// attached, so it is released if attaching a child fails.
+std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
+                                    std::unique_ptr<Type> value) {
+  std::unique_ptr<TypeImpl> result(new TypeImpl(MAP));
+  result->addChildType(std::move(key));
+  result->addChildType(std::move(value));
+  return std::unique_ptr<Type>(result.release());
+}
+
+// Create a UNION type without children; add them with addUnionChild.
+std::unique_ptr<Type> createUnionType() {
+  return std::unique_ptr<Type>(new TypeImpl(UNION));
+}
+
+std::string printProtobufMessage(const google::protobuf::Message& message);
+
+// Convert a type from the file footer into a Type tree.
+// `type` is the footer entry to convert and `footer` supplies the entries
+// that its subtype ids refer to; children are converted recursively.
+// The subtype ids and field names come from the file, so they are checked
+// before use: an id outside the footer's type list, a struct with fewer
+// field names than subtypes, or an unknown kind is reported via LOG_ERROR.
+std::unique_ptr<Type> convertType(const proto::Type& type,
+                                  const proto::Footer& footer) {
+  // look up the footer entry for the i-th subtype of `type`
+  auto subtypeOf = [&type, &footer](int i) -> const proto::Type& {
+    const int typeId = static_cast<int>(type.subtypes(i));
+    if (typeId < 0 || typeId >= footer.types_size()) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "subtype id %d out of range in footer",
+                typeId);
+    }
+    return footer.types(typeId);
+  };
+
+  switch (static_cast<int64_t>(type.kind())) {
+    case proto::Type_Kind_BOOLEAN:
+    case proto::Type_Kind_BYTE:
+    case proto::Type_Kind_SHORT:
+    case proto::Type_Kind_INT:
+    case proto::Type_Kind_LONG:
+    case proto::Type_Kind_FLOAT:
+    case proto::Type_Kind_DOUBLE:
+    case proto::Type_Kind_STRING:
+    case proto::Type_Kind_BINARY:
+    case proto::Type_Kind_TIMESTAMP:
+    case proto::Type_Kind_DATE:
+    case proto::Type_Kind_TIME:
+      return std::unique_ptr<Type>(
+          new TypeImpl(static_cast<ORCTypeKind>(type.kind())));
+
+    case proto::Type_Kind_CHAR:
+    case proto::Type_Kind_VARCHAR:
+      return std::unique_ptr<Type>(new TypeImpl(
+          static_cast<ORCTypeKind>(type.kind()), type.maximumlength()));
+
+    case proto::Type_Kind_DECIMAL:
+      return std::unique_ptr<Type>(
+          new TypeImpl(DECIMAL, type.precision(), type.scale()));
+
+    case proto::Type_Kind_LIST:
+    case proto::Type_Kind_MAP:
+    case proto::Type_Kind_UNION: {
+      // owned by a unique_ptr so that a failure while converting a child
+      // does not leak the partially built type
+      std::unique_ptr<TypeImpl> result(
+          new TypeImpl(static_cast<ORCTypeKind>(type.kind())));
+      for (int i = 0; i < type.subtypes_size(); ++i) {
+        result->addUnionChild(convertType(subtypeOf(i), footer));
+      }
+      return std::unique_ptr<Type>(result.release());
+    }
+
+    case proto::Type_Kind_STRUCT: {
+      if (type.fieldnames_size() < type.subtypes_size()) {
+        LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                  "struct type has %d field names for %d subtypes",
+                  type.fieldnames_size(), type.subtypes_size());
+      }
+      std::unique_ptr<TypeImpl> result(new TypeImpl(STRUCT));
+      for (int i = 0; i < type.subtypes_size(); ++i) {
+        result->addStructField(type.fieldnames(i),
+                               convertType(subtypeOf(i), footer));
+      }
+      return std::unique_ptr<Type>(result.release());
+    }
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type kind");
+  }
+}
+
+/**
+ * Build a clone of the file type that keeps only the columns flagged in
+ * `selected`. The parent of any selected column is assumed to be selected
+ * as well. Column ids are copied from fileType.
+ * @param fileType the type in the file
+ * @param selected whether each column, indexed by column id, is selected
+ * @return the filtered clone; empty if fileType is null or not selected
+ */
+std::unique_ptr<Type> buildSelectedType(const Type* fileType,
+                                        const std::vector<bool>& selected) {
+  if (fileType == nullptr || !selected[fileType->getColumnId()]) {
+    return std::unique_ptr<Type>();
+  }
+  const ORCTypeKind kind = fileType->getKind();
+  std::unique_ptr<TypeImpl> result;  // owns the clone while children are added
+  switch (static_cast<int32_t>(kind)) {
+    case BOOLEAN:
+    case BYTE:
+    case SHORT:
+    case INT:
+    case LONG:
+    case FLOAT:
+    case DOUBLE:
+    case STRING:
+    case BINARY:
+    case TIMESTAMP:
+    case DATE:
+    case TIME:
+      result.reset(new TypeImpl(kind));
+      break;
+
+    case DECIMAL:
+      result.reset(new TypeImpl(kind, fileType->getPrecision(),
+                                fileType->getScale()));
+      break;
+
+    case VARCHAR:
+    case CHAR:
+      result.reset(new TypeImpl(kind, fileType->getMaximumLength()));
+      break;
+
+    case LIST:
+      result.reset(new TypeImpl(kind));
+      result->addChildType(
+          buildSelectedType(fileType->getSubtype(0), selected));
+      break;
+
+    case MAP:
+      result.reset(new TypeImpl(kind));
+      result->addChildType(
+          buildSelectedType(fileType->getSubtype(0), selected));
+      result->addChildType(
+          buildSelectedType(fileType->getSubtype(1), selected));
+      break;
+
+    case STRUCT: {
+      result.reset(new TypeImpl(kind));
+      for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) {
+        std::unique_ptr<Type> childType =
+            buildSelectedType(fileType->getSubtype(child), selected);
+        if (childType.get() != nullptr) {
+          result->addStructField(fileType->getFieldName(child),
+                                 std::move(childType));
+        }
+      }
+      break;
+    }
+
+    case UNION: {
+      result.reset(new TypeImpl(kind));
+      for (uint64_t child = 0; child < fileType->getSubtypeCount(); ++child) {
+        std::unique_ptr<Type> childType =
+            buildSelectedType(fileType->getSubtype(child), selected);
+        if (childType.get() != nullptr) {
+          result->addUnionChild(std::move(childType));
+        }
+      }
+      break;
+    }
+
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "Unknown type kind");
+  }
+  result->setIds(fileType->getColumnId(), fileType->getMaximumColumnId());
+  return std::unique_ptr<Type>(result.release());
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/type-impl.h b/depends/storage/src/storage/format/orc/type-impl.h
new file mode 100644
index 0000000..d3cf4ec
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/type-impl.h
@@ -0,0 +1,109 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_TYPE_IMPL_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_TYPE_IMPL_H_
+
+#include <string>
+#include <vector>
+
+#include "storage/format/orc/orc_proto.pb.h"
+#include "storage/format/orc/type.h"
+
+namespace orc {
+
+class TypeImpl : public Type {
+ private:
+ TypeImpl* parent;
+ mutable int64_t columnId;
+ mutable int64_t maximumColumnId;
+ ORCTypeKind kind;
+ std::vector<Type*> subTypes;
+ std::vector<std::string> fieldNames;
+ uint64_t subtypeCount;
+ uint64_t maxLength;
+ uint64_t precision;
+ uint64_t scale;
+
+ public:
+ // Create most of the primitive types.
+ explicit TypeImpl(ORCTypeKind kind);
+
+ // Create char and varchar type.
+ TypeImpl(ORCTypeKind kind, uint64_t maxLength);
+
+ // Create decimal type.
+ TypeImpl(ORCTypeKind kind, uint64_t precision, uint64_t scale);
+
+ virtual ~TypeImpl();
+
+ uint64_t getColumnId() const override;
+
+ uint64_t getMaximumColumnId() const override;
+
+ ORCTypeKind getKind() const override;
+
+ uint64_t getSubtypeCount() const override;
+
+ const Type* getSubtype(uint64_t i) const override;
+
+ const std::string& getFieldName(uint64_t i) const override;
+
+ uint64_t getMaximumLength() const override;
+
+ uint64_t getPrecision() const override;
+
+ uint64_t getScale() const override;
+
+ std::string toString() const override;
+
+ Type* addStructField(const std::string& fieldName,
+ std::unique_ptr<Type> fieldType) override;
+ Type* addUnionChild(std::unique_ptr<Type> fieldType) override;
+
+ std::unique_ptr<ColumnVectorBatch> createRowBatch(
+ uint64_t size, dbcommon::MemoryPool& pool) const override;
+
+ // Explicitly set the column ids. Only for internal usage.
+ void setIds(uint64_t columnId, uint64_t maxColumnId);
+
+ // Add a child type.
+ void addChildType(std::unique_ptr<Type> childType);
+
+ uint64_t assignIds(uint64_t rootId) const override;
+
+ private:
+ // Ensure that ids are assigned to all of the nodes.
+ void ensureIdAssigned() const;
+};
+
+std::unique_ptr<Type> convertType(const proto::Type& type,
+ const proto::Footer& footer);
+
+// Build a clone of the file type, projecting columns from the selected
+// vector. This routine assumes that the parent of any selected column
+// is also selected.
+// @param fileType the type in the file
+// @param selected whether each column, indexed by column id, is selected
+// @return a clone of the fileType filtered by the selection array
+std::unique_ptr<Type> buildSelectedType(const Type* fileType,
+ const std::vector<bool>& selected);
+} // namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_TYPE_IMPL_H_
diff --git a/depends/storage/src/storage/format/orc/type.h b/depends/storage/src/storage/format/orc/type.h
new file mode 100644
index 0000000..35e229a
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/type.h
@@ -0,0 +1,105 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_TYPE_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_TYPE_H_
+
+#include <string>
+
+#include "dbcommon/utils/memory-pool.h"
+
+namespace orc {
+enum ORCTypeKind {
+ BOOLEAN = 0,
+ BYTE = 1,
+ SHORT = 2,
+ INT = 3,
+ LONG = 4,
+ FLOAT = 5,
+ DOUBLE = 6,
+ STRING = 7,
+ BINARY = 8,
+ TIMESTAMP = 9,
+ LIST = 10,
+ MAP = 11,
+ STRUCT = 12,
+ UNION = 13,
+ DECIMAL = 14,
+ DATE = 15,
+ VARCHAR = 16,
+ CHAR = 17,
+ TIME = 18,
+ TYPE_INVALID = -1
+};
+
+struct ColumnVectorBatch;
+
+class Type {
+ public:
+ virtual ~Type();
+ virtual uint64_t getColumnId() const = 0;
+ virtual uint64_t getMaximumColumnId() const = 0;
+ virtual ORCTypeKind getKind() const = 0;
+ virtual uint64_t getSubtypeCount() const = 0;
+ virtual const Type* getSubtype(uint64_t childId) const = 0;
+ virtual const std::string& getFieldName(uint64_t childId) const = 0;
+ virtual uint64_t getMaximumLength() const = 0;
+ virtual uint64_t getPrecision() const = 0;
+ virtual uint64_t getScale() const = 0;
+ virtual std::string toString() const = 0;
+
+ // Create a row batch for this type.
+ virtual std::unique_ptr<ColumnVectorBatch> createRowBatch(
+ uint64_t size, dbcommon::MemoryPool& pool) const = 0; // NOLINT
+
+ // Add a new field to a struct type.
+ // @param fieldName the name of the new field
+ // @param fieldType the type of the new field
+  // @return a pointer to the struct type
+ virtual Type* addStructField(const std::string& fieldName,
+ std::unique_ptr<Type> fieldType) = 0;
+
+ // Add a new child to a union type.
+ // @param fieldType the type of the new field
+  // @return a pointer to the union type
+ virtual Type* addUnionChild(std::unique_ptr<Type> fieldType) = 0;
+
+ // Assign ids to this node and its children giving this
+ // node rootId.
+ // @param rootId the column id that should be assigned to this node.
+ virtual uint64_t assignIds(uint64_t rootId) const = 0;
+};
+
+const int64_t DEFAULT_DECIMAL_SCALE = 18;
+const int64_t DEFAULT_DECIMAL_PRECISION = 38;
+
+std::unique_ptr<Type> createPrimitiveType(ORCTypeKind kind);
+std::unique_ptr<Type> createCharType(ORCTypeKind kind, uint64_t maxLength);
+std::unique_ptr<Type> createDecimalType(
+ uint64_t precision = DEFAULT_DECIMAL_PRECISION,
+ uint64_t scale = DEFAULT_DECIMAL_SCALE);
+
+std::unique_ptr<Type> createStructType();
+std::unique_ptr<Type> createListType(std::unique_ptr<Type> elements);
+std::unique_ptr<Type> createMapType(std::unique_ptr<Type> key,
+ std::unique_ptr<Type> value);
+std::unique_ptr<Type> createUnionType();
+
+} // namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_TYPE_H_
diff --git a/depends/storage/src/storage/format/orc/vector.cc b/depends/storage/src/storage/format/orc/vector.cc
new file mode 100644
index 0000000..da30c92
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/vector.cc
@@ -0,0 +1,453 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <algorithm>
+#include <cstdlib>
+#include <iostream>
+#include <sstream>
+#include <string>
+
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/vector.h"
+
+namespace orc {
+
+ColumnVectorBatch::ColumnVectorBatch(uint64_t cap, dbcommon::MemoryPool& pool)
+ : capacity(cap),
+ numElements(0),
+ notNull(pool, cap),
+ hasNulls(false),
+ hasStats(false),
+ memoryPool(pool) {
+ // PASS
+}
+
+ColumnVectorBatch::~ColumnVectorBatch() {
+ // PASS
+}
+
+void ColumnVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ capacity = cap;
+ notNull.resize(cap);
+ }
+}
+
+uint64_t ColumnVectorBatch::getMemoryUsage() {
+ return static_cast<uint64_t>(notNull.capacity() * sizeof(char));
+}
+
+BytesVectorBatch::BytesVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(capacity, pool),
+ data(pool, capacity),
+ length(pool, capacity) {
+ // PASS
+}
+
+BytesVectorBatch::~BytesVectorBatch() {
+ // PASS
+}
+
+std::string BytesVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "String vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+}
+
+void BytesVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ length.resize(cap);
+ }
+}
+
+uint64_t BytesVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(data.capacity() * sizeof(char*) +
+ length.capacity() * sizeof(int64_t));
+}
+
+StructVectorBatch::StructVectorBatch(uint64_t cap,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(cap, pool) {
+ // PASS
+}
+
+StructVectorBatch::~StructVectorBatch() {  // deletes every field batch
+  for (ColumnVectorBatch* field : this->fields) {
+    delete field;
+  }
+}
+
+std::string StructVectorBatch::toString() const {
+  // Header with the element counts, then each field's description in order.
+  std::ostringstream out;
+  out << "Struct vector <" << numElements << " of " << capacity << "; ";
+  for (const ColumnVectorBatch* field : fields) {
+    out << field->toString() << "; ";
+  }
+  out << ">";
+  return out.str();
+}
+
+void StructVectorBatch::resize(uint64_t cap) {
+ ColumnVectorBatch::resize(cap);
+ for (unsigned int i = 0; i < fields.size(); i++) {
+ fields[i]->resize(cap);
+ }
+}
+
+uint64_t StructVectorBatch::getMemoryUsage() {
+ uint64_t memory = ColumnVectorBatch::getMemoryUsage();
+ for (unsigned int i = 0; i < fields.size(); i++) {
+ memory += fields[i]->getMemoryUsage();
+ }
+ return memory;
+}
+
+bool StructVectorBatch::hasVariableLength() {
+  // The struct counts as variable-length as soon as one of its field
+  // batches reports variable length.
+  for (ColumnVectorBatch* field : fields) {
+    if (field->hasVariableLength()) return true;
+  }
+  return false;
+}
+
+ListVectorBatch::ListVectorBatch(uint64_t cap,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
+ // PASS
+}
+
+ListVectorBatch::~ListVectorBatch() {
+ // PASS
+}
+
+std::string ListVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "List vector <" << (elements == nullptr ? "" : elements->toString())
+ << " with " << numElements << " of " << capacity << ">";
+ return buffer.str();
+}
+
+void ListVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ offsets.resize(cap + 1);
+ }
+}
+
+uint64_t ListVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) +
+ (elements == nullptr ? 0 : elements->getMemoryUsage());
+}
+
+bool ListVectorBatch::hasVariableLength() { return true; }
+
+MapVectorBatch::MapVectorBatch(uint64_t cap,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(cap, pool), offsets(pool, cap + 1) {
+ // PASS
+}
+
+MapVectorBatch::~MapVectorBatch() {
+ // PASS
+}
+
+std::string MapVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Map vector <" << (keys == nullptr ? "" : keys->toString()) << ", "
+ << (elements == nullptr ? "" : elements->toString()) << " with "
+ << numElements << " of " << capacity << ">";
+ return buffer.str();
+}
+
+void MapVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ offsets.resize(cap + 1);
+ }
+}
+
+uint64_t MapVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(offsets.capacity() * sizeof(int64_t)) +
+ (keys == nullptr ? 0 : keys->getMemoryUsage()) +
+ (elements == nullptr ? 0 : elements->getMemoryUsage());
+}
+
+bool MapVectorBatch::hasVariableLength() { return true; }
+
+UnionVectorBatch::UnionVectorBatch(uint64_t cap,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(cap, pool), tags(pool, cap), offsets(pool, cap) {
+ // PASS
+}
+
+UnionVectorBatch::~UnionVectorBatch() {
+ for (uint64_t i = 0; i < children.size(); i++) {
+ delete children[i];
+ }
+}
+
+std::string UnionVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Union vector <";
+ for (size_t i = 0; i < children.size(); ++i) {
+ if (i != 0) {
+ buffer << ", ";
+ }
+ buffer << children[i]->toString();
+ }
+ buffer << "; with " << numElements << " of " << capacity << ">";
+ return buffer.str();
+}
+
+void UnionVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ tags.resize(cap);
+ offsets.resize(cap);
+ }
+}
+
+uint64_t UnionVectorBatch::getMemoryUsage() {
+ uint64_t memory =
+ ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>(tags.capacity() * sizeof(unsigned char) +
+ offsets.capacity() * sizeof(uint64_t));
+ for (size_t i = 0; i < children.size(); ++i) {
+ memory += children[i]->getMemoryUsage();
+ }
+ return memory;
+}
+
+bool UnionVectorBatch::hasVariableLength() {
+ for (size_t i = 0; i < children.size(); ++i) {
+ if (children[i]->hasVariableLength()) {
+ return true;
+ }
+ }
+ return false;
+}
+
+Decimal64VectorBatch::Decimal64VectorBatch(
+ uint64_t cap,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(cap, pool),
+ precision(0),
+ scale(0),
+ values(pool, cap),
+ highbitValues(pool, cap),
+ readScales(pool, cap) {
+ // PASS
+}
+
+Decimal64VectorBatch::~Decimal64VectorBatch() {
+ // PASS
+}
+
+std::string Decimal64VectorBatch::toString() const {
+  // Same "<numElements of capacity>" layout as the other batch descriptions.
+  std::ostringstream buffer;
+  buffer << "Decimal64 vector <" << numElements << " of " << capacity << ">";
+  return buffer.str();
+}
+
+void Decimal64VectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ values.resize(cap);
+ highbitValues.resize(cap);
+ readScales.resize(cap);
+ }
+}
+
+uint64_t Decimal64VectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>((values.capacity() + highbitValues.capacity() +
+ readScales.capacity()) *
+ sizeof(int64_t));
+}
+
+Decimal128VectorBatch::Decimal128VectorBatch(
+ uint64_t cap, dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(cap, pool),
+ precision(0),
+ scale(0),
+ values(pool, cap),
+ highbitValues(pool, cap),
+ lowbitValues(pool, cap),
+ readScales(pool, cap) {
+ // PASS
+}
+
+Decimal128VectorBatch::~Decimal128VectorBatch() {
+ // PASS
+}
+
+std::string Decimal128VectorBatch::toString() const {
+  // Same "<numElements of capacity>" layout as the other batch descriptions.
+  std::ostringstream buffer;
+  buffer << "Decimal128 vector <" << numElements << " of " << capacity << ">";
+  return buffer.str();
+}
+
+void Decimal128VectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ highbitValues.resize(cap);
+ lowbitValues.resize(cap);
+ values.resize(cap);
+ readScales.resize(cap);
+ }
+}
+
+uint64_t Decimal128VectorBatch::getMemoryUsage() {
+  // Bytes of all four buffers; `values` is sized by its own element type.
+  uint64_t bytes = values.capacity() * sizeof(*values.data());
+  bytes += (highbitValues.capacity() + lowbitValues.capacity() +
+            readScales.capacity()) * sizeof(int64_t);
+  return ColumnVectorBatch::getMemoryUsage() + bytes;
+}
+
+Decimal::Decimal(const Int128& _value, int32_t _scale)
+ : value(_value), scale(_scale) {
+ // PASS
+}
+
+Decimal::Decimal(const std::string& str) {
+ std::size_t foundPoint = str.find(".");
+ // no decimal point, it is int
+ if (foundPoint == std::string::npos) {
+ value = Int128(str);
+ scale = 0;
+ } else {
+ std::string copy(str);
+ scale = static_cast<int32_t>(str.length() - foundPoint - 1);
+ value = Int128(copy.replace(foundPoint, 1, ""));
+ }
+}
+
+std::string Decimal::toString() const { return value.toDecimalString(scale); }
+
+bool Decimal::operator<(const Decimal& right) const {
+ orc::Int128 left_value = value;
+ orc::Int128 right_value = right.value;
+ orc::Int128 integral1 = scaleDownInt128ByPowerOfTen(left_value, scale);
+ orc::Int128 integral2 = scaleDownInt128ByPowerOfTen(right_value, right.scale);
+ if (integral1 < integral2) {
+ return true;
+ } else if (integral1 > integral2) {
+ return false;
+ }
+
+ bool overflow = false;
+ bool positive = left_value >= 0;
+ left_value -= scaleUpInt128ByPowerOfTen(integral1, scale, overflow);
+ right_value -= scaleUpInt128ByPowerOfTen(integral2, right.scale, overflow);
+ int32_t diff = scale - right.scale;
+ if (diff > 0) {
+ right_value = scaleUpInt128ByPowerOfTen(right_value, diff, overflow);
+ if (overflow) {
+ return positive ? true : false;
+ }
+ } else {
+ left_value = scaleUpInt128ByPowerOfTen(left_value, -diff, overflow);
+ if (overflow) {
+ return positive ? false : true;
+ }
+ }
+
+ if (left_value < right_value) {
+ return true;
+ }
+ return false;
+}
+
+bool Decimal::operator>(const Decimal& right) const {
+ orc::Int128 left_value = value;
+ orc::Int128 right_value = right.value;
+ orc::Int128 integral1 = scaleDownInt128ByPowerOfTen(left_value, scale);
+ orc::Int128 integral2 = scaleDownInt128ByPowerOfTen(right_value, right.scale);
+ if (integral1 > integral2) {
+ return true;
+ } else if (integral1 < integral2) {
+ return false;
+ }
+
+ bool overflow = false;
+ bool positive = left_value >= 0;
+ left_value -= scaleUpInt128ByPowerOfTen(integral1, scale, overflow);
+ right_value -= scaleUpInt128ByPowerOfTen(integral2, right.scale, overflow);
+ int32_t diff = scale - right.scale;
+ if (diff > 0) {
+ right_value = scaleUpInt128ByPowerOfTen(right_value, diff, overflow);
+ if (overflow) {
+ return positive ? false : true;
+ }
+ } else {
+ left_value = scaleUpInt128ByPowerOfTen(left_value, -diff, overflow);
+ if (overflow) {
+ return positive ? true : false;
+ }
+ }
+
+ if (left_value > right_value) {
+ return true;
+ }
+ return false;
+}
+
+TimestampVectorBatch::TimestampVectorBatch(
+ uint64_t capacity, dbcommon::MemoryPool& pool) // NOLINT
+ : ColumnVectorBatch(capacity, pool),
+ data(pool, capacity),
+ nanoseconds(pool, capacity) {
+ // PASS
+}
+
+TimestampVectorBatch::~TimestampVectorBatch() {
+ // PASS
+}
+
+std::string TimestampVectorBatch::toString() const {
+ std::ostringstream buffer;
+ buffer << "Timestamp vector <" << numElements << " of " << capacity << ">";
+ return buffer.str();
+}
+
+void TimestampVectorBatch::resize(uint64_t cap) {
+ if (capacity < cap) {
+ ColumnVectorBatch::resize(cap);
+ data.resize(cap);
+ nanoseconds.resize(cap);
+ }
+}
+
+uint64_t TimestampVectorBatch::getMemoryUsage() {
+ return ColumnVectorBatch::getMemoryUsage() +
+ static_cast<uint64_t>((data.capacity() + nanoseconds.capacity()) *
+ sizeof(int64_t));
+}
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/vector.h b/depends/storage/src/storage/format/orc/vector.h
new file mode 100644
index 0000000..d88518c
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/vector.h
@@ -0,0 +1,704 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_VECTOR_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_VECTOR_H_
+
+#include <cstdlib>
+#include <cstring>
+#include <iostream>
+#include <list>
+#include <memory>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "dbcommon/common/vector.h"
+
+#include "storage/format/orc/data-buffer.h"
+#include "storage/format/orc/exceptions.h"
+#include "storage/format/orc/int128.h"
+#include "storage/format/orc/type.h"
+
+namespace orc {
+// The base class for each of the column vectors. This class handles
+// the generic attributes such as number of elements, capacity, and
+// notNull vector.
+struct ColumnVectorBatch {
+ explicit ColumnVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool); // NOLINT
+ virtual ~ColumnVectorBatch();
+
+ // the number of slots available
+ uint64_t capacity;
+ // the number of current occupied slots
+ uint64_t numElements;
+ // an array of capacity length marking non-null values
+ DataBuffer<char> notNull;
+ // whether there are any null values
+ bool hasNulls;
+
+ // stripe statistics part
+ bool hasStats;
+ dbcommon::VectorStatistics stats;
+
+ // custom memory pool
+ dbcommon::MemoryPool& memoryPool;
+
+ // Generate a description of this vector as a string.
+ virtual std::string toString() const = 0;
+
+ // Change the number of slots to at least the given capacity.
+ // This function is not recursive into subtypes.
+ virtual void resize(uint64_t capacity);
+
+ // Heap memory used by the batch.
+ virtual uint64_t getMemoryUsage();
+
+ // Check whether the batch length varies depending on data.
+ virtual bool hasVariableLength() = 0;
+
+ // Get the type
+ virtual ORCTypeKind getType() = 0;
+
+ // Get the data array pointer
+ virtual const char* getData() const = 0;
+ virtual const char* getNanoseconds() const { return nullptr; }
+ virtual const char* getAuxiliaryData() const { return nullptr; }
+ virtual const char* getScaleData() const { return nullptr; }
+
+ virtual uint32_t getWidth() = 0;
+
+ char* getNotNull() { return notNull.data(); }
+
+ // Build the corresponding dbcommon vector
+ virtual std::unique_ptr<dbcommon::Vector> buildVector() = 0;
+ virtual std::unique_ptr<dbcommon::Vector> buildVector(
+ dbcommon::TypeKind type) {
+ return nullptr;
+ }
+
+ private:
+ ColumnVectorBatch(const ColumnVectorBatch&);
+ ColumnVectorBatch& operator=(const ColumnVectorBatch&);
+};
+
+template <class ElementType>
+struct FixedSizeVectorBatch : public ColumnVectorBatch {
+  // Batch whose values are stored in a single DataBuffer<ElementType>.
+  explicit FixedSizeVectorBatch(uint64_t capacity,
+                                dbcommon::MemoryPool& pool)  // NOLINT
+      : ColumnVectorBatch(capacity, pool), data(pool, capacity) {}
+
+  virtual ~FixedSizeVectorBatch() {}
+
+  std::string toString() const override {
+    std::ostringstream buffer;
+    buffer << "Integer vector <" << numElements << " of " << capacity << ">";
+    return buffer.str();
+  }
+
+  void resize(uint64_t cap) override {
+    if (capacity >= cap) return;  // only grows, never shrinks
+    ColumnVectorBatch::resize(cap);
+    data.resize(cap);
+  }
+
+  bool hasVariableLength() override { return false; }
+
+  uint64_t getMemoryUsage() override {
+    return ColumnVectorBatch::getMemoryUsage() +
+           static_cast<uint64_t>(data.capacity() * sizeof(ElementType));
+  }
+
+  const char* getData() const override {  // same type as the base virtual
+    return reinterpret_cast<const char*>(data.data());
+  }
+
+  uint32_t getWidth() override { return sizeof(ElementType); }
+
+  DataBuffer<ElementType> data;
+};
+
+struct LongVectorBatch : public FixedSizeVectorBatch<int64_t> {
+ explicit LongVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<int64_t>(capacity, pool) {}
+ virtual ~LongVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::LONG; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::BIGINTID,
+ hasStats);
+ }
+};
+
+struct IntVectorBatch : public FixedSizeVectorBatch<int32_t> {
+ explicit IntVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<int32_t>(capacity, pool) {}
+ virtual ~IntVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::INT; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::INTID, hasStats);
+ }
+};
+
+struct ByteVectorBatch : public FixedSizeVectorBatch<int8_t> {
+ explicit ByteVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<int8_t>(capacity, pool) {}
+ virtual ~ByteVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::BYTE; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::TINYINTID,
+ hasStats);
+ }
+};
+
+struct ShortVectorBatch : public FixedSizeVectorBatch<int16_t> {
+ explicit ShortVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<int16_t>(capacity, pool) {}
+ virtual ~ShortVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::SHORT; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::SMALLINTID,
+ hasStats);
+ }
+};
+
+struct FloatVectorBatch : public FixedSizeVectorBatch<float> {
+ explicit FloatVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<float>(capacity, pool) {}
+ virtual ~FloatVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::FLOAT; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::FLOATID, hasStats);
+ }
+};
+
+struct DoubleVectorBatch : public FixedSizeVectorBatch<double> {
+ explicit DoubleVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<double>(capacity, pool) {}
+ virtual ~DoubleVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::DOUBLE; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::DOUBLEID,
+ hasStats);
+ }
+};
+
+struct BooleanVectorBatch : public FixedSizeVectorBatch<bool> {
+ explicit BooleanVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool) // NOLINT
+ : FixedSizeVectorBatch<bool>(capacity, pool) {}
+ virtual ~BooleanVectorBatch() {}
+
+ ORCTypeKind getType() override { return ORCTypeKind::BOOLEAN; }
+
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ return dbcommon::Vector::BuildVector(dbcommon::TypeKind::BOOLEANID,
+ hasStats);
+ }
+};
+
+struct BytesVectorBatch : public ColumnVectorBatch {
+  // Shared base of the char, varchar, string and binary batches.
+  virtual ~BytesVectorBatch();
+  std::string toString() const override;
+  void resize(uint64_t capacity) override;
+  uint64_t getMemoryUsage() override;
+
+  // pointers to the start of each string
+  DataBuffer<char*> data;
+  // the length of each string
+  DataBuffer<int64_t> length;
+  // whether the values use a direct encoding
+  bool isDirectEncoding = false;
+
+  ORCTypeKind getType() override = 0;
+
+  const char* getData() const override {  // same type as the base virtual
+    return reinterpret_cast<const char*>(data.data());
+  }
+
+  uint32_t getWidth() override { return 0; }  // elements have no fixed width
+  bool hasVariableLength() override { return true; }
+
+  std::unique_ptr<dbcommon::Vector> buildVector() override = 0;
+
+ protected:
+  explicit BytesVectorBatch(uint64_t capacity,
+                            dbcommon::MemoryPool& pool);  // NOLINT
+  int64_t maxLenModifier_ = -1;  // overwritten by the char/varchar ctors
+};
+
+// Bytes batch for the ORC CHAR kind. The max-length modifier defaults to 1
+// and is asserted to differ from -1.
+struct BlankPaddedCharVectorBatch : public BytesVectorBatch {
+  explicit BlankPaddedCharVectorBatch(uint64_t capacity,
+                                      dbcommon::MemoryPool& pool,  // NOLINT
+                                      int64_t maxLenModifier = 1)
+      : BytesVectorBatch(capacity, pool) {
+    assert(maxLenModifier != -1);
+    maxLenModifier_ = maxLenModifier;
+  }
+
+  ORCTypeKind getType() override { return ORCTypeKind::CHAR; }
+
+  // Builds a CHARID vector whose type modifier is derived from the stored
+  // maximum length.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    auto typeModifier =
+        dbcommon::TypeModifierUtil::getTypeModifierFromMaxLength(
+            maxLenModifier_);
+    return dbcommon::Vector::BuildVector(dbcommon::TypeKind::CHARID, hasStats,
+                                         typeModifier);
+  }
+};
+// Bytes batch for the ORC VARCHAR kind. The max-length modifier defaults
+// to -1.
+struct VaryingCharVectorBatch : public BytesVectorBatch {
+  explicit VaryingCharVectorBatch(uint64_t capacity,
+                                  dbcommon::MemoryPool& pool,  // NOLINT
+                                  int64_t maxLenModifier = -1)
+      : BytesVectorBatch(capacity, pool) {
+    maxLenModifier_ = maxLenModifier;
+  }
+
+  ORCTypeKind getType() override { return ORCTypeKind::VARCHAR; }
+
+  // Builds a VARCHARID vector whose type modifier is derived from the stored
+  // maximum length.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    auto typeModifier =
+        dbcommon::TypeModifierUtil::getTypeModifierFromMaxLength(
+            maxLenModifier_);
+    return dbcommon::Vector::BuildVector(dbcommon::TypeKind::VARCHARID,
+                                         hasStats, typeModifier);
+  }
+};
+// Bytes batch for the ORC STRING kind.
+struct StringVectorBatch : public BytesVectorBatch {
+  explicit StringVectorBatch(uint64_t capacity,
+                             dbcommon::MemoryPool& pool)  // NOLINT
+      : BytesVectorBatch(capacity, pool) {}
+
+  ORCTypeKind getType() override { return ORCTypeKind::STRING; }
+
+  // Builds a dbcommon vector of kind STRINGID, forwarding hasStats.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::STRINGID;
+    return dbcommon::Vector::BuildVector(kind, hasStats);
+  }
+};
+// Bytes batch for the ORC BINARY kind.
+struct BinaryVectorBatch : public BytesVectorBatch {
+  explicit BinaryVectorBatch(uint64_t capacity,
+                             dbcommon::MemoryPool& pool)  // NOLINT
+      : BytesVectorBatch(capacity, pool) {}
+
+  ORCTypeKind getType() override { return ORCTypeKind::BINARY; }
+
+  // Builds a dbcommon vector of kind BINARYID, forwarding hasStats.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::BINARYID;
+    return dbcommon::Vector::BuildVector(kind, hasStats);
+  }
+};
+
+// Column batch for the ORC STRUCT kind. It carries one child batch per field
+// and offers no flat data access: getData(), getWidth() and buildVector()
+// only report an internal error.
+struct StructVectorBatch : public ColumnVectorBatch {
+ explicit StructVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool); // NOLINT
+ virtual ~StructVectorBatch();
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
+
+ // Child batches, stored as raw pointers.
+ // NOTE(review): ownership is decided by the out-of-line destructor — confirm.
+ std::vector<ColumnVectorBatch*> fields;
+
+ ORCTypeKind getType() override { return ORCTypeKind::STRUCT; }
+
+ // Not supported for structs.
+ // NOTE(review): no return statement follows; presumably LOG_ERROR does not
+ // return (e.g. throws) — confirm.
+ char* getData() const override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented getData for StructVectorBatch");
+ }
+
+ // Not supported for structs (same caveat as getData()).
+ uint32_t getWidth() override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented getWidth for StructVectorBatch");
+ }
+
+ // Not supported for structs (same caveat as getData()).
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented buildVector for StructVectorBatch");
+ }
+};
+
+// Column batch for the ORC LIST kind: an offsets buffer plus one child batch
+// holding the concatenated elements of all lists.
+struct ListVectorBatch : public ColumnVectorBatch {
+  explicit ListVectorBatch(uint64_t capacity,
+                           dbcommon::MemoryPool& pool);  // NOLINT
+  virtual ~ListVectorBatch();
+  std::string toString() const override;
+  void resize(uint64_t capacity) override;
+  uint64_t getMemoryUsage() override;
+  bool hasVariableLength() override;
+
+  // The offset of the first element of each list.
+  // The length of list i is offsets[i+1] - offsets[i].
+  DataBuffer<int64_t> offsets;
+
+  // the concatenated elements
+  std::unique_ptr<ColumnVectorBatch> elements;
+
+  ORCTypeKind getType() override { return ORCTypeKind::LIST; }
+
+  // Flat data access is not available for lists; reports an internal error.
+  char* getData() const override {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "not implemented getData for ListVectorBatch");
+  }
+
+  // Width is not defined for lists; reports an internal error.
+  uint32_t getWidth() override {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "not implemented getWidth for ListVectorBatch");
+  }
+
+  // The argument-less form is not supported; use buildVector(ORCTypeKind).
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "not implemented buildVector for ListVectorBatch");
+  }
+
+  // Builds the array vector matching the given element kind. Only SHORT, INT,
+  // LONG, FLOAT and DOUBLE elements are handled; any other kind is reported
+  // as an unsupported feature.
+  std::unique_ptr<dbcommon::Vector> buildVector(ORCTypeKind type) {
+    switch (type) {
+      case orc::ORCTypeKind::SHORT:
+        return dbcommon::Vector::BuildVector(
+            dbcommon::TypeKind::SMALLINTARRAYID, hasStats);
+
+      case orc::ORCTypeKind::INT:
+        return dbcommon::Vector::BuildVector(dbcommon::TypeKind::INTARRAYID,
+                                             hasStats);
+
+      case orc::ORCTypeKind::LONG:
+        return dbcommon::Vector::BuildVector(
+            dbcommon::TypeKind::BIGINTARRAYID, hasStats);
+
+      case orc::ORCTypeKind::FLOAT:
+        return dbcommon::Vector::BuildVector(dbcommon::TypeKind::FLOATARRAYID,
+                                             hasStats);
+
+      case orc::ORCTypeKind::DOUBLE:
+        return dbcommon::Vector::BuildVector(
+            dbcommon::TypeKind::DOUBLEARRAYID, hasStats);
+
+      default:
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+                  "vector type %d is not supported yet", type);
+    }
+  }
+};
+
+// Column batch for the ORC MAP kind: an offsets buffer plus two child batches
+// holding the concatenated keys and the concatenated values. Flat access
+// (getData/getWidth/buildVector) is not implemented.
+struct MapVectorBatch : public ColumnVectorBatch {
+ explicit MapVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool); // NOLINT
+ virtual ~MapVectorBatch();
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
+
+ // The offset of the first entry of each map.
+ // The number of entries in map i is offsets[i+1] - offsets[i].
+ DataBuffer<int64_t> offsets;
+
+ // the concatenated keys
+ std::unique_ptr<ColumnVectorBatch> keys;
+ // the concatenated elements
+ std::unique_ptr<ColumnVectorBatch> elements;
+
+ ORCTypeKind getType() override { return ORCTypeKind::MAP; }
+
+ // Not supported for maps.
+ // NOTE(review): no return statement follows; presumably LOG_ERROR does not
+ // return (e.g. throws) — confirm.
+ char* getData() const override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented getData for MapVectorBatch");
+ }
+
+ // Not supported for maps (same caveat as getData()).
+ uint32_t getWidth() override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented getWidth for MapVectorBatch");
+ }
+
+ // Not supported for maps (same caveat as getData()).
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented buildVector for MapVectorBatch");
+ }
+};
+
+// Column batch for the ORC UNION kind: per-value tags and offsets select a
+// value inside one of the child batches. Flat access
+// (getData/getWidth/buildVector) is not implemented.
+struct UnionVectorBatch : public ColumnVectorBatch {
+ explicit UnionVectorBatch(uint64_t capacity,
+ dbcommon::MemoryPool& pool); // NOLINT
+ virtual ~UnionVectorBatch();
+ std::string toString() const override;
+ void resize(uint64_t capacity) override;
+ uint64_t getMemoryUsage() override;
+ bool hasVariableLength() override;
+
+ // For each value, which element of children has the value.
+ DataBuffer<unsigned char> tags;
+
+ // For each value, the index inside of the child ColumnVectorBatch.
+ DataBuffer<uint64_t> offsets;
+
+ // the sub-columns, stored as raw pointers
+ // NOTE(review): ownership is decided by the out-of-line destructor — confirm.
+ std::vector<ColumnVectorBatch*> children;
+
+ ORCTypeKind getType() override { return ORCTypeKind::UNION; }
+
+ // Not supported for unions.
+ // NOTE(review): no return statement follows; presumably LOG_ERROR does not
+ // return (e.g. throws) — confirm.
+ char* getData() const override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented getData for UnionVectorBatch");
+ }
+
+ // Not supported for unions (same caveat as getData()).
+ uint32_t getWidth() override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented getWidth for UnionVectorBatch");
+ }
+
+ // Not supported for unions (same caveat as getData()).
+ std::unique_ptr<dbcommon::Vector> buildVector() override {
+ LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+ "not implemented buildVector for UnionVectorBatch");
+ }
+};
+
+// A decimal number represented as an Int128 value paired with a scale.
+// All member functions are defined out of line.
+struct Decimal {
+ // Builds a decimal from an explicit value and scale.
+ Decimal(const Int128& value, int32_t scale);
+ // Builds a decimal from its textual form (parsing rules are defined in the
+ // implementation file).
+ explicit Decimal(const std::string& value);
+ bool operator<(const Decimal& right) const;
+ bool operator>(const Decimal& right) const;
+
+ std::string toString() const;
+ // NOTE(review): presumably the unscaled integer value — confirm.
+ Int128 value;
+ // NOTE(review): presumably the number of digits after the decimal point, as
+ // in the vector batch structs — confirm.
+ int32_t scale;
+};
+
+// Column batch for ORC DECIMAL values. It keeps three parallel int64_t
+// buffers (values, highbitValues, readScales) that are exposed through
+// getData(), getAuxiliaryData() and getScaleData() respectively.
+struct Decimal64VectorBatch : public ColumnVectorBatch {
+  explicit Decimal64VectorBatch(uint64_t capacity,
+                                dbcommon::MemoryPool& pool);  // NOLINT
+  virtual ~Decimal64VectorBatch();
+  std::string toString() const override;
+  void resize(uint64_t capacity) override;
+  uint64_t getMemoryUsage() override;
+
+  // total number of digits
+  int32_t precision;
+  // the number of places after the decimal
+  int32_t scale;
+
+  // the numeric values
+  DataBuffer<int64_t> values;
+  DataBuffer<int64_t> highbitValues;
+  DataBuffer<int64_t> readScales;
+
+  ORCTypeKind getType() override { return ORCTypeKind::DECIMAL; }
+
+  // Raw bytes of the `values` buffer.
+  const char* getData() const override {
+    return reinterpret_cast<const char*>(values.data());
+  }
+
+  // Raw bytes of the `highbitValues` buffer.
+  const char* getAuxiliaryData() const override {
+    return reinterpret_cast<const char*>(highbitValues.data());
+  }
+
+  // Raw bytes of the `readScales` buffer.
+  const char* getScaleData() const override {
+    return reinterpret_cast<const char*>(readScales.data());
+  }
+
+  // Combined size of one entry from each of the three buffers.
+  uint32_t getWidth() override {
+    return sizeof(int64_t) + sizeof(int64_t) + sizeof(int64_t);
+  }
+
+  bool hasVariableLength() override { return false; }
+
+  // Builds a DECIMALNEWID vector.
+  // NOTE(review): this overload passes `true` rather than hasStats, unlike
+  // buildVector(TypeKind) below — confirm this is intended.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    return dbcommon::Vector::BuildVector(dbcommon::TypeKind::DECIMALNEWID,
+                                         true);
+  }
+
+  // Builds a DECIMALNEWID vector; the requested `type` is ignored.
+  std::unique_ptr<dbcommon::Vector> buildVector(
+      dbcommon::TypeKind type) override {
+    return dbcommon::Vector::BuildVector(dbcommon::TypeKind::DECIMALNEWID,
+                                         hasStats);
+  }
+
+  // Formats an unscaled integer as a decimal string with `scale` digits after
+  // the decimal point, e.g. (12345, 2) -> "123.45" and (-5, 3) -> "-0.005".
+  // A scale of 0 yields the plain integer text.
+  std::string toDecimalString(int64_t value, int32_t scale) {
+    std::stringstream buffer;
+    if (scale == 0) {
+      buffer << value;
+      return buffer.str();
+    }
+    std::string sign = "";
+    // Take the magnitude in unsigned arithmetic: negating INT64_MIN as a
+    // signed value is undefined behaviour and would keep the minus sign in
+    // the digit string.
+    uint64_t magnitude = static_cast<uint64_t>(value);
+    if (value < 0) {
+      sign = "-";
+      magnitude = 0 - magnitude;
+    }
+    buffer << magnitude;
+    std::string str = buffer.str();
+    int32_t len = static_cast<int32_t>(str.length());
+    if (len > scale) {
+      // Enough digits: split into integer and fractional parts.
+      return sign + str.substr(0, static_cast<size_t>(len - scale)) + "." +
+             str.substr(static_cast<size_t>(len - scale),
+                        static_cast<size_t>(scale));
+    } else if (len == scale) {
+      return sign + "0." + str;
+    } else {
+      // Fewer digits than the scale: left-pad the fraction with zeros.
+      std::string result = sign + "0.";
+      for (int32_t i = 0; i < scale - len; ++i) {
+        result += "0";
+      }
+      return result + str;
+    }
+  }
+
+ protected:
+  friend class Decimal64ColumnReader;
+};
+
+// Column batch for ORC DECIMAL values that need 128 bits. Besides the Int128
+// `values` buffer it keeps split high/low 64-bit halves and per-value scales;
+// the split buffers are what getData(), getAuxiliaryData() and getScaleData()
+// expose.
+struct Decimal128VectorBatch : public ColumnVectorBatch {
+  explicit Decimal128VectorBatch(uint64_t capacity,
+                                 dbcommon::MemoryPool& pool);  // NOLINT
+  virtual ~Decimal128VectorBatch();
+  std::string toString() const override;
+  void resize(uint64_t capacity) override;
+  uint64_t getMemoryUsage() override;
+
+  // total number of digits
+  int32_t precision;
+  // the number of places after the decimal
+  int32_t scale;
+
+  // the numeric values
+  DataBuffer<Int128> values;
+  DataBuffer<int64_t> highbitValues;
+  DataBuffer<uint64_t> lowbitValues;
+  DataBuffer<int64_t> readScales;
+
+  ORCTypeKind getType() override { return ORCTypeKind::DECIMAL; }
+
+  // Raw bytes of the `lowbitValues` buffer.
+  const char* getData() const override {
+    const uint64_t* low = lowbitValues.data();
+    return reinterpret_cast<const char*>(low);
+  }
+
+  // Raw bytes of the `highbitValues` buffer.
+  const char* getAuxiliaryData() const override {
+    const int64_t* high = highbitValues.data();
+    return reinterpret_cast<const char*>(high);
+  }
+
+  // Raw bytes of the `readScales` buffer.
+  const char* getScaleData() const override {
+    const int64_t* scales = readScales.data();
+    return reinterpret_cast<const char*>(scales);
+  }
+
+  // Combined size of one low word, one high word and one scale entry.
+  uint32_t getWidth() override {
+    return sizeof(uint64_t) + sizeof(int64_t) + sizeof(int64_t);
+  }
+
+  bool hasVariableLength() override { return false; }
+
+  // Builds a DECIMALNEWID vector; this overload always passes `true` as the
+  // second argument.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::DECIMALNEWID;
+    return dbcommon::Vector::BuildVector(kind, true);
+  }
+
+  // Builds a DECIMALNEWID vector with hasStats; the requested `type` is
+  // ignored.
+  std::unique_ptr<dbcommon::Vector> buildVector(
+      dbcommon::TypeKind type) override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::DECIMALNEWID;
+    return dbcommon::Vector::BuildVector(kind, hasStats);
+  }
+
+ protected:
+  friend class Decimal128ColumnReader;
+  friend class DecimalHive11ColumnReader;
+};
+
+// Fixed-size column batch for the ORC DATE kind, stored as int32_t values.
+struct DateVectorBatch : public FixedSizeVectorBatch<int32_t> {
+  explicit DateVectorBatch(uint64_t capacity,
+                           dbcommon::MemoryPool& pool)  // NOLINT
+      : FixedSizeVectorBatch<int32_t>(capacity, pool) {}
+  virtual ~DateVectorBatch() {}
+
+  ORCTypeKind getType() override { return ORCTypeKind::DATE; }
+
+  // Builds a dbcommon vector of kind DATEID, forwarding hasStats.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::DATEID;
+    return dbcommon::Vector::BuildVector(kind, hasStats);
+  }
+};
+
+// Fixed-size column batch for the ORC TIME kind, stored as int64_t values.
+struct TimeVectorBatch : public FixedSizeVectorBatch<int64_t> {
+  explicit TimeVectorBatch(uint64_t capacity,
+                           dbcommon::MemoryPool& pool)  // NOLINT
+      : FixedSizeVectorBatch<int64_t>(capacity, pool) {}
+  virtual ~TimeVectorBatch() {}
+
+  ORCTypeKind getType() override { return ORCTypeKind::TIME; }
+
+  // Builds a dbcommon vector of kind TIMEID, forwarding hasStats.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::TIMEID;
+    return dbcommon::Vector::BuildVector(kind, hasStats);
+  }
+};
+
+// Column batch for timestamp values.
+// Each timestamp is split into two parallel int64_t buffers: the time_t part
+// (seconds since 1 Jan 1970 00:00:00) and the nanoseconds within that second.
+struct TimestampVectorBatch : public ColumnVectorBatch {
+  explicit TimestampVectorBatch(uint64_t capacity,
+                                dbcommon::MemoryPool& pool);  // NOLINT
+  virtual ~TimestampVectorBatch();
+  std::string toString() const override;
+  void resize(uint64_t capacity) override;
+  uint64_t getMemoryUsage() override;
+
+  // the number of seconds past 1 Jan 1970 00:00 UTC (aka time_t)
+  DataBuffer<int64_t> data;
+
+  // the nanoseconds of each value
+  DataBuffer<int64_t> nanoseconds;
+
+  ORCTypeKind getType() override { return ORCTypeKind::TIMESTAMP; }
+
+  // Raw bytes of the seconds buffer.
+  const char* getData() const override {
+    const int64_t* seconds = data.data();
+    return reinterpret_cast<const char*>(seconds);
+  }
+
+  // Raw bytes of the nanoseconds buffer.
+  const char* getNanoseconds() const override {
+    const int64_t* nanos = nanoseconds.data();
+    return reinterpret_cast<const char*>(nanos);
+  }
+
+  // One seconds entry plus one nanoseconds entry.
+  uint32_t getWidth() override { return sizeof(int64_t) + sizeof(int64_t); }
+
+  bool hasVariableLength() override { return false; }
+
+  // Builds a dbcommon vector of kind TIMESTAMPID, forwarding hasStats.
+  std::unique_ptr<dbcommon::Vector> buildVector() override {
+    const dbcommon::TypeKind kind = dbcommon::TypeKind::TIMESTAMPID;
+    return dbcommon::Vector::BuildVector(kind, hasStats);
+  }
+
+  // Builds a dbcommon vector of the caller-supplied kind.
+  std::unique_ptr<dbcommon::Vector> buildVector(
+      dbcommon::TypeKind type) override {
+    return dbcommon::Vector::BuildVector(type, hasStats);
+  }
+};
+
+} // namespace orc
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_VECTOR_H_
diff --git a/depends/storage/src/storage/format/orc/writer.cc b/depends/storage/src/storage/format/orc/writer.cc
new file mode 100644
index 0000000..7bf181e
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/writer.cc
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/writer.h"
+#include "dbcommon/utils/comp/snappy-compressor.h"
+
+namespace orc {
+
+// Factory: wraps the output stream and options in a WriterImpl and hands it
+// back through the Writer interface.
+std::unique_ptr<Writer> createWriter(std::unique_ptr<OutputStream> stream,
+                                     WriterOptions* options) {
+  std::unique_ptr<Writer> result(new WriterImpl(std::move(stream), options));
+  return result;
+}
+
+// Creates the column writer matching `type`, recursing into sub-types for
+// STRUCT (all children) and LIST (the single element type).
+//
+// FLOAT and DOUBLE share DoubleColumnWriter; STRING, VARCHAR and CHAR share
+// StringColumnWriter; DECIMAL picks the 64-bit or 128-bit writer from the
+// type's precision. Any other kind is reported through LOG_ERROR.
+std::unique_ptr<ColumnWriter> ColumnWriter::buildColumnWriter(
+    const orc::Type* type, WriterOptions* options) {
+  std::unique_ptr<ColumnWriter> cw;
+
+  switch (type->getKind()) {
+    case orc::ORCTypeKind::STRUCT: {
+      cw.reset(new StructColumnWriter(type, options));
+      for (uint32_t i = 0; i < type->getSubtypeCount(); i++) {
+        cw->addChildWriter(buildColumnWriter(type->getSubtype(i), options));
+      }
+      break;
+    }
+    case orc::ORCTypeKind::LIST: {
+      cw.reset(new ListColumnWriter(type, options));
+      cw->addChildWriter(buildColumnWriter(type->getSubtype(0), options));
+      break;
+    }
+    case orc::ORCTypeKind::BYTE: {
+      cw.reset(new ByteColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::INT: {
+      cw.reset(new IntColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::LONG: {
+      cw.reset(new LongColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::SHORT: {
+      cw.reset(new ShortColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::FLOAT:
+    case orc::ORCTypeKind::DOUBLE: {
+      cw.reset(new DoubleColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::STRING:
+    case orc::ORCTypeKind::VARCHAR:
+    case orc::ORCTypeKind::CHAR: {
+      cw.reset(new StringColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::BOOLEAN: {
+      cw.reset(new BooleanColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::DATE: {
+      cw.reset(new DateColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::TIME: {
+      cw.reset(new TimeColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::BINARY: {
+      cw.reset(new BinaryColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::TIMESTAMP: {
+      cw.reset(new TimestampColumnWriter(type, options));
+      break;
+    }
+    case orc::ORCTypeKind::DECIMAL: {
+      if (type->getPrecision() <= Decimal64ColumnWriter::MAX_PRECISION_64)
+        cw.reset(new Decimal64ColumnWriter(type, options));
+      else
+        cw.reset(new Decimal128ColumnWriter(type, options));
+      break;
+    }
+    default:
+      // Cast explicitly so the argument matches the %d conversion.
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "type %d not supported",
+                static_cast<int>(type->getKind()));
+  }
+  // A local of the return type is moved implicitly; an explicit std::move
+  // here would only inhibit copy elision.
+  return cw;
+}
+
+// Fills in and serializes the postscript, then appends it to the output
+// stream followed by a single byte holding its serialized length.
+//
+// If the file has not been opened yet the header is written first. A
+// postscript longer than 255 bytes cannot be described by the one-byte length
+// field and is reported as an internal error before anything is written.
+void WriterImpl::writePostScript() {
+  postscript->set_compression(options->getProtoCompressionKind());
+  postscript->set_compressionblocksize(options->getCompressionBlockSize());
+  postscript->set_footerlength(this->fileFooterLen);
+  postscript->set_metadatalength(this->fileMetadataLen);
+  postscript->add_version(options->getFileVersion().getMajor());
+  postscript->add_version(options->getFileVersion().getMinor());
+  postscript->set_writerversion(WriterVersion_ORC_135);
+  postscript->set_magic(this->magicId);
+  std::string buffer = postscript->SerializeAsString();
+
+  // The length trailer is one byte; refuse to emit a silently truncated
+  // length, which would make the file unreadable.
+  if (buffer.size() > 255) {
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+              "ORC postscript is %zu bytes, exceeding the 255 byte limit",
+              buffer.size());
+  }
+
+  // NOTE(review): the reinterpret_cast assumes outStream always holds a
+  // GeneralFileOutputStream — confirm.
+  if (!(reinterpret_cast<GeneralFileOutputStream*>(outStream.get()))
+           ->fileopen())
+    this->writeHeader();
+  this->outStream->write(const_cast<char*>(buffer.data()), buffer.size());
+
+  uint8_t postscriptLen = static_cast<uint8_t>(buffer.size());
+  this->outStream->write(&postscriptLen, sizeof(uint8_t));
+}
+
+// Populates the file footer (lengths, row count, writer id, index stride,
+// types and file statistics), writes it through the compressor to the output
+// stream and records the resulting stream size in fileFooterLen.
+void WriterImpl::writeFileFooter(uint64_t bodyLength) {
+  auto* footer = fileFooter.get();
+
+  footer->set_headerlength(HEADER_LENGTH);
+  footer->set_contentlength(bodyLength);
+  footer->set_numberofrows(this->totalRows);
+  footer->set_writer(this->writerId);
+  const uint32_t indexStride =
+      static_cast<uint32_t>(options->getRowIndexStride());
+  footer->set_rowindexstride(indexStride);
+
+  // Types first, then the file-level statistics.
+  columnWriter->addTypeToFooter(footer);
+  columnWriter->writeFileStatistics(footer);
+
+  // Serialize and push the footer through the compressor.
+  const std::string serialized = footer->SerializeAsString();
+  compressor->write(serialized.data(), serialized.size());
+  compressor->flushToStream(this->outStream.get());
+
+  fileFooterLen = compressor->getStreamSize();
+
+  compressor->reset();
+}
+
+// Serializes the file metadata, writes it through the compressor to the
+// output stream and records the resulting stream size in fileMetadataLen.
+void WriterImpl::writeFileMetadata() {
+  const std::string serialized = fileMetadata->SerializeAsString();
+
+  compressor->write(serialized.data(), serialized.size());
+  compressor->flushToStream(this->outStream.get());
+  fileMetadataLen = compressor->getStreamSize();
+  compressor->reset();
+}
+
+// Appends every column of `tb` to the matching child column writer and
+// updates the row count of the current stripe.
+//
+// The root column writer is created lazily from the schema in the writer
+// options on first use. When the stripe's row count lands exactly on a
+// multiple of the row index stride, a bloom filter entry is added.
+void WriterImpl::writeTupleBatch(dbcommon::TupleBatch* tb) {
+  // Create the writer once, before the loop, so it also exists when the
+  // batch has no columns and the bloom filter call below is reached.
+  if (!columnWriter) {
+    orc::Type* type = options->getSchema();
+    columnWriter = ColumnWriter::buildColumnWriter(type, options);
+  }
+
+  uint32_t nCols = tb->getNumOfColumns();
+  for (uint32_t i = 0; i < nCols; i++) {
+    dbcommon::Vector* vec = tb->getColumn(i);
+    columnWriter->getChildWriter(i)->writeVector(vec);
+  }
+
+  this->numRowsInCurrentStripe += tb->getNumOfRows();
+  if (numRowsInCurrentStripe > 0 &&
+      numRowsInCurrentStripe % options->getRowIndexStride() == 0)
+    columnWriter->addBloomFilterEntry();
+}
+
+// Returns the sum of the lengths of all streams listed in the stripe footer.
+uint64_t WriterImpl::getStripeDataLength(proto::StripeFooter* footer) {
+  uint64_t total = 0;
+  const uint32_t streamCount = footer->streams_size();
+  for (uint32_t idx = 0; idx != streamCount; ++idx) {
+    total += footer->streams(idx).length();
+  }
+  return total;
+}
+
+// Serializes the stripe footer, writes it through the compressor to the
+// output stream and returns the compressor's stream size for it.
+uint64_t WriterImpl::writeStripeFooter() {
+  const std::string serialized = stripeFooter->SerializeAsString();
+  compressor->write(serialized.data(), serialized.size());
+  compressor->flushToStream(this->outStream.get());
+
+  // Capture the size before the compressor state is cleared.
+  const uint64_t footerLength = compressor->getStreamSize();
+  compressor->reset();
+  return footerLength;
+}
+
+// Writes the magic id at the current output position and marks the position
+// right after it as the start of the first stripe.
+void WriterImpl::writeHeader() {
+  std::string magic(this->magicId);
+  char* bytes = const_cast<char*>(magic.data());
+  this->outStream->write(bytes, magic.size());
+  this->stripeStart = this->outStream->getPosition();
+}
+
+// Flushes the current stripe: writes the header if the file is not open yet,
+// closes a partially filled bloom filter stride, writes the stripe data and
+// footer, records the stripe in the file footer, and finally applies block
+// padding.
+void WriterImpl::writeCurrentStrip() {
+  GeneralFileOutputStream* fileStream =
+      reinterpret_cast<GeneralFileOutputStream*>(outStream.get());
+  if (!fileStream->fileopen()) {
+    this->writeHeader();
+  }
+
+  // Rows beyond the last full stride still need their bloom filter entry.
+  const bool partialStride =
+      numRowsInCurrentStripe % options->getRowIndexStride() != 0;
+  if (partialStride) {
+    columnWriter->addBloomFilterEntry();
+  }
+
+  proto::StripeStatistics* stripeStats = fileMetadata->add_stripestats();
+  columnWriter->writeStripe(stripeFooter.get(), stripeStats,
+                            this->outStream.get());
+
+  const uint64_t footerLength = this->writeStripeFooter();
+  completeStripInfo(footerLength);
+
+  this->totalRows += this->numRowsInCurrentStripe;
+  this->stripeFooter->Clear();
+  padStripe();
+}
+
+// Adds a stripe entry to the file footer describing the stripe just written:
+// its start offset, data length (sum of stream lengths), footer length, an
+// index length of 0, and its row count.
+void WriterImpl::completeStripInfo(uint64_t stripFooterLen) {
+  const uint64_t dataLength = getStripeDataLength(this->stripeFooter.get());
+
+  currentStripe = fileFooter->add_stripes();
+  currentStripe->set_offset(this->stripeStart);
+  currentStripe->set_datalength(dataLength);
+  currentStripe->set_footerlength(stripFooterLen);
+  currentStripe->set_indexlength(0);
+  currentStripe->set_numberofrows(this->numRowsInCurrentStripe);
+}
+
+// Begins a new stripe at the current output position with a zero row count.
+void WriterImpl::startNewStripe() {
+  this->numRowsInCurrentStripe = 0;
+  this->stripeStart = this->outStream->getPosition();
+}
+
+// Decides, after a stripe has been written, whether to pad the output up to
+// the next block boundary and what stripe size to aim for next.
+//
+// `available` is the space left in the current block. When the remaining
+// fraction of a default stripe is below the padding tolerance, the rest of
+// the block is padded and the target stripe size returns to the default.
+// Otherwise the target is shrunk to fit the remaining space, reduced further
+// by how much the stripe just written overshot its target (capped at the
+// padding tolerance).
+void WriterImpl::padStripe() {
+  uint64_t currentStripeSize =
+      this->outStream->getPosition() - this->stripeStart;
+  uint64_t available =
+      this->options->getBlockSize() -
+      (this->outStream->getPosition() % this->options->getBlockSize());
+  uint64_t defaultStripeSize = this->options->getStripeSize();
+  // Overshoot of the stripe over its target, clamped at zero. A plain
+  // unsigned subtraction would wrap to a huge value whenever the stripe is
+  // smaller than the target and force the maximum correction below.
+  uint64_t overflow = currentStripeSize > adjustedStripeSize
+                          ? currentStripeSize - adjustedStripeSize
+                          : 0;
+  // The scale member is only set while it still holds the value 1.
+  if (scale == 1)
+    scale = static_cast<double>(currentStripeSize) /
+            static_cast<double>(defaultStripeSize);
+  double availRatio =
+      static_cast<double>(available) / static_cast<double>(defaultStripeSize);
+  double paddingTolerance = this->options->getPaddingTolerance();
+  if (availRatio > 0.0 && availRatio < 1.0 && availRatio > paddingTolerance) {
+    // Less than a full stripe fits: shrink the next stripe to the remaining
+    // space, corrected by the relative overshoot of this stripe.
+    double correction = overflow > 0
+                            ? static_cast<double>(overflow) /
+                                  static_cast<double>(adjustedStripeSize)
+                            : 0.0;
+    correction = correction > paddingTolerance ? paddingTolerance : correction;
+    adjustedStripeSize = static_cast<uint64_t>(
+        floor((1.0 - correction) * (availRatio * defaultStripeSize)));
+  } else if (availRatio >= 1.0) {
+    adjustedStripeSize = defaultStripeSize;
+  }
+  if (availRatio < paddingTolerance) {
+    // Too little room left to be worth a stripe: pad to the block boundary.
+    this->outStream->padding(available);
+    adjustedStripeSize = defaultStripeSize;
+    LOG_INFO(
+        "ORC stripe size: %llu bytes, block available: %llu bytes, "
+        "adjustedStripeSize: %llu bytes, padding then",
+        currentStripeSize, available, adjustedStripeSize);
+  } else {
+    LOG_INFO(
+        "ORC stripe size: %llu bytes, block available: %llu bytes, "
+        "adjustedStripeSize: %llu bytes, with no padding",
+        currentStripeSize, available, adjustedStripeSize);
+  }
+}
+
+} // end of namespace orc
diff --git a/depends/storage/src/storage/format/orc/writer.h b/depends/storage/src/storage/format/orc/writer.h
new file mode 100644
index 0000000..3011d88
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/writer.h
@@ -0,0 +1,1516 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_FORMAT_ORC_WRITER_H_
+#define STORAGE_SRC_STORAGE_FORMAT_ORC_WRITER_H_
+
+#include <algorithm>
+#include <list>
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/common/vector/list-vector.h"
+#include "dbcommon/common/vector/struct-vector.h"
+#include "dbcommon/common/vector/variable-length-vector.h"
+#include "dbcommon/utils/comp/compressor.h"
+#include "dbcommon/utils/macro.h"
+#include "dbcommon/utils/string-util.h"
+
+#include "storage/common/bloom-filter.h"
+#include "storage/format/orc/byte-rle.h"
+#include "storage/format/orc/file-version.h"
+#include "storage/format/orc/orc-proto-definition.h"
+#include "storage/format/orc/output-stream.h"
+#include "storage/format/orc/seekable-output-stream.h"
+#include "storage/format/orc/string-dictionary.h"
+
+namespace orc {
+
+// Plain holder for the writer's tunables together with their default values.
+struct WriterOptionsPrivate {
+  FileVersion fileVersion;
+  uint64_t blockSize;
+  uint64_t stripeSize;
+  uint64_t compressionBlockSize;
+  double blockPaddingTolerance;
+  CompressionKind compKind;
+  RleVersion rleVersion;
+  double dictionaryKeySizeThreshold;
+  int64_t rowIndexStrideValue;
+  std::unique_ptr<std::vector<bool>> columnsToBloomFilter;
+  bool writeStats;
+
+  // Defaults, listed in member declaration order:
+  //  - blockSize: the block size (for example, the hdfs block size), used to
+  //    keep stripes from crossing block boundaries and for block padding;
+  //    128 MB by default.
+  //  - stripeSize: 64 MB by default.
+  //  - dictionaryKeySizeThreshold: if the number of distinct keys in a
+  //    dictionary is greater than this fraction of the total number of
+  //    non-null rows, dictionary encoding is turned off.
+  WriterOptionsPrivate()
+      : fileVersion(0, 11),
+        blockSize(1 << 27),
+        stripeSize(1 << 26),
+        compressionBlockSize(ORC_COMPRESSION_BLOCK_SIZE),
+        blockPaddingTolerance(0.05),
+        compKind(CompressionKind::CompressionKind_NONE),
+        rleVersion(RleVersion::RleVersion_2),
+        dictionaryKeySizeThreshold(0),
+        rowIndexStrideValue(DEFAULT_NUMBER_TUPLES_PER_BATCH * 32),
+        columnsToBloomFilter(nullptr),
+        writeStats(true) {}
+};
+
+// User-facing configuration of the ORC writer: owns the schema and a
+// WriterOptionsPrivate with the tunables (block/stripe sizes, compression,
+// RLE version, dictionary threshold, bloom filter columns, statistics).
+class WriterOptions {
+ public:
+  WriterOptions()
+      : privateBits(
+            std::unique_ptr<WriterOptionsPrivate>(new WriterOptionsPrivate)) {}
+
+  // Schema accessors; the options object takes ownership of the schema.
+  orc::Type *getSchema() { return schema.get(); }
+  void setSchema(std::unique_ptr<orc::Type> t) { this->schema = std::move(t); }
+
+  uint64_t getBlockSize() { return privateBits->blockSize; }
+  void setBlockSize(uint64_t blockSize) { privateBits->blockSize = blockSize; }
+
+  uint64_t getStripeSize() { return privateBits->stripeSize; }
+
+  double getPaddingTolerance() { return privateBits->blockPaddingTolerance; }
+
+  // Selects the compression codec by name, case-insensitively. Accepts
+  // "none", "lz4" and "snappy"; anything else is reported as unsupported.
+  void setCompressionKind(const std::string &kindStr) {
+    std::string kind = dbcommon::StringUtil::lower(kindStr);
+    if (kind == "none")
+      privateBits->compKind = CompressionKind::CompressionKind_NONE;
+    else if (kind == "lz4")
+      privateBits->compKind = CompressionKind::CompressionKind_LZ4;
+    else if (kind == "snappy")
+      privateBits->compKind = CompressionKind::CompressionKind_SNAPPY;
+    else
+      LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+                "compression kind %s not supported", kindStr.c_str());
+  }
+
+  CompressionKind getCompressionKind() { return privateBits->compKind; }
+
+  // Selects the RLE version by name. Accepts exactly "v0", "v1" and "v2"
+  // (case-sensitive); anything else is reported as unsupported.
+  void setRleVersion(const std::string &versionStr) {
+    if (versionStr == "v0")
+      privateBits->rleVersion = RleVersion::RleVersion_0;
+    else if (versionStr == "v1")
+      privateBits->rleVersion = RleVersion::RleVersion_1;
+    else if (versionStr == "v2")
+      privateBits->rleVersion = RleVersion::RleVersion_2;
+    else
+      LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "RleVersion %s not supported",
+                versionStr.c_str());
+  }
+
+  void setDictKeySizeThreshold(double threshold) {
+    privateBits->dictionaryKeySizeThreshold = threshold;
+  }
+
+  double getDictKeySizeThreshold() {
+    return privateBits->dictionaryKeySizeThreshold;
+  }
+
+  // Marks the given top-level columns (indices into the schema's subtypes)
+  // as bloom-filtered. The flags are stored per ORC column id. The schema
+  // must have been set beforehand. `tableSize` is currently unused.
+  void setColumnsToBloomFilter(const std::vector<int> &columnsToBloomFilter,
+                               int32_t tableSize) {
+    if (!schema) {
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR,
+                "schema must be set before choosing bloom filter columns");
+    }
+    privateBits->columnsToBloomFilter.reset(
+        new std::vector<bool>(schema->getMaximumColumnId() + 1, false));
+
+    for (size_t i = 0; i < columnsToBloomFilter.size(); ++i) {
+      const orc::Type *subType = schema->getSubtype(columnsToBloomFilter[i]);
+      (*privateBits->columnsToBloomFilter)[subType->getColumnId()] = true;
+    }
+  }
+
+  // Returns the per-column-id bloom filter flags, or nullptr if none were
+  // configured.
+  const std::vector<bool> *getColumnsToBloomFilter() {
+    return privateBits->columnsToBloomFilter.get();
+  }
+
+  void setWriteStats(bool writeStats) { privateBits->writeStats = writeStats; }
+
+  bool getWriteStats() { return privateBits->writeStats; }
+
+  RleVersion getRleVersion() { return privateBits->rleVersion; }
+
+  // Maps the configured compression kind onto its protobuf counterpart.
+  // Kinds other than NONE, SNAPPY and LZ4 are reported as unsupported.
+  orc::proto::CompressionKind getProtoCompressionKind() {
+    switch (privateBits->compKind) {
+      case orc::CompressionKind::CompressionKind_NONE:
+        return orc::proto::CompressionKind::NONE;
+      case orc::CompressionKind::CompressionKind_SNAPPY:
+        return orc::proto::CompressionKind::SNAPPY;
+      case orc::CompressionKind::CompressionKind_LZ4:
+        return orc::proto::CompressionKind::LZ4;
+      default:
+        // Cast explicitly so the argument matches the %d conversion.
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+                  "compression kind %d not supported",
+                  static_cast<int>(privateBits->compKind));
+    }
+  }
+
+  int64_t getRowIndexStride() { return privateBits->rowIndexStrideValue; }
+
+  uint64_t getCompressionBlockSize() {
+    return privateBits->compressionBlockSize;
+  }
+
+  FileVersion getFileVersion() { return privateBits->fileVersion; }
+
+ private:
+  std::unique_ptr<orc::Type> schema;
+  std::unique_ptr<WriterOptionsPrivate> privateBits;
+};
+
+class ColumnWriter {
+ public:
+  // Sets up the null-bit encoder and the stripe/file statistics for `type`.
+  // If the writer options mark this column id for bloom filtering, the bloom
+  // filter and its block compressor are created as well.
+  ColumnWriter(const orc::Type *type, WriterOptions *options)
+      : bitWriter(createBooleanRleEncoderImpl(options->getCompressionKind())),
+        stripeColStats(createColumnStatistics(type)),
+        fileColStats(createColumnStatistics(type)),
+        type(type),
+        version(options->getRleVersion()),
+        createBloomFilter(false),
+        writeStatsOn(options->getWriteStats()) {
+    const std::vector<bool> *bloomColumns = options->getColumnsToBloomFilter();
+    if (bloomColumns && (*bloomColumns)[this->getColumnId()]) {
+      createBloomFilter = true;
+      bloomFilter.reset(new storage::BloomFilter(options->getRowIndexStride()));
+      bloomFilterWriter = createBlockCompressor(options->getCompressionKind());
+    }
+  }
+
+  virtual ~ColumnWriter() {}
+
+  // Records the null information of `vector` for this column: feeds the
+  // present bits to the bit writer and updates the total and not-null row
+  // counters.
+  virtual void writeVector(dbcommon::Vector *vector) {
+    const uint64_t sz = vector->getNumOfRows();
+    if (vector->hasNullValue()) {
+      // NOTE(review): getReverseBools() presumably yields the inverted null
+      // flags, i.e. the "present" bits — confirm.
+      std::unique_ptr<dbcommon::ByteBuffer> buf =
+          vector->getNullBuffer()->getReverseBools();
+      bitWriter->write(buf->data(), sz, nullptr);
+
+      // gather statistics; the index has the same type as the row count so
+      // the comparison is not mixed-sign and cannot overflow
+      const bool *nulls = vector->getNullBuffer()->getBools();
+      uint64_t cnt = 0;
+      for (uint64_t i = 0; i < sz; ++i) {
+        if (!nulls[i]) ++cnt;
+      }
+      notNullCnt += cnt;
+    } else {
+      bitWriter->write(nullptr, sz, nullptr);
+      notNullCnt += sz;
+    }
+    totalCnt += sz;
+  }
+
+  // Flushes this column's per-stripe state to `out` and registers the
+  // written streams in `stripeFooter`:
+  //  1. the bloom filter index, if enabled and non-empty;
+  //  2. the PRESENT stream, only if the stripe contained nulls;
+  //  3. the column statistics (merged into the file statistics and appended
+  //     to `pb`), if statistics are enabled.
+  // All per-stripe state is reset afterwards.
+  virtual void writeStripe(proto::StripeFooter *stripeFooter,
+                           proto::StripeStatistics *pb, OutputStream *out) {
+    assert(stripeFooter != nullptr && out != nullptr);
+
+    const bool hasBloomEntries =
+        createBloomFilter && bloomFilterIndexProto.bloomfilter_size() > 0;
+    if (hasBloomEntries) {
+      std::string serialized = bloomFilterIndexProto.SerializeAsString();
+      bloomFilterWriter->write(serialized.c_str(), serialized.length());
+      bloomFilterWriter->flushToStream(out);
+      ::orc::proto::Stream *bfStream = stripeFooter->add_streams();
+      bfStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_BLOOM_FILTER);
+      bfStream->set_column(this->getColumnId());
+      bfStream->set_length(bloomFilterWriter->getStreamSize());
+    }
+
+    const bool hasNulls = totalCnt != notNullCnt;
+    if (hasNulls) {
+      bitWriter->flushToStream(out);
+      ::orc::proto::Stream *presentStream = stripeFooter->add_streams();
+      presentStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_PRESENT);
+      presentStream->set_column(this->getColumnId());
+      presentStream->set_length(bitWriter->getStreamSize());
+    }
+
+    if (writeStatsOn) {
+      stripeColStats->increment(notNullCnt);
+      if (!hasNulls) stripeColStats->unsetNull();
+      fileColStats->merge(*stripeColStats);
+      stripeColStats->serialize(pb->add_colstats());
+    }
+
+    reset();
+  }
+
+ // Takes ownership of `writer` and appends it to the list of child writers.
+ void addChildWriter(std::unique_ptr<ColumnWriter> writer) {
+   childWriters.emplace_back(std::move(writer));
+ }
+
+ // Returns a non-owning pointer to the child writer stored at `index`.
+ // The index is not bounds-checked.
+ ColumnWriter *getChildWriter(uint32_t index) {
+   const std::unique_ptr<ColumnWriter> &child = childWriters[index];
+   return child.get();
+ }
+
+ // Returns the column id reported by this writer's ORC type.
+ uint32_t getColumnId() {
+   const uint32_t columnId = type->getColumnId();
+   return columnId;
+ }
+
+ // Returns the space estimate reported by the PRESENT bit writer. Derived
+ // writers add the estimate of their own streams on top of this value.
+ virtual uint64_t getEstimatedSpaceNeeded() {
+   const uint64_t presentBytes = bitWriter->getEstimatedSpaceNeeded();
+   return presentBytes;
+ }
+
+ // Appends the current content of the bloom filter as a new entry of the
+ // bloom filter index and then clears the filter. Does nothing when bloom
+ // filters are disabled for this column.
+ virtual void addBloomFilterEntry() {
+   if (!createBloomFilter) return;
+
+   proto::BloomFilter *entry = bloomFilterIndexProto.add_bloomfilter();
+   entry->set_numhashfunctions(bloomFilter->getNumHashFunctions());
+   for (int word = 0; word < bloomFilter->size(); ++word) {
+     entry->add_bitset(bloomFilter->getBitSet()[word]);
+   }
+   bloomFilter->reset();
+ }
+
+ // Appends a statistics entry to `fileFooter` and serializes the file-level
+ // statistics of this column into it.
+ virtual void writeFileStatistics(proto::Footer *fileFooter) {
+   auto *statsSlot = fileFooter->add_statistics();
+   fileColStats->serialize(statsSlot);
+ }
+
+ // Default column encoding: DIRECT. Writers whose encoding depends on the
+ // RLE version override this.
+ virtual orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() {
+   const orc::proto::ColumnEncoding_Kind kind =
+       orc::proto::ColumnEncoding_Kind_DIRECT;
+   return kind;
+ }
+
+ // Clears the per-stripe state of the base writer: the row counters, the
+ // PRESENT bit writer and, when bloom filters are enabled, the filter, its
+ // output stream and the collected index entries.
+ void reset() {
+   notNullCnt = 0;
+   totalCnt = 0;
+
+   // clear the buffers
+   bitWriter->reset();
+
+   if (!createBloomFilter) return;
+
+   bloomFilter->reset();
+   bloomFilterWriter->reset();
+   bloomFilterIndexProto.Clear();
+ }
+
+ // Adds the protobuf type description of this column to `footer`. Every
+ // concrete writer has to implement this.
+ virtual void addTypeToFooter(proto::Footer *footer) = 0;
+
+ // Factory declared here and defined elsewhere.
+ // NOTE(review): presumably selects the concrete writer from the kind of
+ // `type` and builds child writers for nested types - confirm against the
+ // definition.
+ static std::unique_ptr<ColumnWriter> buildColumnWriter(
+ const orc::Type *type, WriterOptions *options);
+
+ protected:
+ // Writers of nested columns, owned by this writer.
+ std::vector<std::unique_ptr<ColumnWriter>> childWriters;
+ // Encoder backing the PRESENT (null flag) stream.
+ std::unique_ptr<BooleanRleEncoderImpl> bitWriter;
+ // RLE version taken from the writer options; derived writers map it to a
+ // column encoding kind.
+ RleVersion version;
+ // ORC type of this column (not owned).
+ const orc::Type *type = nullptr;
+
+ // Statistics of the stripe currently being written.
+ std::unique_ptr<ColumnStatisticsImpl> stripeColStats;
+ // Statistics of the whole file; stripe statistics are merged into it in
+ // writeStripe().
+ std::unique_ptr<ColumnStatisticsImpl> fileColStats;
+ // Rows without a null flag seen since the last reset().
+ uint64_t notNullCnt = 0;
+ // All rows seen since the last reset().
+ uint64_t totalCnt = 0;
+ // Bloom filter state; only allocated when createBloomFilter was true at
+ // construction time.
+ storage::BloomFilter::uptr bloomFilter;
+ // Output stream the serialized bloom filter index is written through.
+ std::unique_ptr<SeekableOutputStream> bloomFilterWriter;
+ // Whether a bloom filter is maintained for this column.
+ bool createBloomFilter;
+ // Bloom filter entries collected for the current stripe.
+ proto::BloomFilterIndex bloomFilterIndexProto;
+ // Whether column statistics are gathered and written.
+ bool writeStatsOn;
+};
+
+// Column writer for BOOLEAN columns. Values go through a boolean RLE encoder
+// into the DATA stream; bloom filters are always disabled for this writer.
+class BooleanColumnWriter : public ColumnWriter {
+ public:
+  BooleanColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options),
+        booleanWriter(
+            createBooleanRleEncoderImpl(options->getCompressionKind())) {
+    createBloomFilter = false;
+  }
+  virtual ~BooleanColumnWriter() {}
+
+  // Encodes the values of `vector` and, when statistics are enabled, feeds
+  // every non-null value to the boolean stripe statistics.
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    const bool hasNulls = vector->hasNullValue();
+    std::unique_ptr<dbcommon::ByteBuffer> presentFlags;
+    if (hasNulls) presentFlags = vector->getNullBuffer()->getReverseBools();
+    booleanWriter->write(const_cast<char *>(vector->getValue()),
+                         vector->getNumOfRows(),
+                         (hasNulls ? presentFlags->data() : nullptr));
+
+    if (!writeStatsOn) return;
+
+    const uint64_t rowCount = vector->getNumOfRows();
+    const bool *values = reinterpret_cast<const bool *>(vector->getValue());
+    BooleanColumnStatisticsImpl *boolStats =
+        dynamic_cast<BooleanColumnStatisticsImpl *>(stripeColStats.get());
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      boolStats->updateBoolean(values[row]);
+    }
+  }
+
+  // Writes the base streams, then the DATA stream and the column encoding,
+  // and finally clears the encoder and the stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    booleanWriter->flushToStream(out);
+
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(booleanWriter->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    booleanWriter->reset();
+    stripeColStats->reset();
+  }
+
+  // Estimate for the DATA stream plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    const uint64_t dataBytes = booleanWriter->getEstimatedSpaceNeeded();
+    return dataBytes + ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Registers this column as BOOLEAN in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_BOOLEAN);
+  }
+
+ protected:
+  // Encoder backing the DATA stream.
+  std::unique_ptr<BooleanRleEncoderImpl> booleanWriter;
+};
+
+// Column writer for BYTE columns. Values go through a byte RLE coder into the
+// DATA stream.
+class ByteColumnWriter : public ColumnWriter {
+ public:
+  explicit ByteColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    rleCoder = createByteRleCoder(options->getCompressionKind());
+  }
+  virtual ~ByteColumnWriter() {}
+
+  // Encodes the values of `vector`. When statistics are enabled, every
+  // non-null value updates the integer stripe statistics and, if a bloom
+  // filter is kept, is added to it.
+  // NOTE(review): the bloom filter is only fed inside the statistics branch,
+  // so it stays empty when statistics are off - confirm this is intended.
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    const bool hasNulls = vector->hasNullValue();
+    std::unique_ptr<dbcommon::ByteBuffer> presentFlags;
+    if (hasNulls) presentFlags = vector->getNullBuffer()->getReverseBools();
+    rleCoder->write(const_cast<char *>(vector->getValue()),
+                    vector->getNumOfRows(),
+                    (hasNulls ? presentFlags->data() : nullptr));
+
+    if (!writeStatsOn) return;
+
+    const uint64_t rowCount = vector->getNumOfRows();
+    const int8_t *values =
+        reinterpret_cast<const int8_t *>(vector->getValue());
+    IntegerColumnStatisticsImpl *intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl *>(stripeColStats.get());
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      intStats->updateInteger(values[row]);
+      if (createBloomFilter) bloomFilter->addInt(values[row]);
+    }
+  }
+
+  // Writes the base streams, then the DATA stream and the column encoding,
+  // and finally clears the coder and the stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    rleCoder->flushToStream(out);
+
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(rleCoder->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    rleCoder->reset();
+    stripeColStats->reset();
+  }
+
+  // Estimate for the DATA stream plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    const uint64_t dataBytes = rleCoder->getEstimatedSpaceNeeded();
+    return dataBytes + ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Registers this column as BYTE in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_BYTE);
+  }
+
+ protected:
+  // Coder backing the DATA stream.
+  std::unique_ptr<ByteRleCoder> rleCoder;
+};
+
+// Shared implementation of the integer column writers. A derived class
+// creates `rleCoder` for its value width, implements addTypeToFooter() and
+// supplies writeStats().
+class IntegerColumnWriter : public ColumnWriter {
+ public:
+  explicit IntegerColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {}
+  virtual ~IntegerColumnWriter() {}
+
+  // Encodes the values of `vector` and, when statistics are enabled, hands
+  // the vector to writeStats().
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    const bool hasNulls = vector->hasNullValue();
+    std::unique_ptr<dbcommon::ByteBuffer> presentFlags;
+    if (hasNulls) presentFlags = vector->getNullBuffer()->getReverseBools();
+    rleCoder->write(const_cast<char *>(vector->getValue()),
+                    vector->getNumOfRows(),
+                    (hasNulls ? presentFlags->data() : nullptr));
+
+    if (!writeStatsOn) return;
+    writeStats(vector);
+  }
+
+  // Writes the base streams, then the DATA stream and the column encoding,
+  // and finally clears the coder and the stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    rleCoder->flushToStream(out);
+
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(rleCoder->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    rleCoder->reset();
+    stripeColStats->reset();
+  }
+
+  // Estimate for the DATA stream plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    const uint64_t dataBytes = rleCoder->getEstimatedSpaceNeeded();
+    return dataBytes + ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Maps the configured RLE version to the matching DIRECT encoding kind; an
+  // unknown version is reported through LOG_ERROR.
+  orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override {
+    switch (version) {
+      case RleVersion::RleVersion_0: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+      }
+      case RleVersion::RleVersion_1: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT;
+      }
+      case RleVersion::RleVersion_2: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+      }
+      default: {
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+                  version);
+      }
+    }
+  }
+
+ protected:
+  // Coder backing the DATA stream; created by the derived class.
+  std::unique_ptr<RleCoder> rleCoder;
+
+ private:
+  // Updates stripe statistics (and the bloom filter, if any) from `vector`.
+  virtual void writeStats(dbcommon::Vector *vector) = 0;
+};
+
+// Integer column writer for INT columns; values are read as int32_t.
+class IntColumnWriter : public IntegerColumnWriter {
+ public:
+  explicit IntColumnWriter(const orc::Type *type, WriterOptions *options)
+      : IntegerColumnWriter(type, options) {
+    rleCoder = createRleCoder(true, options->getRleVersion(), ORCTypeKind::INT,
+                              options->getCompressionKind());
+  }
+  virtual ~IntColumnWriter() {}
+
+  // Registers this column as INT in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_INT);
+  }
+
+ private:
+  // Feeds every non-null value to the integer stripe statistics and, if a
+  // bloom filter is kept, to the bloom filter.
+  void writeStats(dbcommon::Vector *vector) override {
+    const uint64_t rowCount = vector->getNumOfRows();
+    const int32_t *values =
+        reinterpret_cast<const int32_t *>(vector->getValue());
+    IntegerColumnStatisticsImpl *intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl *>(stripeColStats.get());
+
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      intStats->updateInteger(values[row]);
+      if (createBloomFilter) bloomFilter->addInt(values[row]);
+    }
+  }
+};
+
+// Integer column writer for LONG columns; values are read as int64_t.
+class LongColumnWriter : public IntegerColumnWriter {
+ public:
+  explicit LongColumnWriter(const orc::Type *type, WriterOptions *options)
+      : IntegerColumnWriter(type, options) {
+    rleCoder = createRleCoder(true, options->getRleVersion(),
+                              ORCTypeKind::LONG,
+                              options->getCompressionKind());
+  }
+  virtual ~LongColumnWriter() {}
+
+  // Registers this column as LONG in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_LONG);
+  }
+
+ private:
+  // Feeds every non-null value to the integer stripe statistics and, if a
+  // bloom filter is kept, to the bloom filter.
+  void writeStats(dbcommon::Vector *vector) override {
+    const uint64_t rowCount = vector->getNumOfRows();
+    const int64_t *values =
+        reinterpret_cast<const int64_t *>(vector->getValue());
+    IntegerColumnStatisticsImpl *intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl *>(stripeColStats.get());
+
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      intStats->updateInteger(values[row]);
+      if (createBloomFilter) bloomFilter->addInt(values[row]);
+    }
+  }
+};
+
+// Integer column writer for SHORT columns; values are read as int16_t.
+class ShortColumnWriter : public IntegerColumnWriter {
+ public:
+  explicit ShortColumnWriter(const orc::Type *type, WriterOptions *options)
+      : IntegerColumnWriter(type, options) {
+    rleCoder = createRleCoder(true, options->getRleVersion(),
+                              ORCTypeKind::SHORT,
+                              options->getCompressionKind());
+  }
+  virtual ~ShortColumnWriter() {}
+
+  // Registers this column as SHORT in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_SHORT);
+  }
+
+ private:
+  // Feeds every non-null value to the integer stripe statistics and, if a
+  // bloom filter is kept, to the bloom filter.
+  void writeStats(dbcommon::Vector *vector) override {
+    const uint64_t rowCount = vector->getNumOfRows();
+    const int16_t *values =
+        reinterpret_cast<const int16_t *>(vector->getValue());
+    IntegerColumnStatisticsImpl *intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl *>(stripeColStats.get());
+
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      intStats->updateInteger(values[row]);
+      if (createBloomFilter) bloomFilter->addInt(values[row]);
+    }
+  }
+};
+
+// Column writer shared by FLOAT and DOUBLE columns. Non-null values are
+// written one by one to a block-compressed stream; the value width follows
+// the kind of the ORC type captured at construction time.
+class DoubleColumnWriter : public ColumnWriter {
+ public:
+  explicit DoubleColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    bufferedStream = createBlockCompressor(options->getCompressionKind());
+    myType = type->getKind();
+  }
+
+  virtual ~DoubleColumnWriter() {}
+
+  // Writes every non-null value of `vector`, updating the double statistics
+  // (when enabled) and the bloom filter (when kept) along the way.
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    const uint64_t rowCount = vector->getNumOfRows();
+    const char *raw = vector->getValue();
+    // Only resolved when statistics are on; never dereferenced otherwise.
+    DoubleColumnStatisticsImpl *doubleStats =
+        writeStatsOn
+            ? dynamic_cast<DoubleColumnStatisticsImpl *>(stripeColStats.get())
+            : nullptr;
+
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+
+      if (myType == orc::ORCTypeKind::DOUBLE) {
+        const double value = reinterpret_cast<const double *>(raw)[row];
+        bufferedStream->write<double>(value);
+        if (writeStatsOn) doubleStats->updateDouble(value);
+        if (createBloomFilter) bloomFilter->addDouble(value);
+      } else {
+        assert(myType == orc::ORCTypeKind::FLOAT);
+        const float value = reinterpret_cast<const float *>(raw)[row];
+        bufferedStream->write<float>(value);
+        if (writeStatsOn) doubleStats->updateDouble(value);
+        if (createBloomFilter) bloomFilter->addDouble(value);
+      }
+    }
+  }
+
+  // Writes the base streams, then the DATA stream and the column encoding,
+  // and finally clears the value stream and the stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    bufferedStream->flushToStream(out);
+
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(bufferedStream->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    bufferedStream->reset();
+    stripeColStats->reset();
+  }
+
+  // Estimate for the DATA stream plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    const uint64_t dataBytes = bufferedStream->getEstimatedSpaceNeeded();
+    return dataBytes + ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Registers this column as FLOAT or DOUBLE in the file footer; any other
+  // kind is reported through LOG_ERROR.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    if (type->getKind() == orc::ORCTypeKind::FLOAT) {
+      footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_FLOAT);
+      return;
+    }
+    if (type->getKind() == orc::ORCTypeKind::DOUBLE) {
+      footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_DOUBLE);
+      return;
+    }
+    LOG_ERROR(ERRCODE_INTERNAL_ERROR, "unknown double/float type %d",
+              type->getKind());
+  }
+
+ protected:
+  // Block-compressed stream backing the DATA stream.
+  std::unique_ptr<SeekableOutputStream> bufferedStream;
+  // Kind of the ORC type this writer was built for.
+  orc::ORCTypeKind myType;
+};
+
+// Column writer for string columns. Only the interface is declared here; the
+// member functions are defined elsewhere.
+// NOTE(review): the members suggest that values are buffered for dictionary
+// encoding first and that the writer may fall back to direct encoding after
+// checkDictionaryEncoding() - confirm against the definitions.
+class StringColumnWriter : public ColumnWriter {
+ public:
+ explicit StringColumnWriter(const orc::Type *type, WriterOptions *options);
+
+ void writeVector(dbcommon::Vector *vector) override;
+
+ void writeStripe(proto::StripeFooter *stripeFooter,
+ proto::StripeStatistics *pb, OutputStream *out) override;
+
+ uint64_t getEstimatedSpaceNeeded() override;
+
+ void addTypeToFooter(proto::Footer *footer) override;
+
+ orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override;
+
+ void addBloomFilterEntry() override;
+
+ private:
+ void checkDictionaryEncoding();
+
+ void flushDictionary();
+
+ // Hides the non-virtual ColumnWriter::reset(); calls made through a
+ // ColumnWriter pointer or from ColumnWriter itself do not reach this
+ // overload.
+ void reset();
+
+ private:
+ double dictionaryKeySizeThreshold;
+ bool doneDictionaryCheck = false;
+ bool useDictionaryEncoding = true;
+
+ // State used for dictionary encoding
+ std::unique_ptr<RleCoder> rowsRleCoder;
+ std::unique_ptr<SeekableOutputStream> dictDataBufferedStream;
+ StringDictionary dictionary;
+ std::vector<uint32_t> rows;
+ // State used for direct encoding
+ std::unique_ptr<SeekableOutputStream> directDataBufferedStream;
+ // Shared by both encodings
+ std::unique_ptr<RleCoder> lengthRleCoder;
+};
+
+// Column writer for binary columns. Only the interface is declared here; the
+// member functions are defined elsewhere.
+class BinaryColumnWriter : public ColumnWriter {
+ public:
+ explicit BinaryColumnWriter(const orc::Type *type, WriterOptions *options);
+ void writeVector(dbcommon::Vector *vector) override;
+ void writeStripe(proto::StripeFooter *stripeFooter,
+ proto::StripeStatistics *pb, OutputStream *out) override;
+ uint64_t getEstimatedSpaceNeeded() override;
+ void addTypeToFooter(proto::Footer *footer) override;
+
+ private:
+ orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override;
+ // NOTE(review): presumably the raw bytes of the values - confirm.
+ std::unique_ptr<SeekableOutputStream> dataBufferedStream;
+ // NOTE(review): presumably the per-value byte lengths - confirm.
+ std::unique_ptr<RleCoder> lengthRleCoder;
+};
+
+// Column writer for DATE columns. Values are read as int32_t and encoded with
+// a signed INT RLE coder.
+class DateColumnWriter : public ColumnWriter {
+ public:
+  explicit DateColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    rleCoder = createRleCoder(true, options->getRleVersion(), ORCTypeKind::INT,
+                              options->getCompressionKind());
+  }
+  virtual ~DateColumnWriter() {}
+
+  // Encodes the values of `vector` and, when statistics are enabled, updates
+  // them through writeStats().
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    const bool hasNulls = vector->hasNullValue();
+    std::unique_ptr<dbcommon::ByteBuffer> presentFlags;
+    if (hasNulls) presentFlags = vector->getNullBuffer()->getReverseBools();
+    rleCoder->write(const_cast<char *>(vector->getValue()),
+                    vector->getNumOfRows(),
+                    (hasNulls ? presentFlags->data() : nullptr));
+
+    if (!writeStatsOn) return;
+    writeStats(vector);
+  }
+
+  // Writes the base streams, then the DATA stream and the column encoding,
+  // and finally clears the coder and the stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    rleCoder->flushToStream(out);
+
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(rleCoder->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    rleCoder->reset();
+    stripeColStats->reset();
+  }
+
+  // Estimate for the DATA stream plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    const uint64_t dataBytes = rleCoder->getEstimatedSpaceNeeded();
+    return dataBytes + ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Maps the configured RLE version to the matching DIRECT encoding kind; an
+  // unknown version is reported through LOG_ERROR.
+  orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override {
+    switch (version) {
+      case RleVersion::RleVersion_0: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+      }
+      case RleVersion::RleVersion_1: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT;
+      }
+      case RleVersion::RleVersion_2: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+      }
+      default: {
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+                  version);
+      }
+    }
+  }
+
+  // Registers this column as DATE in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_DATE);
+  }
+
+ protected:
+  // Coder backing the DATA stream.
+  std::unique_ptr<RleCoder> rleCoder;
+
+ private:
+  // Feeds every non-null value to the date stripe statistics and, if a bloom
+  // filter is kept, to the bloom filter.
+  void writeStats(dbcommon::Vector *vector) {
+    const uint64_t rowCount = vector->getNumOfRows();
+    const int32_t *values =
+        reinterpret_cast<const int32_t *>(vector->getValue());
+    DateColumnStatisticsImpl *dateStats =
+        dynamic_cast<DateColumnStatisticsImpl *>(stripeColStats.get());
+
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      dateStats->updateDate(values[row]);
+      if (createBloomFilter) bloomFilter->addInt(values[row]);
+    }
+  }
+};
+
+// Column writer for TIME columns. Values are read as int64_t and encoded with
+// a signed LONG RLE coder.
+class TimeColumnWriter : public ColumnWriter {
+ public:
+  explicit TimeColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    rleCoder = createRleCoder(true, options->getRleVersion(),
+                              ORCTypeKind::LONG,
+                              options->getCompressionKind());
+  }
+  virtual ~TimeColumnWriter() {}
+
+  // Encodes the values of `vector` and, when statistics are enabled, updates
+  // them through writeStats().
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    const bool hasNulls = vector->hasNullValue();
+    std::unique_ptr<dbcommon::ByteBuffer> presentFlags;
+    if (hasNulls) presentFlags = vector->getNullBuffer()->getReverseBools();
+    rleCoder->write(const_cast<char *>(vector->getValue()),
+                    vector->getNumOfRows(),
+                    (hasNulls ? presentFlags->data() : nullptr));
+
+    if (!writeStatsOn) return;
+    writeStats(vector);
+  }
+
+  // Writes the base streams, then the DATA stream and the column encoding,
+  // and finally clears the coder and the stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    rleCoder->flushToStream(out);
+
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(rleCoder->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    rleCoder->reset();
+    stripeColStats->reset();
+  }
+
+  // Estimate for the DATA stream plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    const uint64_t dataBytes = rleCoder->getEstimatedSpaceNeeded();
+    return dataBytes + ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Maps the configured RLE version to the matching DIRECT encoding kind; an
+  // unknown version is reported through LOG_ERROR.
+  orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override {
+    switch (version) {
+      case RleVersion::RleVersion_0: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+      }
+      case RleVersion::RleVersion_1: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT;
+      }
+      case RleVersion::RleVersion_2: {
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+      }
+      default: {
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+                  version);
+      }
+    }
+  }
+
+  // Registers this column as TIME in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *footerType = footer->add_types();
+    footerType->set_kind(::orc::proto::Type_Kind::Type_Kind_TIME);
+  }
+
+ protected:
+  // Coder backing the DATA stream.
+  std::unique_ptr<RleCoder> rleCoder;
+
+ private:
+  // Feeds every non-null value to the integer stripe statistics and, if a
+  // bloom filter is kept, to the bloom filter.
+  void writeStats(dbcommon::Vector *vector) {
+    const uint64_t rowCount = vector->getNumOfRows();
+    const int64_t *values =
+        reinterpret_cast<const int64_t *>(vector->getValue());
+    IntegerColumnStatisticsImpl *intStats =
+        dynamic_cast<IntegerColumnStatisticsImpl *>(stripeColStats.get());
+
+    for (uint64_t row = 0; row < rowCount; ++row) {
+      if (vector->isNull(row)) continue;
+      intStats->updateInteger(values[row]);
+      if (createBloomFilter) bloomFilter->addInt(values[row]);
+    }
+  }
+};
+
+// Column writer for TIMESTAMP columns. Each value is split into a seconds
+// part (DATA stream, signed RLE) and a nanoseconds part (SECONDARY stream,
+// unsigned RLE).
+class TimestampColumnWriter : public ColumnWriter {
+ public:
+  explicit TimestampColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    secondsRleCoder =
+        createRleCoder(true, options->getRleVersion(), ORCTypeKind::LONG,
+                       options->getCompressionKind());
+    nanoRleCoder =
+        createRleCoder(false, options->getRleVersion(), ORCTypeKind::LONG,
+                       options->getCompressionKind());
+  }
+  virtual ~TimestampColumnWriter() {}
+
+  // Encodes the rows of `vector` one at a time. The seconds are shifted from
+  // the Unix epoch to the ORC timestamp epoch; the nanoseconds of non-null
+  // rows are packed by formatNanos().
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    int64_t *seconds =
+        reinterpret_cast<int64_t *>(const_cast<char *>(vector->getValue()));
+    int64_t *nanoseconds = reinterpret_cast<int64_t *>(
+        const_cast<char *>(vector->getNanoseconds()));
+
+    // `notnulls` stays nullptr when the batch contains no null at all.
+    const char *notnulls = nullptr;
+    std::unique_ptr<dbcommon::ByteBuffer> buf;
+    if (vector->hasNullValue()) {
+      buf = vector->getNullBuffer()->getReverseBools();
+      notnulls = reinterpret_cast<const char *>(buf->data());
+    }
+
+    // CAUTION: the select list is not taken into account yet
+    for (uint64_t i = 0; i < vector->getNumOfRows(); i++) {
+      int64_t second =
+          seconds[i] -
+          (ORC_TIMESTAMP_EPOCH_JDATE - UNIX_EPOCH_JDATE) * SECONDS_PER_DAY;
+      int64_t nano;
+      if (!notnulls || notnulls[i]) {
+        nano = formatNanos(nanoseconds[i]);
+        secondsRleCoder->write(&second, 1, nullptr);
+        nanoRleCoder->write(&nano, 1, nullptr);
+      } else {
+        // Null row: pass the address of this row's not-null flag so the
+        // coders can see that the row is null.
+        nano = nanoseconds[i];
+        secondsRleCoder->write(&second, 1, &notnulls[i]);
+        nanoRleCoder->write(&nano, 1, &notnulls[i]);
+      }
+    }
+
+    if (writeStatsOn) writeStats(vector);
+  }
+
+  // Writes the base streams, then the DATA (seconds) and SECONDARY (nanos)
+  // streams and the column encoding, and finally clears the coders and the
+  // stripe statistics.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    secondsRleCoder->flushToStream(out);
+    ::orc::proto::Stream *secondsStream = stripeFooter->add_streams();
+    secondsStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    secondsStream->set_column(this->getColumnId());
+    secondsStream->set_length(secondsRleCoder->getStreamSize());
+
+    nanoRleCoder->flushToStream(out);
+    ::orc::proto::Stream *nanoStream = stripeFooter->add_streams();
+    nanoStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_SECONDARY);
+    nanoStream->set_column(this->getColumnId());
+    nanoStream->set_length(nanoRleCoder->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    // clear the buffers
+    secondsRleCoder->reset();
+    nanoRleCoder->reset();
+
+    stripeColStats->reset();
+  }
+
+  // Estimate for both value streams plus the base writer's estimate.
+  uint64_t getEstimatedSpaceNeeded() override {
+    return secondsRleCoder->getEstimatedSpaceNeeded() +
+           nanoRleCoder->getEstimatedSpaceNeeded() +
+           ColumnWriter::getEstimatedSpaceNeeded();
+  }
+
+  // Maps the configured RLE version to the matching DIRECT encoding kind; an
+  // unknown version is reported through LOG_ERROR.
+  orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override {
+    switch (version) {
+      case RleVersion::RleVersion_0:
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+      case RleVersion::RleVersion_1:
+        return orc::proto::ColumnEncoding_Kind_DIRECT;
+      case RleVersion::RleVersion_2:
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+      default:
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+                  version);
+    }
+  }
+
+  // Registers this column as TIMESTAMP in the file footer.
+  void addTypeToFooter(proto::Footer *footer) override {
+    proto::Type *type = footer->add_types();
+    type->set_kind(::orc::proto::Type_Kind::Type_Kind_TIMESTAMP);
+  }
+
+ protected:
+  // Coder backing the DATA stream (seconds).
+  std::unique_ptr<RleCoder> secondsRleCoder;
+  // Coder backing the SECONDARY stream (nanoseconds).
+  std::unique_ptr<RleCoder> nanoRleCoder;
+
+ private:
+  // Converts every non-null row to milliseconds and feeds it to the
+  // timestamp stripe statistics and, if a bloom filter is kept, to the bloom
+  // filter.
+  void writeStats(dbcommon::Vector *vector) {
+    uint64_t numValues = vector->getNumOfRows();
+    const char *vals = vector->getValue();
+    const char *nanos = vector->getNanoseconds();
+
+    for (uint64_t i = 0; i < numValues; i++) {
+      if (!vector->isNull(i)) {
+        int64_t milli = reinterpret_cast<const int64_t *>(vals)[i] * 1000 +
+                        reinterpret_cast<const int64_t *>(nanos)[i] / 1000000;
+        // Correction applied to negative values that carry a positive
+        // nanosecond part.
+        if (milli < 0 && reinterpret_cast<const int64_t *>(nanos)[i] > 0)
+          milli -= 1000;
+        dynamic_cast<TimestampColumnStatisticsImpl *>(stripeColStats.get())
+            ->updateTimestamp(milli);
+        if (createBloomFilter) bloomFilter->addInt(milli);
+      }
+    }
+  }
+
+  // Packs a nanosecond value for the SECONDARY stream.
+  //  - 0 is returned as 0.
+  //  - A value that is not a multiple of 100 is stored as `nanos << 3`.
+  //  - Otherwise trailing decimal zeros are divided away (two at first, then
+  //    one at a time while the counter is below 7) and the counter is stored
+  //    in the low three bits of the shifted remainder.
+  uint64_t formatNanos(uint64_t nanos) {
+    if (nanos == 0) {
+      return 0;
+    } else if (nanos % 100 != 0) {
+      return nanos << 3;
+    } else {
+      nanos /= 100;
+      int trailingZeros = 1;
+      while (nanos % 10 == 0 && trailingZeros < 7) {
+        nanos /= 10;
+        trailingZeros += 1;
+      }
+      return nanos << 3 | trailingZeros;
+    }
+  }
+};
+
+// Column writer for STRUCT columns. It has no value stream of its own and
+// forwards the work to one child writer per field. Bloom filters are always
+// disabled for the struct itself.
+class StructColumnWriter : public ColumnWriter {
+ public:
+  explicit StructColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    createBloomFilter = false;
+  }
+
+  virtual ~StructColumnWriter() {}
+
+  // Hands the i-th child vector of `vector` to the i-th child writer. The
+  // base writeVector() is not invoked, so no null flags are recorded for the
+  // struct itself.
+  void writeVector(dbcommon::Vector *vector) override {
+    dbcommon::StructVector *structVector =
+        dynamic_cast<dbcommon::StructVector *>(vector);
+    assert(structVector != nullptr);
+
+    const uint64_t childCount = structVector->getChildSize();
+    for (uint64_t idx = 0; idx < childCount; ++idx) {
+      childWriters[idx]->writeVector(structVector->getChildVector(idx));
+    }
+  }
+
+  // Writes the base part, records the struct's own column encoding and then
+  // lets every child writer emit its stripe data.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    assert(stripeFooter != nullptr);
+
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    // StructColumnReader needs this, must be ColumnEncoding_Kind_DIRECT
+    stripeFooter->add_columns()->set_kind(
+        ::orc::proto::ColumnEncoding_Kind_DIRECT);
+
+    // By default no PRESENT stream is added for the struct type. If one is
+    // ever added, its buffer has to be cleared here as well.
+
+    // Write the child stripes
+    for (std::unique_ptr<ColumnWriter> &child : childWriters) {
+      child->writeStripe(stripeFooter, pb, out);
+    }
+  }
+
+  // Sum of the children's estimates; the PRESENT stream of the struct itself
+  // is not counted.
+  uint64_t getEstimatedSpaceNeeded() override {
+    uint64_t total = 0;
+    for (std::unique_ptr<ColumnWriter> &child : childWriters) {
+      total += child->getEstimatedSpaceNeeded();
+    }
+    return total;
+  }
+
+  // Registers the struct type with its field names and sub-type ids, then
+  // lets every child register its own type.
+  void addTypeToFooter(proto::Footer *footer) override {
+    ::orc::proto::Type *structType = footer->add_types();
+    structType->set_kind(::orc::proto::Type_Kind::Type_Kind_STRUCT);
+
+    const uint64_t childCount = childWriters.size();
+    for (uint64_t idx = 0; idx < childCount; ++idx) {
+      structType->add_fieldnames(this->type->getFieldName(idx));
+      structType->add_subtypes(this->getChildWriter(idx)->getColumnId());
+    }
+
+    for (uint64_t idx = 0; idx < childCount; ++idx) {
+      this->getChildWriter(idx)->addTypeToFooter(footer);
+    }
+  }
+
+  // Forwards the request to every child writer.
+  void addBloomFilterEntry() override {
+    for (std::unique_ptr<ColumnWriter> &child : childWriters) {
+      child->addBloomFilterEntry();
+    }
+  }
+
+  // Writes the struct's own file statistics followed by those of every
+  // child; does nothing when statistics are disabled.
+  void writeFileStatistics(proto::Footer *fileFooter) override {
+    if (!writeStatsOn) return;
+
+    ColumnWriter::writeFileStatistics(fileFooter);
+    for (std::unique_ptr<ColumnWriter> &child : childWriters) {
+      child->writeFileStatistics(fileFooter);
+    }
+  }
+};
+
+// Column writer for LIST columns. The number of elements of every row goes
+// into a LENGTH stream (unsigned RLE); the elements themselves are written by
+// the first child writer.
+class ListColumnWriter : public ColumnWriter {
+ public:
+  explicit ListColumnWriter(const orc::Type *type, WriterOptions *options)
+      : ColumnWriter(type, options) {
+    lengthRleCoder =
+        createRleCoder(false, options->getRleVersion(), ORCTypeKind::LONG,
+                       options->getCompressionKind());
+  }
+
+  virtual ~ListColumnWriter() {}
+
+  // Encodes the per-row element counts of `vector` and forwards its child
+  // vector to the child writer.
+  //
+  // @param vector Must be a dbcommon::ListVector.
+  void writeVector(dbcommon::Vector *vector) override {
+    ColumnWriter::writeVector(vector);
+
+    dbcommon::ListVector *lvector =
+        dynamic_cast<dbcommon::ListVector *>(vector);
+    assert(lvector != nullptr);
+    const uint64_t *offsets = lvector->getOffsets();
+
+    // The null buffer is only consulted when the batch really contains
+    // nulls; otherwise the coder receives nullptr, as in the other writers.
+    std::unique_ptr<dbcommon::ByteBuffer> buf;
+    const char *notnulls = nullptr;
+    if (vector->hasNullValue()) {
+      buf = vector->getNullBuffer()->getReverseBools();
+      notnulls = reinterpret_cast<const char *>(buf->data());
+    }
+
+    const uint64_t numRows = vector->getNumOfRows();
+    for (uint64_t i = 0; i < numRows; i++) {
+      // Element count of row i, derived from two consecutive offsets.
+      uint64_t currentLength = offsets[i + 1] - offsets[i];
+      lengthRleCoder->write(&currentLength, 1,
+                            notnulls ? &notnulls[i] : nullptr);
+      if (createBloomFilter) bloomFilter->addInt(currentLength);
+    }
+    childWriters[0]->writeVector(vector->getChildVector(0));
+  }
+
+  // Writes the base streams, the LENGTH stream and the column encoding, lets
+  // the child writers emit their stripe data and clears the per-stripe
+  // state.
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override {
+    assert(stripeFooter != nullptr);
+
+    ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+    lengthRleCoder->flushToStream(out);
+    ::orc::proto::Stream *lengthStream = stripeFooter->add_streams();
+    lengthStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_LENGTH);
+    lengthStream->set_column(this->getColumnId());
+    lengthStream->set_length(lengthRleCoder->getStreamSize());
+
+    stripeFooter->add_columns()->set_kind(getProtoColumnEncoding());
+
+    // Write the child stripes
+    uint64_t nChild = childWriters.size();
+    for (uint64_t i = 0; i < nChild; i++) {
+      childWriters[i]->writeStripe(stripeFooter, pb, out);
+    }
+
+    lengthRleCoder->reset();
+
+    // Start the next stripe with fresh statistics, as the leaf writers do,
+    // so that counts of earlier stripes are not carried over.
+    // NOTE(review): assumes stripeColStats->reset() is valid for the list
+    // statistics object - confirm.
+    stripeColStats->reset();
+  }
+
+  // Estimate for the LENGTH stream plus the children's estimates.
+  uint64_t getEstimatedSpaceNeeded() override {
+    uint64_t sz = lengthRleCoder->getEstimatedSpaceNeeded();
+    uint64_t nChild = childWriters.size();
+    for (uint64_t i = 0; i < nChild; i++) {
+      sz += childWriters[i]->getEstimatedSpaceNeeded();
+    }
+    return sz;
+  }
+
+  // Maps the configured RLE version to the matching DIRECT encoding kind; an
+  // unknown version is reported through LOG_ERROR.
+  orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override {
+    switch (version) {
+      case RleVersion::RleVersion_0:
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+      case RleVersion::RleVersion_1:
+        return orc::proto::ColumnEncoding_Kind_DIRECT;
+      case RleVersion::RleVersion_2:
+        return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+      default:
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+                  version);
+    }
+  }
+
+  // Registers the list type with its field names and sub-type ids, then lets
+  // every child register its own type.
+  void addTypeToFooter(proto::Footer *footer) override {
+    ::orc::proto::Type *typ = footer->add_types();
+    typ->set_kind(::orc::proto::Type_Kind::Type_Kind_LIST);
+
+    uint64_t nChild = childWriters.size();
+    for (uint64_t i = 0; i < nChild; i++) {
+      typ->add_fieldnames(this->type->getFieldName(i));
+      typ->add_subtypes(this->getChildWriter(i)->getColumnId());
+    }
+
+    for (uint64_t i = 0; i < nChild; i++) {
+      this->getChildWriter(i)->addTypeToFooter(footer);
+    }
+  }
+
+  // Writes the list's own file statistics followed by those of every child;
+  // does nothing when statistics are disabled.
+  void writeFileStatistics(proto::Footer *fileFooter) override {
+    if (!writeStatsOn) return;
+    ColumnWriter::writeFileStatistics(fileFooter);
+    uint64_t nChild = childWriters.size();
+    for (uint64_t i = 0; i < nChild; i++) {
+      childWriters[i]->writeFileStatistics(fileFooter);
+    }
+  }
+
+ private:
+  // Coder backing the LENGTH stream.
+  std::unique_ptr<RleCoder> lengthRleCoder;
+};
+
+// Column writer for decimal columns whose values are handled as 64-bit
+// integers. Apart from the zigzag helper, the member functions are defined
+// elsewhere.
+class Decimal64ColumnWriter : public ColumnWriter {
+ public:
+  // NOTE(review): presumably the largest precisions that fit a 64-bit and a
+  // 128-bit decimal representation - confirm against the callers.
+  static const uint32_t MAX_PRECISION_64 = 18;
+  static const uint32_t MAX_PRECISION_128 = 38;
+  Decimal64ColumnWriter(const orc::Type *type, WriterOptions *options);
+  void writeVector(dbcommon::Vector *vector) override;
+  void writeStripe(proto::StripeFooter *stripeFooter,
+                   proto::StripeStatistics *pb, OutputStream *out) override;
+  uint64_t getEstimatedSpaceNeeded() override;
+  void addTypeToFooter(proto::Footer *footer) override;
+
+ private:
+  orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override;
+  void writeInt64(int64_t value);
+  // Zigzag-encodes `value`: 0, -1, 1, -2, ... map to 0, 1, 2, 3, ...
+  // The left shift is done on the unsigned representation because shifting
+  // a negative (or overflowing) signed value is undefined before C++20. The
+  // resulting bit pattern is the same as that of the signed expression.
+  uint64_t zigzagEncodeInt64(int64_t value) {
+    return (static_cast<uint64_t>(value) << 1) ^
+           static_cast<uint64_t>(value >> 63);
+  }
+  std::unique_ptr<SeekableOutputStream> dataBufferedStream;
+  std::unique_ptr<RleCoder> scaleRleCoder;
+  int32_t precision;
+  int32_t scale;
+};
+
+// Column writer for decimal columns whose values are handled as
+// orc::Int128. Only the interface is declared here; the member functions are
+// defined elsewhere.
+class Decimal128ColumnWriter : public ColumnWriter {
+ public:
+ Decimal128ColumnWriter(const orc::Type *type, WriterOptions *options);
+ void writeVector(dbcommon::Vector *vector) override;
+ void writeStripe(proto::StripeFooter *stripeFooter,
+ proto::StripeStatistics *pb, OutputStream *out) override;
+ uint64_t getEstimatedSpaceNeeded() override;
+ void addTypeToFooter(proto::Footer *footer) override;
+
+ private:
+ orc::proto::ColumnEncoding_Kind getProtoColumnEncoding() override;
+ void writeInt128(orc::Int128 *value);
+ // NOTE(review): takes a pointer, so the value is presumably encoded in
+ // place - confirm against the definition.
+ void zigzagEncodeInt128(orc::Int128 *value);
+ // NOTE(review): presumably holds the encoded decimal values - confirm.
+ std::unique_ptr<SeekableOutputStream> dataBufferedStream;
+ // NOTE(review): presumably holds the per-value scale - confirm.
+ std::unique_ptr<RleCoder> scaleRleCoder;
+ int32_t precision;
+ int32_t scale;
+};
+
+// Abstract interface for producing ORC files.
+class Writer {
+ public:
+  Writer() = default;
+  virtual ~Writer() = default;
+
+  // Get the schema for this writer.
+  // @return the file schema
+  virtual Type *getSchema() = 0;
+
+  // Attach an arbitrary key/value pair to the ORC file. This may be called
+  // at any point until the writer is closed. When the same key is passed a
+  // second time, the second value replaces the first.
+  // @param key A key to label the data with.
+  // @param value The contents of the metadata.
+  // @return Void
+  virtual void addUserMetadata(const std::string &key,
+                               const std::string &value) = 0;
+
+  // Add a tuple batch to the ORC file.
+  // @param tb The tuple batch
+  // @return Void
+  virtual void addTupleBatch(dbcommon::TupleBatch *tb) = 0;
+
+  // Begin the write and write the header.
+  // @return Void
+  virtual void begin() = 0;
+
+  // Flush all of the buffers and close the file. No methods on this writer
+  // should be called afterwards.
+  // @return Void
+  virtual void end() = 0;
+
+  // Return the deserialized (raw) data size. The raw data size is computed
+  // while the file footer is written, so the value is only available after
+  // the writer has been closed.
+  // @return raw data size
+  virtual uint64_t getRawDataSize() = 0;
+
+  // Return the number of rows in the file. The row count is updated when
+  // stripes are flushed; call this after closing the writer to get an
+  // accurate count.
+  // @return Row count
+  virtual uint64_t getNumberOfRows() = 0;
+
+  // Write an intermediate footer to the file such that, if the file were
+  // truncated to the returned offset, it would still be a valid ORC file.
+  // @return the offset that would be a valid end location for an ORC file
+  virtual uint64_t writeIntermediateFooter() = 0;
+
+  // Fast stripe append. This interface is used to merge ORC files quickly:
+  // the file being merged passes each stripe in binary form together with
+  // its stripe information and stripe statistics. After appending the last
+  // stripe of a file, use appendUserMetadata() to append any user metadata.
+  // @param stripe Stripe as byte array
+  // @param offset Offset within byte array
+  // @param length Length of stripe within byte array
+  // @param stripeInfo Stripe information
+  // @param stripeStatistics Stripe statistics (Protobuf objects can be
+  // merged directly)
+  // @return Void
+  virtual void appendStripe(
+      char *stripe, uint32_t offset, uint32_t length,
+      const orc::StripeInformation &stripeInfo,
+      const orc::proto::StripeStatistics &stripeStatistics) = 0;
+
+  // When fast stripe append is used for merging ORC stripes, this interface
+  // must be used to merge any user metadata after the last stripe of a file
+  // has been appended.
+  // @param userMetadata User metadata
+  // @return Void
+  virtual void appendUserMetadata(
+      const std::list<orc::proto::UserMetadataItem> &userMetadata) = 0;
+};
+
+// Writer implementation that feeds tuple batches to a column writer and
+// serializes stripes, file metadata, file footer and postscript to an
+// OutputStream. Several Writer operations are not implemented; they report
+// ERRCODE_FEATURE_NOT_SUPPORTED through LOG_ERROR.
+class WriterImpl : public Writer {
+ public:
+  // Takes ownership of `stream`. `options` provides the schema, the stripe
+  // size and the compression kind; the top-level schema type is asserted to
+  // be a STRUCT.
+  WriterImpl(std::unique_ptr<OutputStream> stream, WriterOptions *options)
+      : outStream(std::move(stream)),
+        options(options),
+        adjustedStripeSize(options->getStripeSize()) {
+    orc::Type *schema = options->getSchema();
+    assert(schema->getKind() == orc::ORCTypeKind::STRUCT);
+
+    stripeFooter.reset(new proto::StripeFooter());
+    fileFooter.reset(new proto::Footer);
+    fileMetadata.reset(new proto::Metadata);
+    postscript.reset(new proto::PostScript);
+
+    // No column writer is created here; the pointer starts out empty.
+    columnWriter.reset();
+
+    compressor = createBlockCompressor(options->getCompressionKind());
+  }
+
+  virtual ~WriterImpl() {}
+
+  // Returns the schema stored in the writer options.
+  Type *getSchema() override { return options->getSchema(); }
+
+  // Not implemented: always reports ERRCODE_FEATURE_NOT_SUPPORTED.
+  void addUserMetadata(const std::string &key,
+                       const std::string &value) override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+              "WriterImpl::addUserMetadata not implemented");
+  }
+
+  // Buffers `tb`, then flushes the current stripe and starts a new one once
+  // the column writer's estimated space (times `scale`) exceeds the stripe
+  // size.
+  void addTupleBatch(dbcommon::TupleBatch *tb) override {
+    writeTupleBatch(tb);
+    const bool stripeFull =
+        columnWriter->getEstimatedSpaceNeeded() * scale > adjustedStripeSize;
+    if (!stripeFull) return;
+    writeCurrentStrip();
+    startNewStripe();
+  }
+
+  // Resets the row counter of the current stripe.
+  void begin() override { numRowsInCurrentStripe = 0; }
+
+  // Flushes the pending stripe (if it holds rows), writes file metadata,
+  // footer and postscript, and closes the output stream.
+  void end() override {
+    // write the last stripe to disk
+    if (numRowsInCurrentStripe > 0) {
+      writeCurrentStrip();
+    }
+
+    writeFileMetadata();
+    const uint64_t bodyLength = outStream->getPosition() - HEADER_LENGTH;
+    writeFileFooter(bodyLength);
+    writePostScript();
+
+    // need to close, otherwise, another following reader might not
+    // get the latest update for this write.
+    outStream->close();
+  }
+
+ private:
+  void writeTupleBatch(dbcommon::TupleBatch *tb);
+
+  void startNewStripe();
+
+  void completeStripInfo(uint64_t stripFooterLen);
+
+  void writeCurrentStrip();
+
+  void writeHeader();
+
+  uint64_t writeStripeFooter();
+
+  void writeFileFooter(uint64_t bodyLength);
+
+  void writeFileMetadata();
+
+  uint64_t getStripeDataLength(proto::StripeFooter *footer);
+
+  void writePostScript();
+
+  // The overrides below are not implemented. Each one only reports
+  // ERRCODE_FEATURE_NOT_SUPPORTED.
+  // NOTE(review): the value-returning ones have no return statement and so
+  // rely on LOG_ERROR not returning -- confirm.
+  uint64_t getRawDataSize() override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+              "WriterImpl::getRawDataSize not implemented");
+  }
+
+  uint64_t getNumberOfRows() override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+              "WriterImpl::getNumberOfRows not implemented");
+  }
+
+  uint64_t writeIntermediateFooter() override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+              "WriterImpl::writeIntermediateFooter not implemented");
+  }
+
+  void appendStripe(
+      char *stripe, uint32_t offset, uint32_t length,
+      const orc::StripeInformation &stripeInfo,
+      const orc::proto::StripeStatistics &stripeStatistics) override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+              "WriterImpl::appendStripe not implemented");
+  }
+
+  void appendUserMetadata(
+      const std::list<orc::proto::UserMetadataItem> &userMetadata) override {
+    LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED,
+              "WriterImpl::appendUserMetadata not implemented");
+  }
+
+  void padStripe();
+
+ private:
+  // Destination of the file; owned by the writer.
+  std::unique_ptr<OutputStream> outStream;
+  // Not assigned anywhere in this class definition.
+  std::unique_ptr<SeekableOutputStream> sfos;
+
+  // Supplied by the caller; never released by this class definition.
+  WriterOptions *options;
+
+  // Protobuf messages assembled while writing.
+  std::unique_ptr<proto::StripeFooter> stripeFooter;
+  std::unique_ptr<proto::Footer> fileFooter;
+  uint64_t fileFooterLen = 0;
+  std::unique_ptr<proto::Metadata> fileMetadata;
+  uint64_t fileMetadataLen = 0;
+  std::unique_ptr<proto::PostScript> postscript;
+
+  std::unique_ptr<ColumnWriter> columnWriter;
+
+  std::unique_ptr<orc::SeekableOutputStream> compressor;
+
+  // Bookkeeping for the stripe being assembled.
+  ::orc::proto::StripeInformation *currentStripe = nullptr;
+  uint64_t stripeStart = 0;
+  uint64_t numRowsInCurrentStripe = 0;
+  uint64_t totalRows = 0;
+  uint64_t adjustedStripeSize = 0;
+  // Multiplier applied to the space estimate in addTupleBatch().
+  double scale = 1;
+
+  const char *magicId = "ORC";
+  const uint32_t HEADER_LENGTH = 3;
+  const WriterId writerId = WriterId::ORC_CPP_WRITER;
+};
+
+// Create a writer that emits an ORC file to the given stream.
+// @param stream The stream to write to; ownership passes to the writer
+// @param options The options for writing the file
+// @return the newly created writer
+std::unique_ptr<Writer> createWriter(std::unique_ptr<OutputStream> stream,
+ WriterOptions *options);
+
+} // end of namespace orc
+
+#endif // STORAGE_SRC_STORAGE_FORMAT_ORC_WRITER_H_
diff --git a/depends/storage/src/storage/format/orc/writer/binary-column-writer.cc b/depends/storage/src/storage/format/orc/writer/binary-column-writer.cc
new file mode 100644
index 0000000..3bc91e6
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/writer/binary-column-writer.cc
@@ -0,0 +1,101 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/writer.h"
+namespace orc {
+// Creates the two buffers of a BINARY column: an RLE coder that receives the
+// value lengths and a block-compressed stream that receives the raw bytes.
+// Both use the compression kind configured in `options`.
+BinaryColumnWriter::BinaryColumnWriter(const orc::Type *type,
+                                       WriterOptions *options)
+    : ColumnWriter(type, options) {
+  const auto compression = options->getCompressionKind();
+  lengthRleCoder = createRleCoder(false, options->getRleVersion(),
+                                  ORCTypeKind::LONG, compression);
+  dataBufferedStream = createBlockCompressor(compression);
+}
+// Appends all rows of `vector` to this column's buffers.
+//
+// ColumnWriter::writeVector() runs first. The bytes of every non-null row are
+// then copied to the data stream, and the lengths of all rows are handed to
+// the length RLE coder together with the not-null mask.
+//
+// Statistics and the bloom filter are updated for non-null rows only. The
+// value pointer and length stored in a NULL slot carry no meaningful data, so
+// using them would distort the statistics and could read through an invalid
+// pointer. This matches what StringColumnWriter::writeVector() does.
+//
+// @param vector The values to write
+void BinaryColumnWriter::writeVector(dbcommon::Vector *vector) {
+  ColumnWriter::writeVector(vector);
+
+  // `buf` owns the reversed null bitmap; it must outlive every use of
+  // `notNull`, which points into it.
+  const char *notNull = nullptr;
+  std::unique_ptr<dbcommon::ByteBuffer> buf;
+  if (vector->hasNullValue()) {
+    buf = vector->getNullBuffer()->getReverseBools();
+    notNull = reinterpret_cast<const char *>(buf->data());
+  }
+
+  const uint64_t numValues = vector->getNumOfRows();
+  const char **vals = vector->getValPtrs();
+  const uint64_t *lens = vector->getLengths();
+  BinaryColumnStatisticsImpl *stats =
+      dynamic_cast<BinaryColumnStatisticsImpl *>(stripeColStats.get());
+
+  for (uint64_t i = 0; i < numValues; i++) {
+    if (notNull && !notNull[i]) continue;  // NULL row: nothing to record
+    dataBufferedStream->write(vals[i], lens[i]);
+    stats->update(lens[i]);
+    if (createBloomFilter) bloomFilter->addString(vals[i], lens[i]);
+  }
+  lengthRleCoder->write(const_cast<uint64_t *>(lens), numValues, notNull);
+}
+// Flushes the buffered stripe data of this column to `out` and records the
+// LENGTH and DATA streams plus the column encoding in `stripeFooter`. The
+// buffers are emptied afterwards so the next stripe starts clean.
+void BinaryColumnWriter::writeStripe(proto::StripeFooter *stripeFooter,
+                                     proto::StripeStatistics *pb,
+                                     OutputStream *out) {
+  ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+  // Adds one stream entry for this column to the stripe footer.
+  auto registerStream = [&](::orc::proto::Stream_Kind kind, uint64_t length) {
+    ::orc::proto::Stream *entry = stripeFooter->add_streams();
+    entry->set_kind(kind);
+    entry->set_column(this->getColumnId());
+    entry->set_length(length);
+  };
+
+  // Lengths first, then the raw bytes.
+  lengthRleCoder->flushToStream(out);
+  registerStream(::orc::proto::Stream_Kind::Stream_Kind_LENGTH,
+                 lengthRleCoder->getStreamSize());
+
+  dataBufferedStream->flushToStream(out);
+  registerStream(::orc::proto::Stream_Kind::Stream_Kind_DATA,
+                 dataBufferedStream->getStreamSize());
+
+  ::orc::proto::ColumnEncoding *encoding = stripeFooter->add_columns();
+  encoding->set_kind(getProtoColumnEncoding());
+
+  // clear the buffers
+  dataBufferedStream->reset();
+  lengthRleCoder->reset();
+}
+// Sums the space estimates of the base class, the length coder and the data
+// stream.
+uint64_t BinaryColumnWriter::getEstimatedSpaceNeeded() {
+  const uint64_t dataBytes = dataBufferedStream->getEstimatedSpaceNeeded();
+  const uint64_t lengthBytes = lengthRleCoder->getEstimatedSpaceNeeded();
+  const uint64_t baseBytes = ColumnWriter::getEstimatedSpaceNeeded();
+  return dataBytes + lengthBytes + baseBytes;
+}
+// Appends a BINARY type entry to the file footer.
+void BinaryColumnWriter::addTypeToFooter(proto::Footer *footer) {
+  footer->add_types()->set_kind(::orc::proto::Type_Kind::Type_Kind_BINARY);
+}
+
+// Maps the configured RLE version to the matching DIRECT column encoding.
+// Any other version is reported through LOG_ERROR.
+orc::proto::ColumnEncoding_Kind BinaryColumnWriter::getProtoColumnEncoding() {
+  if (version == RleVersion::RleVersion_0) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+  }
+  if (version == RleVersion::RleVersion_1) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT;
+  }
+  if (version == RleVersion::RleVersion_2) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+  }
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+            version);
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/writer/decimal-column-writer.cc b/depends/storage/src/storage/format/orc/writer/decimal-column-writer.cc
new file mode 100644
index 0000000..c463767
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/writer/decimal-column-writer.cc
@@ -0,0 +1,295 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "storage/format/orc/writer.h"
+
+#include "dbcommon/common/vector/decimal-vector.h"
+namespace orc {
+// Creates the data stream and the scale RLE coder, and caches the precision
+// and scale declared by `type`.
+Decimal64ColumnWriter::Decimal64ColumnWriter(const orc::Type *type,
+                                             WriterOptions *options)
+    : ColumnWriter(type, options) {
+  const auto compression = options->getCompressionKind();
+  dataBufferedStream = createBlockCompressor(compression);
+  scaleRleCoder = createRleCoder(true, options->getRleVersion(),
+                                 ORCTypeKind::LONG, compression);
+  // Both members are int32_t, so convert to that width directly.
+  precision = static_cast<int32_t>(type->getPrecision());
+  scale = static_cast<int32_t>(type->getScale());
+}
+
+// Writes one vector of DECIMAL values into this column's buffers.
+//
+//  1. Rows whose text is "nan" (any letter case) are turned into NULLs on
+//     the vector itself.
+//  2. ColumnWriter::writeVector() runs on the (possibly updated) vector.
+//  3. Every non-null row has its decimal point removed, is parsed with
+//     std::stoll and appended to the data stream as a zigzag varint. The
+//     stripe statistics and, when enabled, the bloom filter are updated.
+//  4. The column scale is passed to the scale RLE coder once per row.
+//
+// @param vector The values to write; NaN rows are marked NULL in place
+void Decimal64ColumnWriter::writeVector(dbcommon::Vector *vector) {
+  char *notNull = nullptr;
+  std::unique_ptr<dbcommon::ByteBuffer> buf;
+  if (vector->hasNullValue()) {
+    buf = vector->getNullBuffer()->getReverseBools();
+    notNull = const_cast<char *>(buf->data());
+  }
+
+  // A work around to support "insert select" expression
+  if (vector->getTypeKind() == dbcommon::DECIMALNEWID) {
+    dbcommon::DecimalVector *dvec =
+        dynamic_cast<dbcommon::DecimalVector *>(vector);
+    dvec->computeRawValueAndValPtrs();
+  }
+
+  const uint64_t numValues = vector->getNumOfRows();
+  const char **vals = vector->getValPtrs();
+  const uint64_t *lens = vector->getLengths();
+
+  // Case-insensitive test for the token "nan". It works on the raw bytes, so
+  // no temporary string is needed, and the cast to unsigned char keeps
+  // tolower() well-defined for bytes with the high bit set.
+  auto isNaN = [](const char *text, uint64_t length) -> bool {
+    if (length != 3) return false;
+    const char expected[] = {'n', 'a', 'n'};
+    for (int k = 0; k < 3; ++k) {
+      if (tolower(static_cast<unsigned char>(text[k])) != expected[k]) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  for (uint64_t i = 0; i < numValues; i++) {
+    if ((!notNull || notNull[i]) && isNaN(vals[i], lens[i])) {
+      vector->setNull(i);
+      vector->setHasNull(true);
+    }
+  }
+
+  ColumnWriter::writeVector(vector);
+  // Refresh the mask: the NaN pass above may have added NULLs.
+  if (vector->hasNullValue()) {
+    buf = vector->getNullBuffer()->getReverseBools();
+    notNull = const_cast<char *>(buf->data());
+  }
+  std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
+
+  DecimalColumnStatisticsImpl *stats =
+      dynamic_cast<DecimalColumnStatisticsImpl *>(stripeColStats.get());
+  for (uint64_t i = 0; i < numValues; i++) {
+    if (notNull && !notNull[i]) continue;
+    std::string str(vals[i], lens[i]);
+    if (scale != 0) {
+      // Drop the character that precedes the last `scale` digits.
+      // NOTE(review): assumes the text always carries exactly `scale`
+      // fractional digits after a single decimal point -- confirm.
+      size_t len = str.length();
+      str = str.substr(0, len - scale - 1) + str.substr(len - scale);
+    }
+    int64_t value = std::stoll(str);
+    writeInt64(value);
+    stats->updateDecimal(Decimal(value, scale));
+    if (createBloomFilter) bloomFilter->addString(vals[i], lens[i]);
+  }
+  scaleRleCoder->write(scales.data(), numValues, notNull);
+}
+
+// Flushes the buffered values and scales of the current stripe to `out`,
+// registers the DATA and SECONDARY streams plus the column encoding in
+// `stripeFooter`, and clears the buffers and the stripe statistics.
+void Decimal64ColumnWriter::writeStripe(proto::StripeFooter *stripeFooter,
+                                        proto::StripeStatistics *pb,
+                                        OutputStream *out) {
+  ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+  // Unscaled values.
+  dataBufferedStream->flushToStream(out);
+  ::orc::proto::Stream *valueStream = stripeFooter->add_streams();
+  valueStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+  valueStream->set_column(this->getColumnId());
+  valueStream->set_length(dataBufferedStream->getStreamSize());
+
+  // Per-row scales.
+  scaleRleCoder->flushToStream(out);
+  ::orc::proto::Stream *scaleStream = stripeFooter->add_streams();
+  scaleStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_SECONDARY);
+  scaleStream->set_column(this->getColumnId());
+  scaleStream->set_length(scaleRleCoder->getStreamSize());
+
+  ::orc::proto::ColumnEncoding *encoding = stripeFooter->add_columns();
+  encoding->set_kind(getProtoColumnEncoding());
+
+  // clear the buffers
+  dataBufferedStream->reset();
+  scaleRleCoder->reset();
+  stripeColStats->reset();
+}
+
+// Sums the space estimates of the data stream, the scale coder and the base
+// class.
+uint64_t Decimal64ColumnWriter::getEstimatedSpaceNeeded() {
+  const uint64_t dataBytes = dataBufferedStream->getEstimatedSpaceNeeded();
+  const uint64_t scaleBytes = scaleRleCoder->getEstimatedSpaceNeeded();
+  const uint64_t baseBytes = ColumnWriter::getEstimatedSpaceNeeded();
+  return dataBytes + scaleBytes + baseBytes;
+}
+
+// Appends a DECIMAL type entry carrying this column's precision and scale to
+// the file footer.
+void Decimal64ColumnWriter::addTypeToFooter(proto::Footer *footer) {
+  proto::Type *decimalType = footer->add_types();
+  decimalType->set_kind(::orc::proto::Type_Kind::Type_Kind_DECIMAL);
+  decimalType->set_scale(scale);
+  decimalType->set_precision(precision);
+}
+
+// Maps the configured RLE version to the matching DIRECT column encoding.
+// Any other version is reported through LOG_ERROR.
+orc::proto::ColumnEncoding_Kind
+Decimal64ColumnWriter::getProtoColumnEncoding() {
+  if (version == RleVersion::RleVersion_0) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+  }
+  if (version == RleVersion::RleVersion_1) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT;
+  }
+  if (version == RleVersion::RleVersion_2) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+  }
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+            version);
+}
+
+// Appends `value` to the data stream as a zigzag-encoded base-128 varint:
+// seven payload bits per byte, least-significant group first, with the top
+// bit set on every byte except the last.
+void Decimal64ColumnWriter::writeInt64(int64_t value) {
+  uint64_t remaining = zigzagEncodeInt64(value);
+  while ((remaining & ~0x7f) != 0) {
+    dataBufferedStream->writeByte(
+        static_cast<int8_t>(0x80 | (remaining & 0x7f)));
+    remaining >>= 7;
+  }
+  dataBufferedStream->writeByte(static_cast<int8_t>(remaining));
+}
+
+// Creates the data stream and the scale RLE coder, and caches the precision
+// and scale declared by `type`.
+Decimal128ColumnWriter::Decimal128ColumnWriter(const orc::Type *type,
+                                               WriterOptions *options)
+    : ColumnWriter(type, options) {
+  const auto compression = options->getCompressionKind();
+  dataBufferedStream = createBlockCompressor(compression);
+  scaleRleCoder = createRleCoder(true, options->getRleVersion(),
+                                 ORCTypeKind::LONG, compression);
+  precision = static_cast<int32_t>(type->getPrecision());
+  scale = static_cast<int32_t>(type->getScale());
+}
+
+// Writes one vector of DECIMAL values into this column's buffers.
+//
+//  1. Rows whose text is "nan" (any letter case) are turned into NULLs on
+//     the vector itself.
+//  2. ColumnWriter::writeVector() runs on the (possibly updated) vector.
+//  3. Every non-null row has its decimal point removed, is parsed into an
+//     orc::Int128 and appended to the data stream as a zigzag varint. The
+//     stripe statistics and, when enabled, the bloom filter are updated.
+//  4. The column scale is passed to the scale RLE coder once per row.
+//
+// @param vector The values to write; NaN rows are marked NULL in place
+void Decimal128ColumnWriter::writeVector(dbcommon::Vector *vector) {
+  char *notNull = nullptr;
+  std::unique_ptr<dbcommon::ByteBuffer> buf;
+  if (vector->hasNullValue()) {
+    buf = vector->getNullBuffer()->getReverseBools();
+    notNull = const_cast<char *>(buf->data());
+  }
+  // A work around to support "insert select" expression
+  if (vector->getTypeKind() == dbcommon::DECIMALNEWID) {
+    dbcommon::DecimalVector *dvec =
+        dynamic_cast<dbcommon::DecimalVector *>(vector);
+    dvec->computeRawValueAndValPtrs();
+  }
+
+  const uint64_t numValues = vector->getNumOfRows();
+  const char **vals = vector->getValPtrs();
+  const uint64_t *lens = vector->getLengths();
+
+  // Case-insensitive test for the token "nan". It works on the raw bytes, so
+  // no temporary string is needed, and the cast to unsigned char keeps
+  // tolower() well-defined for bytes with the high bit set.
+  auto isNaN = [](const char *text, uint64_t length) -> bool {
+    if (length != 3) return false;
+    const char expected[] = {'n', 'a', 'n'};
+    for (int k = 0; k < 3; ++k) {
+      if (tolower(static_cast<unsigned char>(text[k])) != expected[k]) {
+        return false;
+      }
+    }
+    return true;
+  };
+
+  for (uint64_t i = 0; i < numValues; i++) {
+    if ((!notNull || notNull[i]) && isNaN(vals[i], lens[i])) {
+      vector->setNull(i);
+      vector->setHasNull(true);
+    }
+  }
+
+  ColumnWriter::writeVector(vector);
+  // Refresh the mask: the NaN pass above may have added NULLs.
+  if (vector->hasNullValue()) {
+    buf = vector->getNullBuffer()->getReverseBools();
+    notNull = const_cast<char *>(buf->data());
+  }
+  std::vector<int64_t> scales(numValues, static_cast<int64_t>(scale));
+
+  DecimalColumnStatisticsImpl *stats =
+      dynamic_cast<DecimalColumnStatisticsImpl *>(stripeColStats.get());
+  for (uint64_t i = 0; i < numValues; i++) {
+    if (notNull && !notNull[i]) continue;
+    std::string str(vals[i], lens[i]);
+    if (scale != 0) {
+      // Drop the character that precedes the last `scale` digits.
+      // NOTE(review): assumes the text always carries exactly `scale`
+      // fractional digits after a single decimal point -- confirm.
+      size_t len = str.length();
+      str = str.substr(0, len - scale - 1) + str.substr(len - scale);
+    }
+    orc::Int128 value = orc::Int128(str);
+    // writeInt128() encodes its argument in place, so hand it a copy and
+    // keep `value` intact for the statistics instead of parsing twice.
+    orc::Int128 encoded = value;
+    writeInt128(&encoded);
+    stats->updateDecimal(Decimal(value, scale));
+    if (createBloomFilter) bloomFilter->addString(vals[i], lens[i]);
+  }
+  scaleRleCoder->write(scales.data(), numValues, notNull);
+}
+
+// Flushes the buffered values and scales of the current stripe to `out`,
+// registers the DATA and SECONDARY streams plus the column encoding in
+// `stripeFooter`, and clears the buffers and the stripe statistics.
+void Decimal128ColumnWriter::writeStripe(proto::StripeFooter *stripeFooter,
+                                         proto::StripeStatistics *pb,
+                                         OutputStream *out) {
+  ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+  // Unscaled values.
+  dataBufferedStream->flushToStream(out);
+  ::orc::proto::Stream *valueStream = stripeFooter->add_streams();
+  valueStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+  valueStream->set_column(this->getColumnId());
+  valueStream->set_length(dataBufferedStream->getStreamSize());
+
+  // Per-row scales.
+  scaleRleCoder->flushToStream(out);
+  ::orc::proto::Stream *scaleStream = stripeFooter->add_streams();
+  scaleStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_SECONDARY);
+  scaleStream->set_column(this->getColumnId());
+  scaleStream->set_length(scaleRleCoder->getStreamSize());
+
+  ::orc::proto::ColumnEncoding *encoding = stripeFooter->add_columns();
+  encoding->set_kind(getProtoColumnEncoding());
+
+  // clear the buffers
+  dataBufferedStream->reset();
+  scaleRleCoder->reset();
+  stripeColStats->reset();
+}
+
+// Sums the space estimates of the data stream, the scale coder and the base
+// class.
+uint64_t Decimal128ColumnWriter::getEstimatedSpaceNeeded() {
+  const uint64_t dataBytes = dataBufferedStream->getEstimatedSpaceNeeded();
+  const uint64_t scaleBytes = scaleRleCoder->getEstimatedSpaceNeeded();
+  const uint64_t baseBytes = ColumnWriter::getEstimatedSpaceNeeded();
+  return dataBytes + scaleBytes + baseBytes;
+}
+
+// Appends a DECIMAL type entry carrying this column's precision and scale to
+// the file footer.
+void Decimal128ColumnWriter::addTypeToFooter(proto::Footer *footer) {
+  proto::Type *decimalType = footer->add_types();
+  decimalType->set_kind(::orc::proto::Type_Kind::Type_Kind_DECIMAL);
+  decimalType->set_scale(scale);
+  decimalType->set_precision(precision);
+}
+
+// Maps the configured RLE version to the matching DIRECT column encoding.
+// Any other version is reported through LOG_ERROR.
+orc::proto::ColumnEncoding_Kind
+Decimal128ColumnWriter::getProtoColumnEncoding() {
+  if (version == RleVersion::RleVersion_0) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+  }
+  if (version == RleVersion::RleVersion_1) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT;
+  }
+  if (version == RleVersion::RleVersion_2) {
+    return orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+  }
+  LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+            version);
+}
+
+// Zigzag-encodes *value in place and appends it to the data stream as a
+// base-128 varint (seven payload bits per byte, least-significant group
+// first, top bit set on every byte except the last).
+//
+// The loop ends only when the WHOLE remaining value fits into seven bits.
+// Testing the low bits alone would stop too early for a value whose low bits
+// are below 0x80 while higher bits are still set, e.g. an encoded value of
+// 2^64, and would silently drop those bits.
+// NOTE(review): assumes getLowBits() returns only the low 64 bits -- confirm.
+//
+// @param value The value to write; modified by the call
+void Decimal128ColumnWriter::writeInt128(Int128 *value) {
+  zigzagEncodeInt128(value);
+  while (true) {
+    const auto low = value->getLowBits();
+    const bool lowFitsInSevenBits = (low & ~0x7f) == 0;
+    // A negative value here means the zigzag shift overflowed; for that case
+    // the previous low-bits-only stop condition is kept unchanged.
+    const bool lastByte =
+        lowFitsInSevenBits && ((*value < 0) || (*value < 0x80));
+    if (lastByte) {
+      dataBufferedStream->writeByte(static_cast<int8_t>(low));
+      return;
+    }
+    dataBufferedStream->writeByte(static_cast<int8_t>(0x80 | (low & 0x7f)));
+    *value >>= 7;
+  }
+}
+
+// Zigzag-encodes *value in place: the value is doubled, and a negative
+// result is negated and decremented by one.
+void Decimal128ColumnWriter::zigzagEncodeInt128(Int128 *value) {
+  Int128 &encoded = *value;
+  encoded <<= 1;
+  if (!(encoded < 0)) return;
+  encoded.negate();
+  encoded -= 1;
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/format/orc/writer/string-column-writer.cc b/depends/storage/src/storage/format/orc/writer/string-column-writer.cc
new file mode 100644
index 0000000..15089c5
--- /dev/null
+++ b/depends/storage/src/storage/format/orc/writer/string-column-writer.cc
@@ -0,0 +1,232 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "dbcommon/utils/string-util.h"
+#include "storage/format/orc/writer.h"
+namespace orc {
+
+// Creates the RLE coders (lengths, dictionary row ids) and the two data
+// streams (dictionary data, direct data) of a string column. A dictionary
+// key-size threshold of zero turns dictionary encoding off right away and
+// marks the dictionary check as done.
+StringColumnWriter::StringColumnWriter(const orc::Type *type,
+                                       WriterOptions *options)
+    : ColumnWriter(type, options),
+      dictionaryKeySizeThreshold(options->getDictKeySizeThreshold()) {
+  const auto rleVersion = options->getRleVersion();
+  const auto compression = options->getCompressionKind();
+
+  lengthRleCoder =
+      createRleCoder(false, rleVersion, ORCTypeKind::LONG, compression);
+  rowsRleCoder =
+      createRleCoder(false, rleVersion, ORCTypeKind::LONG, compression);
+  dictDataBufferedStream = createBlockCompressor(compression);
+  directDataBufferedStream = createBlockCompressor(compression);
+
+  rows.reserve(DEFAULT_NUMBER_TUPLES_PER_BATCH);
+
+  if (dictionaryKeySizeThreshold == 0) {
+    useDictionaryEncoding = false;
+    doneDictionaryCheck = true;
+  }
+}
+
+// Buffers all rows of `vector`. With dictionary encoding active, each
+// non-null value is added to the dictionary and the returned id is remembered
+// in `rows`. Otherwise the bytes go straight to the direct data stream and
+// the lengths to the length coder. Statistics and bloom filter see non-null
+// values only.
+void StringColumnWriter::writeVector(dbcommon::Vector *vector) {
+  ColumnWriter::writeVector(vector);
+
+  // `reversedNulls` owns the buffer that `notNull` points into.
+  std::unique_ptr<dbcommon::ByteBuffer> reversedNulls;
+  const char *notNull = nullptr;
+  if (vector->hasNullValue()) {
+    reversedNulls = vector->getNullBuffer()->getReverseBools();
+    notNull = reinterpret_cast<const char *>(reversedNulls->data());
+  }
+
+  const uint64_t rowCount = vector->getNumOfRows();
+  const char **values = vector->getValPtrs();
+  const uint64_t *lengths = vector->getLengths();
+  for (uint64_t row = 0; row < rowCount; ++row) {
+    if (notNull && !notNull[row]) continue;
+
+    const char *value = values[row];
+    const uint64_t length = lengths[row];
+    if (useDictionaryEncoding) {
+      rows.push_back(dictionary.add(value, length));
+    } else {
+      directDataBufferedStream->write(value, length);
+    }
+    if (writeStatsOn) {
+      dynamic_cast<StringColumnStatisticsImpl *>(stripeColStats.get())
+          ->updateString(value, length);
+    }
+    if (createBloomFilter) bloomFilter->addString(value, length);
+  }
+
+  if (!useDictionaryEncoding) {
+    lengthRleCoder->write(const_cast<uint64_t *>(vector->getLengths()),
+                          vector->getNumOfRows(), notNull);
+  }
+}
+
+// Finishes the current stripe for this column. The encoding decision is made
+// (if still pending), buffered dictionary rows are flushed, and the streams
+// are written to `out` and registered in `stripeFooter`:
+//   dictionary encoding: LENGTH, DICTIONARY_DATA, DATA (row ids)
+//   direct encoding:     LENGTH, DATA
+// Finally the column encoding is recorded and all buffers are reset.
+void StringColumnWriter::writeStripe(proto::StripeFooter *stripeFooter,
+                                     proto::StripeStatistics *pb,
+                                     OutputStream *out) {
+  checkDictionaryEncoding();
+
+  if (rows.size() > 0) flushDictionary();
+
+  ColumnWriter::writeStripe(stripeFooter, pb, out);
+
+  // The LENGTH stream comes first in both encodings.
+  lengthRleCoder->flushToStream(out);
+  ::orc::proto::Stream *lenStream = stripeFooter->add_streams();
+  lenStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_LENGTH);
+  lenStream->set_column(this->getColumnId());
+  lenStream->set_length(lengthRleCoder->getStreamSize());
+
+  if (useDictionaryEncoding) {
+    dictDataBufferedStream->flushToStream(out);
+    ::orc::proto::Stream *dictStream = stripeFooter->add_streams();
+    dictStream->set_kind(
+        ::orc::proto::Stream_Kind::Stream_Kind_DICTIONARY_DATA);
+    dictStream->set_column(this->getColumnId());
+    dictStream->set_length(dictDataBufferedStream->getStreamSize());
+
+    rowsRleCoder->flushToStream(out);
+    ::orc::proto::Stream *rowsStream = stripeFooter->add_streams();
+    rowsStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    rowsStream->set_column(this->getColumnId());
+    rowsStream->set_length(rowsRleCoder->getStreamSize());
+  } else {
+    directDataBufferedStream->flushToStream(out);
+    ::orc::proto::Stream *dataStream = stripeFooter->add_streams();
+    dataStream->set_kind(::orc::proto::Stream_Kind::Stream_Kind_DATA);
+    dataStream->set_column(this->getColumnId());
+    dataStream->set_length(directDataBufferedStream->getStreamSize());
+  }
+
+  ::orc::proto::ColumnEncoding *encodingBuilder = stripeFooter->add_columns();
+  if (useDictionaryEncoding) {
+    encodingBuilder->set_dictionarysize(dictionary.size());
+  }
+  encodingBuilder->set_kind(getProtoColumnEncoding());
+
+  // clear the buffers
+  reset();
+}
+
+// Estimates the buffered size of this column. With dictionary encoding the
+// estimate is four bytes per buffered row id plus the dictionary size;
+// otherwise it is the direct data stream plus the length coder. The base
+// class estimate is added in both cases.
+uint64_t StringColumnWriter::getEstimatedSpaceNeeded() {
+  if (!useDictionaryEncoding) {
+    return directDataBufferedStream->getEstimatedSpaceNeeded() +
+           lengthRleCoder->getEstimatedSpaceNeeded() +
+           ColumnWriter::getEstimatedSpaceNeeded();
+  }
+  return 4 * rows.size() + dictionary.sizeInBytes() +
+         ColumnWriter::getEstimatedSpaceNeeded();
+}
+
+// Appends the type entry for this column to the file footer: STRING, or
+// VARCHAR / CHAR together with their maximum length. Any other kind is
+// reported through LOG_ERROR.
+void StringColumnWriter::addTypeToFooter(proto::Footer *footer) {
+  proto::Type *typ = footer->add_types();
+  switch (type->getKind()) {
+    case orc::ORCTypeKind::STRING:
+      typ->set_kind(::orc::proto::Type_Kind::Type_Kind_STRING);
+      break;
+    case orc::ORCTypeKind::VARCHAR:
+      typ->set_kind(::orc::proto::Type_Kind::Type_Kind_VARCHAR);
+      typ->set_maximumlength(type->getMaximumLength());
+      break;
+    case orc::ORCTypeKind::CHAR:
+      typ->set_kind(::orc::proto::Type_Kind::Type_Kind_CHAR);
+      typ->set_maximumlength(type->getMaximumLength());
+      break;
+    default:
+      LOG_ERROR(ERRCODE_INTERNAL_ERROR, "unknown string/varchar/char type %d",
+                type->getKind());
+  }
+}
+
+// Picks the column encoding from the RLE version and from whether dictionary
+// encoding is in use (DICTIONARY* versus DIRECT*). Any other version is
+// reported through LOG_ERROR.
+orc::proto::ColumnEncoding_Kind StringColumnWriter::getProtoColumnEncoding() {
+  switch (version) {
+    case RleVersion::RleVersion_0:
+      return useDictionaryEncoding
+                 ? orc::proto::ColumnEncoding_Kind_DICTIONARY_V0
+                 : orc::proto::ColumnEncoding_Kind_DIRECT_V0;
+    case RleVersion::RleVersion_1:
+      return useDictionaryEncoding
+                 ? orc::proto::ColumnEncoding_Kind_DICTIONARY
+                 : orc::proto::ColumnEncoding_Kind_DIRECT;
+    case RleVersion::RleVersion_2:
+      return useDictionaryEncoding
+                 ? orc::proto::ColumnEncoding_Kind_DICTIONARY_V2
+                 : orc::proto::ColumnEncoding_Kind_DIRECT_V2;
+    default:
+      if (useDictionaryEncoding) {
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "Version %u not supported",
+                  version);
+      } else {
+        LOG_ERROR(ERRCODE_FEATURE_NOT_SUPPORTED, "VersionKind %u not supported",
+                  version);
+      }
+  }
+}
+
+// Runs the base-class bloom filter step, settles the encoding decision if it
+// is still pending, and flushes buffered dictionary rows when the column
+// turned out to use direct encoding.
+void StringColumnWriter::addBloomFilterEntry() {
+  ColumnWriter::addBloomFilterEntry();
+  checkDictionaryEncoding();
+  if (useDictionaryEncoding) return;
+  if (rows.size() > 0) flushDictionary();
+}
+
+// Decides once whether dictionary encoding is kept. It is kept when the ratio
+// of dictionary entries to buffered rows does not exceed
+// dictionaryKeySizeThreshold; with no buffered rows the ratio counts as 0.
+void StringColumnWriter::checkDictionaryEncoding() {
+  if (doneDictionaryCheck) return;
+
+  double ratio = 0;
+  if (rows.size() > 0) {
+    ratio = static_cast<double>(dictionary.size()) /
+            static_cast<double>(rows.size());
+  }
+  useDictionaryEncoding = ratio <= dictionaryKeySizeThreshold;
+  doneDictionaryCheck = true;
+}
+
+// Moves the buffered dictionary rows into the output buffers and clears
+// `rows`.
+//
+// dictionary.dump() fills `vals` / `lens` with the dictionary entries and
+// `dumpOrder` with a table that is indexed by the ids stored in `rows` and
+// yields positions in `vals` / `lens`.
+//   dictionary encoding: every entry goes to the dictionary data stream, its
+//     length to the length coder, and the translated row positions to the
+//     row coder.
+//   direct encoding: each buffered row is expanded back into its bytes and
+//     length on the direct data stream and the length coder.
+//
+// The translation loop uses a 64-bit unsigned index so that it matches the
+// type of the row count instead of comparing a signed int against it.
+void StringColumnWriter::flushDictionary() {
+  std::vector<const char *> vals;
+  std::vector<uint64_t> lens;
+  std::vector<uint32_t> dumpOrder;
+  dictionary.dump(&vals, &lens, &dumpOrder);
+
+  const uint64_t rowSize = rows.size();
+  std::vector<uint64_t> position(rowSize);
+  for (uint64_t i = 0; i < rowSize; ++i) {
+    position[i] = dumpOrder[rows[i]];
+  }
+
+  if (useDictionaryEncoding) {
+    const uint64_t dictSize = lens.size();
+    for (uint64_t i = 0; i < dictSize; ++i) {
+      dictDataBufferedStream->write(vals[i], lens[i]);
+    }
+    lengthRleCoder->write(lens.data(), dictSize, nullptr);
+    rowsRleCoder->write(position.data(), rowSize, nullptr);
+  } else {
+    for (uint64_t i = 0; i < rowSize; ++i) {
+      directDataBufferedStream->write(vals[position[i]], lens[position[i]]);
+      lengthRleCoder->write(&lens[position[i]], 1, nullptr);
+    }
+  }
+
+  rows.clear();
+}
+
+// Returns the writer to an empty state: base-class state first, then the
+// dictionary, the coders, the data streams and the stripe statistics.
+void StringColumnWriter::reset() {
+  ColumnWriter::reset();
+
+  dictionary.clear();
+
+  // coders
+  lengthRleCoder->reset();
+  rowsRleCoder->reset();
+
+  // data streams
+  dictDataBufferedStream->reset();
+  directDataBufferedStream->reset();
+
+  stripeColStats->reset();
+}
+
+} // namespace orc
diff --git a/depends/storage/src/storage/testutil/file-utils.h b/depends/storage/src/storage/testutil/file-utils.h
new file mode 100644
index 0000000..c207c04
--- /dev/null
+++ b/depends/storage/src/storage/testutil/file-utils.h
@@ -0,0 +1,54 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_TESTUTIL_FILE_UTILS_H_
+#define STORAGE_SRC_STORAGE_TESTUTIL_FILE_UTILS_H_
+
+#include <fstream>
+#include <iostream>
+#include <string>
+
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/filesystem/local/local-file-system.h"
+#include "dbcommon/testutil/tuple-batch-utils.h"
+#include "dbcommon/utils/string-util.h"
+
+namespace storage {
+
+// Test helper that creates and removes files through a dbcommon::FileSystem.
+// The file system pointer is only borrowed; the destructor does not release
+// it.
+class FileUtility {
+ public:
+  explicit FileUtility(dbcommon::FileSystem *fs) : fs(fs) {}
+  ~FileUtility() {}
+
+  // Opens `fileName` with O_WRONLY on the wrapped file system and writes
+  // `content` to it.
+  void createFile(const std::string &fileName, const std::string &content) {
+    const char *path = fileName.c_str();
+    std::unique_ptr<dbcommon::File> handle = fs->open(path, O_WRONLY);
+    fs->write(handle.get(), content.c_str(), content.length());
+  }
+
+  // Removes `fileName` from the wrapped file system.
+  void dropFile(const std::string &fileName) {
+    const char *path = fileName.c_str();
+    fs->remove(path);
+  }
+
+ private:
+  dbcommon::FileSystem *fs = nullptr;
+};
+
+} // namespace storage
+
+#endif // STORAGE_SRC_STORAGE_TESTUTIL_FILE_UTILS_H_
diff --git a/depends/storage/src/storage/testutil/format-util.h b/depends/storage/src/storage/testutil/format-util.h
new file mode 100644
index 0000000..722bbb4
--- /dev/null
+++ b/depends/storage/src/storage/testutil/format-util.h
@@ -0,0 +1,288 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#ifndef STORAGE_SRC_STORAGE_TESTUTIL_FORMAT_UTIL_H_
+#define STORAGE_SRC_STORAGE_TESTUTIL_FORMAT_UTIL_H_
+
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/filesystem/local/local-file-system.h"
+#include "dbcommon/log/logger.h"
+#include "dbcommon/testutil/tuple-batch-utils.h"
+#include "dbcommon/utils/global.h"
+
+#include "storage/format/format.h"
+#include "univplan/testutil/univplan-proto-util.h"
+
+namespace storage {
+
+class FormatUtility {
+ public:
+ FormatUtility() {}
+ ~FormatUtility() {}
+
+  /**
+   * Create a Format instance for the given format name (compared
+   * case-insensitively). Only "orc" is supported; every other name,
+   * including "fast", is reported through LOG_ERROR with
+   * ERRCODE_INVALID_PARAMETER_VALUE.
+   *
+   * @param fmt    format name
+   * @param params parameters handed to the format constructor
+   *               (defaults to nullptr)
+   * @return the newly created format
+   */
+  std::unique_ptr<storage::Format> createFormat(
+      const std::string &fmt, dbcommon::Parameters *params = nullptr) {
+    std::unique_ptr<storage::Format> format;
+
+    if (dbcommon::StringUtil::iequals(fmt, "orc")) {
+      format.reset(new ORCFormat(params));
+    } else {
+      LOG_ERROR(ERRCODE_INVALID_PARAMETER_VALUE, "invalid format: %s",
+                fmt.c_str());
+    }
+
+    // Return the local by name: it is moved implicitly, whereas wrapping it
+    // in std::move() would only inhibit copy elision.
+    return format;
+  }
+
+  /**
+   * Round-trip check: write one tuple batch to a local file in the given
+   * format, scan it back with an optional predicate and compare the textual
+   * form of what was read with what was written.
+   *
+   * The target file is removed before the write and again after the check.
+   *
+   * @param fmt            format name understood by createFormat()
+   * @param desc           tuple description used for both insert and scan
+   * @param tb             batch to write; it is moved into doInsert()
+   * @param localPath      local path of the data file (without "file://")
+   * @param paramsStr      value for "table.options"; when empty a default of
+   *                       compresstype none, rlecoder v2, dicthreshold 0.5
+   *                       is used
+   * @param predicateExprs predicate list handed to beginScan()
+   * @param shouldSkip     when true the scan is expected to yield no batch
+   * @throws dbcommon::TransactionAbortException if shouldSkip is false and
+   *         the scan yields no batch
+   */
+  void writeThenReadCompare(
+      const std::string &fmt, dbcommon::TupleDesc *desc,
+      dbcommon::TupleBatch::uptr tb, const std::string &localPath,
+      std::string paramsStr,
+      const univplan::UnivPlanExprPolyList *predicateExprs, bool shouldSkip) {
+    EXPECT_EQ(tb->isValid(), true);
+    dbcommon::Parameters params;
+    if (paramsStr.size() > 0)
+      params.set("table.options", paramsStr);
+    else
+      params.set("table.options",
+                 "{\"compresstype\":\"none\",\"rlecoder\":\"v2\", "
+                 "\"dicthreshold\":\"0.5\"}");
+    std::unique_ptr<storage::Format> format = createFormat(fmt, &params);
+
+    format->setFileSystemManager(&FSManager);
+
+    // Snapshot the batch before it is moved into the writer.
+    std::string beforeInsert = tb->toString();
+
+    std::string fullFileName = "file://" + localPath;
+    std::string fileName(localPath);
+    dbcommon::FileSystem *fs = FSManager.get(fullFileName.c_str());
+
+    // Start from a clean slate in case an earlier run left the file behind.
+    if (fs->exists(fileName.c_str())) {
+      fs->remove(fileName.c_str());
+    }
+    format->beginInsert(fullFileName, *desc);
+    format->doInsert(std::move(tb));
+    format->endInsert();
+
+    std::vector<std::unique_ptr<Input> > files;
+    std::unique_ptr<dbcommon::FileInfo> info =
+        fs->getFileInfo(fileName.c_str());
+
+    std::unique_ptr<Input> file(
+        new FileInput(fullFileName.c_str(), info->size));
+    files.push_back(std::move(file));
+    std::unique_ptr<univplan::UnivPlanScanFileSplitListList> tasks =
+        format->createTasks(files, 1);
+    format->beginScan(tasks.get(), desc, nullptr, predicateExprs, nullptr,
+                      false);
+
+    dbcommon::TupleBatch::uptr result = format->next();
+    if (shouldSkip) {
+      EXPECT_EQ(result.get(), nullptr);
+      // Pair beginScan() with endScan() on this path as well, so the scan
+      // is closed before the data file is removed below.
+      format->endScan();
+    } else {
+      if (result.get() == nullptr)
+        throw dbcommon::TransactionAbortException("got zero row",
+                                                  ERRCODE_DATA_EXCEPTION);
+      EXPECT_EQ(result->isValid(), true);
+
+      format->endScan();
+
+      std::string afterInsert = result->toString();
+
+      EXPECT_EQ(beforeInsert, afterInsert);
+    }
+    if (fs->exists(fileName.c_str())) {
+      fs->remove(fileName.c_str());
+    }
+  }
+  // Convenience overload: empty table-options string, no predicate, and
+  // the batch is expected to be read back (shouldSkip = false).
+  void writeThenReadCompare(const std::string &fmt, dbcommon::TupleDesc *desc,
+                            dbcommon::TupleBatch::uptr tb,
+                            const std::string &localPath) {
+    const bool shouldSkip = false;
+    writeThenReadCompare(fmt, desc, std::move(tb), localPath, std::string(""),
+                         nullptr, shouldSkip);
+  }
+
+  // Convenience overload: caller-supplied table-options string, no
+  // predicate, and the batch is expected to be read back.
+  void writeThenReadCompare(const std::string &fmt, dbcommon::TupleDesc *desc,
+                            dbcommon::TupleBatch::uptr tb,
+                            const std::string &localPath,
+                            std::string paramsStr) {
+    const bool shouldSkip = false;
+    writeThenReadCompare(fmt, desc, std::move(tb), localPath, paramsStr,
+                         nullptr, shouldSkip);
+  }
+
+  // Convenience overload: empty table-options string with a caller-supplied
+  // predicate and skip expectation.
+  void writeThenReadCompare(
+      const std::string &fmt, dbcommon::TupleDesc *desc,
+      dbcommon::TupleBatch::uptr tb, const std::string &localPath,
+      const univplan::UnivPlanExprPolyList *predicateExprs, bool shouldSkip) {
+    const std::string noOptions("");
+    writeThenReadCompare(fmt, desc, std::move(tb), localPath, noOptions,
+                         predicateExprs, shouldSkip);
+  }
+
+  /**
+   * Write `number` generated tuples in batches of at most `step` rows, then
+   * scan the file back and compare every returned batch with a freshly
+   * generated batch covering the same row range.
+   *
+   * The format is configured with snappy compression, RLE v2 and
+   * "bloomfilter" set to "0". An existing file at localPath is removed
+   * before the write; the file is left in place afterwards.
+   *
+   * @param fmt                format name understood by createFormat()
+   * @param pattern            column pattern for generateTupleDesc()
+   * @param start              first row index to generate
+   * @param step               maximum number of rows per inserted batch
+   * @param number             total number of rows to write and read back
+   * @param nTupleReadPerBatch value of "number.tuples.per.batch"
+   * @param blockAlignSize     value of "format.block.align.size"
+   * @param localPath          local path of the data file (no "file://")
+   */
+  void multiBlockTest(const std::string &fmt, const std::string &pattern,
+                      uint64_t start, uint64_t step, uint64_t number,
+                      uint64_t nTupleReadPerBatch, uint64_t blockAlignSize,
+                      const std::string &localPath) {
+    dbcommon::Parameters params;
+    params.set("number.tuples.per.batch", std::to_string(nTupleReadPerBatch));
+    params.set("format.block.align.size", std::to_string(blockAlignSize));
+    params.set("table.options",
+               "{\"compresstype\":\"snappy\",\"rlecoder\":\"v2\","
+               "\"bloomfilter\":\"0\"}");
+    std::unique_ptr<storage::Format> format = createFormat(fmt, &params);
+
+    format->setFileSystemManager(&FSManager);
+
+    dbcommon::TupleBatchUtility tbu;
+    dbcommon::TupleDesc::uptr desc = tbu.generateTupleDesc(pattern);
+
+    std::string fullFileName = "file://" + localPath;
+    std::string fileName(localPath);
+    dbcommon::FileSystem *fs = FSManager.get(fullFileName.c_str());
+
+    if (fs->exists(fileName.c_str())) {
+      fs->remove(fileName.c_str());
+    }
+
+    format->beginInsert(fullFileName, *desc);
+
+    uint64_t writeEnd = start + number;
+    for (uint64_t i = start; i < writeEnd; i += step) {
+      // The last batch may be shorter than step.
+      uint64_t numToAdd = (i + step > writeEnd ? writeEnd - i : step);
+      dbcommon::TupleBatch::uptr tb =
+          tbu.generateTupleBatch(*desc, i, numToAdd);
+      EXPECT_EQ(tb->isValid(), true);
+      format->doInsert(std::move(tb));
+    }
+    format->endInsert();
+
+    std::vector<std::unique_ptr<storage::Input> > files;
+    std::unique_ptr<dbcommon::FileInfo> info =
+        fs->getFileInfo(fileName.c_str());
+
+    std::unique_ptr<storage::Input> file(
+        new storage::FileInput(fullFileName.c_str(), info->size));
+    files.push_back(std::move(file));
+    std::unique_ptr<univplan::UnivPlanScanFileSplitListList> tasks =
+        format->createTasks(files, 1);
+    format->beginScan(tasks.get(), desc.get(), nullptr, nullptr, nullptr,
+                      false);
+
+    uint64_t totalRead = 0;
+    uint64_t startFrom = start;
+    dbcommon::TupleBatch::uptr result;
+    while (totalRead < number) {
+      result = format->next();
+      if (result == nullptr) {
+        // The scan ran dry before every row came back: report a test
+        // failure instead of dereferencing a null batch.
+        ADD_FAILURE() << "scan ended after " << totalRead << " of " << number
+                      << " rows";
+        break;
+      }
+      EXPECT_EQ(result->isValid(), true);
+      dbcommon::TupleBatch::uptr tb =
+          tbu.generateTupleBatch(*desc, startFrom, result->getNumOfRows());
+      ASSERT_EQ(result->toString(), tb->toString());
+      totalRead += result->getNumOfRows();
+      startFrom += result->getNumOfRows();
+      result.reset(nullptr);
+    }
+
+    EXPECT_EQ(totalRead, number);
+
+    format->endScan();
+  }
+
+  /**
+   * Same round trip as the overload without hasNull, but `hasNull` is
+   * forwarded to generateTupleBatch() for both the written batches and the
+   * expected batches.
+   *
+   * The format is configured with snappy compression, RLE v2 and
+   * "bloomfilter" set to "0". An existing file at localPath is removed
+   * before the write; the file is left in place afterwards.
+   *
+   * @param fmt                format name understood by createFormat()
+   * @param pattern            column pattern for generateTupleDesc()
+   * @param start              first row index to generate
+   * @param step               maximum number of rows per inserted batch
+   * @param number             total number of rows to write and read back
+   * @param nTupleReadPerBatch value of "number.tuples.per.batch"
+   * @param blockAlignSize     value of "format.block.align.size"
+   * @param localPath          local path of the data file (no "file://")
+   * @param hasNull            passed through to generateTupleBatch()
+   */
+  void multiBlockTest(const std::string &fmt, const std::string &pattern,
+                      uint64_t start, uint64_t step, uint64_t number,
+                      uint64_t nTupleReadPerBatch, uint64_t blockAlignSize,
+                      const std::string &localPath, bool hasNull) {
+    dbcommon::Parameters params;
+    params.set("number.tuples.per.batch", std::to_string(nTupleReadPerBatch));
+    params.set("format.block.align.size", std::to_string(blockAlignSize));
+    params.set("table.options",
+               "{\"compresstype\":\"snappy\",\"rlecoder\":\"v2\","
+               "\"bloomfilter\":\"0\"}");
+    std::unique_ptr<storage::Format> format = createFormat(fmt, &params);
+
+    format->setFileSystemManager(&FSManager);
+
+    dbcommon::TupleBatchUtility tbu;
+    dbcommon::TupleDesc::uptr desc = tbu.generateTupleDesc(pattern);
+
+    std::string fullFileName = "file://" + localPath;
+    std::string fileName(localPath);
+    dbcommon::FileSystem *fs = FSManager.get(fullFileName.c_str());
+
+    if (fs->exists(fileName.c_str())) {
+      fs->remove(fileName.c_str());
+    }
+
+    format->beginInsert(fullFileName, *desc);
+
+    uint64_t writeEnd = start + number;
+    for (uint64_t i = start; i < writeEnd; i += step) {
+      // The last batch may be shorter than step.
+      uint64_t numToAdd = (i + step > writeEnd ? writeEnd - i : step);
+      dbcommon::TupleBatch::uptr tb =
+          tbu.generateTupleBatch(*desc, i, numToAdd, hasNull);
+      EXPECT_EQ(tb->isValid(), true);
+      format->doInsert(std::move(tb));
+    }
+    format->endInsert();
+
+    std::vector<std::unique_ptr<storage::Input> > files;
+    std::unique_ptr<dbcommon::FileInfo> info =
+        fs->getFileInfo(fileName.c_str());
+
+    std::unique_ptr<storage::Input> file(
+        new storage::FileInput(fullFileName.c_str(), info->size));
+    files.push_back(std::move(file));
+    std::unique_ptr<univplan::UnivPlanScanFileSplitListList> tasks =
+        format->createTasks(files, 1);
+    format->beginScan(tasks.get(), desc.get(), nullptr, nullptr, nullptr,
+                      false);
+
+    uint64_t totalRead = 0;
+    uint64_t startFrom = start;
+    dbcommon::TupleBatch::uptr result;
+    while (totalRead < number) {
+      result = format->next();
+      if (result == nullptr) {
+        // The scan ran dry before every row came back: report a test
+        // failure instead of dereferencing a null batch.
+        ADD_FAILURE() << "scan ended after " << totalRead << " of " << number
+                      << " rows";
+        break;
+      }
+      EXPECT_EQ(result->isValid(), true);
+      dbcommon::TupleBatch::uptr tb = tbu.generateTupleBatch(
+          *desc, startFrom, result->getNumOfRows(), hasNull);
+      ASSERT_EQ(result->toString(), tb->toString());
+      totalRead += result->getNumOfRows();
+      startFrom += result->getNumOfRows();
+      result.reset(nullptr);
+    }
+
+    EXPECT_EQ(totalRead, number);
+
+    format->endScan();
+  }
+};
+
+} // end of namespace storage
+
+#endif // STORAGE_SRC_STORAGE_TESTUTIL_FORMAT_UTIL_H_
diff --git a/depends/storage/test/CMakeLists.txt b/depends/storage/test/CMakeLists.txt
new file mode 100644
index 0000000..2092b56
--- /dev/null
+++ b/depends/storage/test/CMakeLists.txt
@@ -0,0 +1,32 @@
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+
+# Tests are built with the Debug configuration. DATA_DIR points the test
+# sources at the fixture directory, which is also the working directory of
+# the run targets below.
+SET(CMAKE_BUILD_TYPE "Debug")
+SET(TEST_WORKING_DIR ${CMAKE_CURRENT_SOURCE_DIR}/data/)
+ADD_DEFINITIONS(-DDATA_DIR="${TEST_WORKING_DIR}/")
+# -fno-access-control lets test code reach non-public members.
+SET(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fno-access-control")
+
+INCLUDE_DIRECTORIES(${storage_ROOT_DIR})
+INCLUDE_DIRECTORIES(${CMAKE_BINARY_DIR}/src)
+INCLUDE_DIRECTORIES(${DEPENDENCY_INSTALL_PREFIX}/package/include)
+INCLUDE_DIRECTORIES(/usr/local/include)
+LINK_DIRECTORIES(/usr/local/lib)
+
+ADD_SUBDIRECTORY(unit)
+
+# TEST_RUNNER is an optional command prefix for the test binary; split it
+# into a list so each word becomes its own argument.
+IF(TEST_RUNNER)
+  SEPARATE_ARGUMENTS(TEST_RUNNER_LIST UNIX_COMMAND ${TEST_RUNNER})
+ENDIF()
+
+# `make unittest`: build the unit binary and run it from the data directory.
+# VERBATIM keeps argument escaping identical across generators/platforms.
+ADD_CUSTOM_TARGET(unittest
+  COMMAND ${TEST_RUNNER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/unit/unit
+  DEPENDS unit
+  WORKING_DIRECTORY ${TEST_WORKING_DIR}
+  COMMENT "Run Unit Test..."
+  VERBATIM
+)
+
+# `make punittest`: same binary, launched through parallel-launcher.py.
+ADD_CUSTOM_TARGET(punittest
+  COMMAND ${CMAKE_CURRENT_SOURCE_DIR}/parallel/parallel-launcher.py ${TEST_RUNNER_LIST} ${CMAKE_CURRENT_BINARY_DIR}/unit/unit
+  DEPENDS unit
+  WORKING_DIRECTORY ${TEST_WORKING_DIR}
+  COMMENT "Run Unit Test in parallel..."
+  VERBATIM
+)
diff --git a/depends/storage/test/data/hawq-write-orc.sql b/depends/storage/test/data/hawq-write-orc.sql
new file mode 100644
index 0000000..b6558f3
--- /dev/null
+++ b/depends/storage/test/data/hawq-write-orc.sql
@@ -0,0 +1,3 @@
+-- Recreate the writable external ORC table and load four rows whose i4
+-- values are 1, 2, 4 and 5.
+drop external table if exists testorc;
+CREATE WRITABLE EXTERNAL TABLE testorc (i2 int2, i4 int4, i8 int8, f4 float4, f8 float8, text TEXT) LOCATION ('hdfs://localhost:8020/testorc') FORMAT 'orc';
+insert INTO testorc values (1,1,3,5.1,6.1,'aaaaaaa'), (1,2,3,5.1,6.1,'aaaaaaa'), (1,4,3,5.1,6.1,'aaaaaaa'), (1,5,3,5.1,6.1,'aaaaaaa');
diff --git a/depends/storage/test/data/sampledata b/depends/storage/test/data/sampledata
new file mode 100644
index 0000000..e69de29
--- /dev/null
+++ b/depends/storage/test/data/sampledata
diff --git a/depends/storage/test/data/spark-read-orc.sql b/depends/storage/test/data/spark-read-orc.sql
new file mode 100644
index 0000000..8b769cd
--- /dev/null
+++ b/depends/storage/test/data/spark-read-orc.sql
@@ -0,0 +1,3 @@
+-- Map the ORC directory as an external table with a bloom filter on i4,
+-- then probe it with two equality lookups on that column.
+CREATE EXTERNAL TABLE testorc (i2 short, i4 int, i8 long, f4 float, f8 double, text string) stored as orc LOCATION 'hdfs://localhost:9000/testorc' TBLPROPERTIES ("orc.bloom.filter.columns"="i4");
+select * from testorc where i4=3;
+select * from testorc where i4=4;
diff --git a/depends/storage/test/parallel/parallel-launcher.py b/depends/storage/test/parallel/parallel-launcher.py
new file mode 100755
index 0000000..f572f7b
--- /dev/null
+++ b/depends/storage/test/parallel/parallel-launcher.py
@@ -0,0 +1,153 @@
+#!/usr/bin/python
+# Copyright (c) 2010 The Chromium Authors. All rights reserved.
+# Use of this source code is governed by a BSD-style license that can be
+# found in the LICENSE file.
+
+"""
+This tool launches several shards of a gtest-based binary
+in parallel on a local machine.
+
+Example usage:
+
+parallel-launcher.py path/to/base_unittests
+"""
+
+import optparse
+import os
+import subprocess
+import sys
+import threading
+import time
+
+
+def StreamCopyWindows(stream_from, stream_to):
+ """Copies stream_from to stream_to."""
+
+ while True:
+ buf = stream_from.read(1024)
+ if not buf:
+ break
+ stream_to.write(buf)
+ stream_to.flush()
+
+def StreamCopyPosix(stream_from, stream_to, child_exited):
+ """
+ Copies stream_from to stream_to, and exits if child_exited
+ is signaled.
+ """
+
+ import fcntl
+
+ # Put the source stream in a non-blocking mode, so we can check
+ # child_exited when there is no data.
+ fd = stream_from.fileno()
+ fl = fcntl.fcntl(fd, fcntl.F_GETFL)
+ fcntl.fcntl(fd, fcntl.F_SETFL, fl | os.O_NONBLOCK)
+
+ while True:
+ try:
+ buf = os.read(fd, 1024)
+ except OSError, e:
+ if e.errno == 11 or e.errno == 35:
+ if child_exited.isSet():
+ break
+ time.sleep(0.1)
+ continue
+ raise
+ if not buf:
+ break
+ stream_to.write(buf)
+ stream_to.flush()
+
+class TestLauncher(object):
+ def __init__(self, args, executable, num_shards, shard):
+ self._args = args
+ self._executable = executable
+ self._num_shards = num_shards
+ self._shard = shard
+ self._test = None
+
+ def launch(self):
+ env = os.environ.copy()
+
+ env['CHROME_LOG_FILE'] = 'chrome_log_%d' % self._shard
+
+ if 'GTEST_TOTAL_SHARDS' in env:
+ # Handle the requested sharding transparently.
+ outer_shards = int(env['GTEST_TOTAL_SHARDS'])
+ outer_index = int(env['GTEST_SHARD_INDEX'])
+
+ env['GTEST_TOTAL_SHARDS'] = str(self._num_shards * outer_shards)
+
+ # Calculate the right shard index to pass to the child. This is going
+ # to be a shard of a shard.
+ env['GTEST_SHARD_INDEX'] = str((self._num_shards * outer_index) +
+ self._shard)
+ else:
+ env['GTEST_TOTAL_SHARDS'] = str(self._num_shards)
+ env['GTEST_SHARD_INDEX'] = str(self._shard)
+
+ args = self._args + ['--test-server-shard=' + str(self._shard)]
+
+ self._test = subprocess.Popen(args=args,
+ executable=self._executable,
+ stdout=subprocess.PIPE,
+ stderr=subprocess.STDOUT,
+ env=env)
+ def wait(self):
+ if subprocess.mswindows:
+ stdout_thread = threading.Thread(
+ target=StreamCopyWindows,
+ args=[self._test.stdout, sys.stdout])
+ stdout_thread.start()
+ code = self._test.wait()
+ stdout_thread.join()
+ return code
+ else:
+ child_exited = threading.Event()
+ stdout_thread = threading.Thread(
+ target=StreamCopyPosix,
+ args=[self._test.stdout, sys.stdout, child_exited])
+ stdout_thread.start()
+ code = self._test.wait()
+ child_exited.set()
+ stdout_thread.join()
+ return code
+
+def main(argv):
+ parser = optparse.OptionParser()
+ parser.add_option("--shards", type="int", dest="shards", default=16)
+
+ # Make it possible to pass options to the launched process.
+ # Options for parallel_launcher should be first, then the binary path,
+ # and finally - optional arguments for the launched binary.
+ parser.disable_interspersed_args()
+
+ options, args = parser.parse_args(argv)
+
+ if not args:
+ print 'You must provide path to the test binary'
+ return 1
+
+ env = os.environ
+ if bool('GTEST_TOTAL_SHARDS' in env) != bool('GTEST_SHARD_INDEX' in env):
+ print 'Inconsistent environment. GTEST_TOTAL_SHARDS and GTEST_SHARD_INDEX'
+ print 'should either be both defined, or both undefined.'
+ return 1
+
+ launchers = []
+
+ for shard in range(options.shards):
+ launcher = TestLauncher(args, args[0], options.shards, shard)
+ launcher.launch()
+ launchers.append(launcher)
+
+ return_code = 0
+ for launcher in launchers:
+ if launcher.wait() != 0:
+ return_code = 1
+
+ return return_code
+
+if __name__ == "__main__":
+ sys.exit(main(sys.argv[1:]))
diff --git a/depends/storage/test/unit/CMakeLists.txt b/depends/storage/test/unit/CMakeLists.txt
new file mode 100644
index 0000000..e57aaeb
--- /dev/null
+++ b/depends/storage/test/unit/CMakeLists.txt
@@ -0,0 +1,23 @@
+# Builds the `unit` gtest executable from every *.cc file found under this
+# directory.
+CMAKE_MINIMUM_REQUIRED(VERSION 2.8)
+
+# Compile the test sources with TESTONLY defined.
+ADD_DEFINITIONS(-DTESTONLY)
+
+# AUTO_SOURCES is a project macro defined outside this file; presumably it
+# collects the matching files recursively into unit_SOURCES - confirm
+# against its definition.
+AUTO_SOURCES(unit_SOURCES "*.cc" "RECURSE" ${CMAKE_CURRENT_SOURCE_DIR})
+
+FIND_PACKAGE(GTest REQUIRED)
+FIND_PACKAGE(Protobuf REQUIRED)
+FIND_PACKAGE(JSON REQUIRED)
+FIND_PACKAGE(Snappy REQUIRED)
+FIND_PACKAGE(ZLIB REQUIRED)
+
+INCLUDE_DIRECTORIES(${storage_ROOT_DIR}/test/unit)
+INCLUDE_DIRECTORIES(${GTEST_INCLUDE_DIRS})
+
+LINK_DIRECTORIES(${CMAKE_BINARY_DIR}/src)
+LINK_DIRECTORIES(${DEPENDENCY_INSTALL_PREFIX}/package/lib)
+
+# EXCLUDE_FROM_ALL: the binary is only built when a target that depends on
+# it (or the `unit` target itself) is requested.
+ADD_EXECUTABLE(unit EXCLUDE_FROM_ALL
+ ${unit_SOURCES}
+)
+
+# gtest and gmock are linked by library name, not through variables set by
+# FIND_PACKAGE(GTest).
+target_link_libraries(unit storage-shared gtest gmock)
diff --git a/depends/storage/test/unit/common/test-bloom-filter.cc b/depends/storage/test/unit/common/test-bloom-filter.cc
new file mode 100644
index 0000000..c90810f
--- /dev/null
+++ b/depends/storage/test/unit/common/test-bloom-filter.cc
@@ -0,0 +1,118 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "gtest/gtest.h"
+
+#include "dbcommon/utils/macro.h"
+
+#include "storage/common/bloom-filter.h"
+
+namespace storage {
+
+// Every integer that was added (duplicates and negatives included) must be
+// reported as present.
+TEST(TestBloomFilter, TestInteger) {
+  BloomFilter filter(6);
+  const int inserted[] = {1, 9, 3, 9, -2, 2, 0};
+  for (int value : inserted) filter.addInt(value);
+
+  const int probes[] = {-2, 0, 1, 2, 3, 9};
+  for (int value : probes) EXPECT_TRUE(filter.testInt(value));
+}
+
+// Fills a filter sized for `num` entries with 0 .. num - 1, then probes both
+// ends of that range and one value just outside each end.
+TEST(TestBloomFilter, TestLotsOfInteger) {
+  int num = DEFAULT_NUMBER_TUPLES_PER_BATCH * 5;
+  BloomFilter bloomFilter(num);
+  for (int i = 0; i < num; ++i) bloomFilter.addInt(i);
+
+  // The boundary probes are derived from num, so they follow
+  // DEFAULT_NUMBER_TUPLES_PER_BATCH instead of assuming a fixed batch size.
+  EXPECT_EQ(bloomFilter.testInt(num - 1), true);
+  EXPECT_EQ(bloomFilter.testInt(0), true);
+  EXPECT_EQ(bloomFilter.testInt(num), false);
+  EXPECT_EQ(bloomFilter.testInt(-1), false);
+}
+
+// Every double that was added (zero twice, negatives included) must be
+// reported as present.
+TEST(TestBloomFilter, TestDouble) {
+  BloomFilter filter(6);
+  const double inserted[] = {0, -1, 1.2, 2.3, -2.5, 0, 100.446};
+  for (double value : inserted) filter.addDouble(value);
+
+  const double probes[] = {-2.5, -1, 0, 1.2, 2.3, 100.446};
+  for (double value : probes) EXPECT_TRUE(filter.testDouble(value));
+}
+
+// Every short string that was added ("cd" twice) must be reported as
+// present.
+TEST(TestBloomFilter, TestString) {
+  struct Entry {
+    const char *text;
+    int length;
+  };
+
+  BloomFilter filter(6);
+  const Entry inserted[] = {{"abc", 3}, {"cd", 2}, {"oushu", 5}, {"a", 1},
+                            {"mn", 2},  {"ab", 2}, {"cd", 2}};
+  for (const Entry &e : inserted) filter.addString(e.text, e.length);
+
+  const Entry probes[] = {{"abc", 3}, {"cd", 2}, {"oushu", 5},
+                          {"a", 1},   {"mn", 2}, {"ab", 2}};
+  for (const Entry &e : probes) EXPECT_TRUE(filter.testString(e.text, e.length));
+}
+
+// Same membership check with strings of 23 and 25 characters mixed with a
+// short one.
+TEST(TestBloomFilter, TestLongString) {
+  struct Entry {
+    const char *text;
+    int length;
+  };
+
+  BloomFilter filter(4);
+  const Entry inserted[] = {{"oushuoushu***oushuoushu", 23},
+                            {"oushuoushu1***oushuoushu1", 25},
+                            {"oushuoushu2***oushuoushu2", 25},
+                            {"oushu", 5}};
+  for (const Entry &e : inserted) filter.addString(e.text, e.length);
+
+  const Entry probes[] = {{"oushu", 5},
+                          {"oushuoushu1***oushuoushu1", 25},
+                          {"oushuoushu2***oushuoushu2", 25},
+                          {"oushuoushu***oushuoushu", 23}};
+  for (const Entry &e : probes) EXPECT_TRUE(filter.testString(e.text, e.length));
+}
+
+// A filter rebuilt from another filter's bit set, size and hash-function
+// count must still report the strings added to the source filter.
+TEST(TestBloomFilter, TestDeserialize) {
+  BloomFilter source(2);
+  source.addString("Oushu", 5);
+  source.addString("Inc.", 4);
+
+  BloomFilter::uptr rebuilt(new BloomFilter(
+      source.getBitSet(), source.size(), source.getNumHashFunctions()));
+  EXPECT_TRUE(rebuilt->testString("Oushu", 5));
+  EXPECT_TRUE(rebuilt->testString("Inc.", 4));
+}
+
+} // namespace storage
diff --git a/depends/storage/test/unit/format/test-filter-pushdown.cc b/depends/storage/test/unit/format/test-filter-pushdown.cc
new file mode 100644
index 0000000..3dac28f
--- /dev/null
+++ b/depends/storage/test/unit/format/test-filter-pushdown.cc
@@ -0,0 +1,613 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/filesystem/local/local-file-system.h"
+#include "dbcommon/function/func-kind.cg.h"
+#include "dbcommon/log/logger.h"
+#include "dbcommon/testutil/tuple-batch-utils.h"
+#include "dbcommon/utils/parameters.h"
+#include "gtest/gtest.h"
+#include "storage/format/orc/orc-format.h"
+#include "storage/testutil/format-util.h"
+
+using namespace testing; // NOLINT
+
+namespace storage {
+
+// Runs the ORC write/read round trip twice for one predicate, writing to
+// /tmp/<casename>:
+//   1. a batch without nulls starting at row 0 (1 row when testNe is set,
+//      otherwise 32), with shouldSkip1 as the skip expectation;
+//   2. a batch with nulls starting at row 1 (2 rows when testNe is set,
+//      otherwise 30), with shouldSkip2 as the skip expectation.
+void generateTest(const std::string &pattern,
+                  const univplan::UnivPlanExprPolyList *predicateExprs,
+                  const char *casename, bool shouldSkip1 = false,
+                  bool shouldSkip2 = true, bool testNe = false) {
+  dbcommon::TupleBatchUtility batchUtil;
+  FormatUtility formatUtil;
+  const std::string path = std::string("/tmp/") + casename;
+
+  {
+    dbcommon::TupleDesc::uptr desc = batchUtil.generateTupleDesc(pattern);
+    const int rows = testNe ? 1 : 32;
+    dbcommon::TupleBatch::uptr tb =
+        batchUtil.generateTupleBatch(*desc, 0, rows, false);
+    formatUtil.writeThenReadCompare("orc", desc.get(), std::move(tb), path,
+                                    predicateExprs, shouldSkip1);
+    LOG_INFO("OK without nulls");
+  }
+
+  {
+    dbcommon::TupleDesc::uptr desc = batchUtil.generateTupleDesc(pattern);
+    const int rows = testNe ? 2 : 30;
+    dbcommon::TupleBatch::uptr tb =
+        batchUtil.generateTupleBatch(*desc, 1, rows, true);
+    formatUtil.writeThenReadCompare("orc", desc.get(), std::move(tb), path,
+                                    predicateExprs, shouldSkip2);
+    LOG_INFO("OK with nulls");
+  }
+}
+
+// Builds a sequential-scan plan whose qual list comes from
+// constructVarOpConstQualList(opFuncId, varType, buffer), logs the plan as
+// JSON and runs generateTest() with that qual list.
+//
+// pattern, casename, shouldSkip1, shouldSkip2 and testNe are forwarded to
+// generateTest() unchanged; varType is used for both type arguments of the
+// qual constructor and buffer is its constant argument.
+void testVarOpConst(const std::string &pattern, const char *casename,
+                    int32_t opFuncId, dbcommon::TypeKind varType,
+                    const char *buffer, bool shouldSkip1 = false,
+                    bool shouldSkip2 = true, bool testNe = false) {
+  univplan::UnivPlanProtoUtility upu;
+  // The returned id is not used here; the call is kept for its effect on
+  // the plan being built.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructVarOpConstQualList(-1, opFuncId, 1, 1, varType, varType, buffer);
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest(pattern, predicateExprs, casename, shouldSkip1, shouldSkip2,
+               testNe);
+}
+
+// Same flow as the Var-op-Const cases, but the qual list is built with
+// constructConstOpVarQualList (SMALLINT_EQUAL_INT, constant "0") on an "h"
+// pattern; generateTest() runs with its default skip expectations.
+TEST(TestORCFormat, TestFilterPushDown_ConstEqVar) {
+  univplan::UnivPlanProtoUtility upu;
+  // The returned id is not used here; the call is kept for its effect on
+  // the plan being built.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructConstOpVarQualList(-1, dbcommon::SMALLINT_EQUAL_INT, 1, 1,
+                                  dbcommon::SMALLINTID, dbcommon::SMALLINTID,
+                                  "0");
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("h", predicateExprs, "TestFilterPushDown_ConstEqVar");
+}
+
+// Equality cases, one per column type. Each passes a one-letter column
+// pattern, its own name as the file name under /tmp, an equality function
+// id, the column type id and the constant as text. The skip flags are left
+// at the testVarOpConst defaults (shouldSkip1 = false, shouldSkip2 = true).
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstSmallint) {
+ testVarOpConst("h", "TestFilterPushDown_VarEqConstSmallint",
+ dbcommon::SMALLINT_EQUAL_INT, dbcommon::SMALLINTID, "0");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstInt) {
+ testVarOpConst("i", "TestFilterPushDown_VarEqConstInt",
+ dbcommon::INT_EQUAL_INT, dbcommon::INTID, "0");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstBigint) {
+ testVarOpConst("l", "TestFilterPushDown_VarEqConstBigint",
+ dbcommon::BIGINT_EQUAL_INT, dbcommon::BIGINTID, "0");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstDouble) {
+ testVarOpConst("d", "TestFilterPushDown_VarEqConstDouble",
+ dbcommon::DOUBLE_EQUAL_DOUBLE, dbcommon::DOUBLEID, "0");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstFloat) {
+ testVarOpConst("f", "TestFilterPushDown_VarEqConstFloat",
+ dbcommon::FLOAT_EQUAL_DOUBLE, dbcommon::FLOATID, "0");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstString) {
+ testVarOpConst("s", "TestFilterPushDown_VarEqConstString",
+ dbcommon::STRING_EQUAL_STRING, dbcommon::STRINGID, "0");
+}
+
+// Date columns are compared with the INT function id and time columns with
+// the BIGINT one.
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstDate) {
+ testVarOpConst("D", "TestFilterPushDown_VarEqConstDate",
+ dbcommon::INT_EQUAL_INT, dbcommon::DATEID, "2018-01-10");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstTime) {
+ testVarOpConst("T", "TestFilterPushDown_VarEqConstTime",
+ dbcommon::BIGINT_EQUAL_BIGINT, dbcommon::TIMEID, "00:00:00");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarEqConstTimestamp) {
+ testVarOpConst("S", "TestFilterPushDown_VarEqConstTimestamp",
+ dbcommon::TIMESTAMP_EQUAL_TIMESTAMP, dbcommon::TIMESTAMPID,
+ "2018-01-10 15:16:01.123");
+}
+
+// Inequality cases, one per column type. All pass testNe = true, which
+// makes generateTest() write a 1-row batch without nulls and a 2-row batch
+// with nulls. Every case except Timestamp passes shouldSkip1 = true and
+// shouldSkip2 = false.
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstSmallint) {
+ testVarOpConst("h", "TestFilterPushDown_VarNeConstSmallint",
+ dbcommon::SMALLINT_NOT_EQUAL_INT, dbcommon::SMALLINTID, "0",
+ true, false, true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstInt) {
+ testVarOpConst("i", "TestFilterPushDown_VarNeConstInt",
+ dbcommon::INT_NOT_EQUAL_INT, dbcommon::INTID, "0", true, false,
+ true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstBigint) {
+ testVarOpConst("l", "TestFilterPushDown_VarNeConstBigint",
+ dbcommon::BIGINT_NOT_EQUAL_INT, dbcommon::BIGINTID, "0", true,
+ false, true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstDouble) {
+ testVarOpConst("d", "TestFilterPushDown_VarNeConstDouble",
+ dbcommon::DOUBLE_NOT_EQUAL_DOUBLE, dbcommon::DOUBLEID, "0",
+ true, false, true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstFloat) {
+ testVarOpConst("f", "TestFilterPushDown_VarNeConstFloat",
+ dbcommon::FLOAT_NOT_EQUAL_DOUBLE, dbcommon::FLOATID, "0", true,
+ false, true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstString) {
+ testVarOpConst("s", "TestFilterPushDown_VarNeConstString",
+ dbcommon::STRING_NOT_EQUAL_STRING, dbcommon::STRINGID, "0",
+ true, false, true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstDate) {
+ testVarOpConst("D", "TestFilterPushDown_VarNeConstDate",
+ dbcommon::INT_NOT_EQUAL_INT, dbcommon::DATEID, "2018-01-10",
+ true, false, true);
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstTime) {
+ testVarOpConst("T", "TestFilterPushDown_VarNeConstTime",
+ dbcommon::BIGINT_NOT_EQUAL_BIGINT, dbcommon::TIMEID,
+ "00:00:00", true, false, true);
+}
+
+// Unlike the other inequality cases this one passes shouldSkip1 = false.
+// NOTE(review): presumably intentional for the generated timestamp data -
+// confirm.
+TEST(TestORCFormat, TestFilterPushDown_VarNeConstTimestamp) {
+ testVarOpConst("S", "TestFilterPushDown_VarNeConstTimestamp",
+ dbcommon::TIMESTAMP_NOT_EQUAL_TIMESTAMP, dbcommon::TIMESTAMPID,
+ "2018-01-10 15:16:01.123", false, false, true);
+}
+
+// Less-than cases ("Lq" in the test names) for smallint, int, bigint and
+// double columns with the constant "1". The skip flags are left at the
+// testVarOpConst defaults (shouldSkip1 = false, shouldSkip2 = true).
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstSmallint) {
+ testVarOpConst("h", "TestFilterPushDown_VarLqConstSmallint",
+ dbcommon::SMALLINT_LESS_THAN_INT, dbcommon::SMALLINTID, "1");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstInt) {
+ testVarOpConst("i", "TestFilterPushDown_VarLqConstInt",
+ dbcommon::INT_LESS_THAN_INT, dbcommon::INTID, "1");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstBigint) {
+ testVarOpConst("l", "TestFilterPushDown_VarLqConstBigint",
+ dbcommon::BIGINT_LESS_THAN_INT, dbcommon::BIGINTID, "1");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstDouble) {
+ testVarOpConst("d", "TestFilterPushDown_VarLqConstDouble",
+ dbcommon::DOUBLE_LESS_THAN_DOUBLE, dbcommon::DOUBLEID, "1");
+}
+
+// Less-than case for a float column with the constant "1". The float
+// function id is used, matching the other float cases in this file
+// (FLOAT_EQUAL_DOUBLE, FLOAT_NOT_EQUAL_DOUBLE, FLOAT_GREATER_THAN_DOUBLE,
+// FLOAT_LESS_EQ_DOUBLE).
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstFloat) {
+  testVarOpConst("f", "TestFilterPushDown_VarLqConstFloat",
+                 dbcommon::FLOAT_LESS_THAN_DOUBLE, dbcommon::FLOATID, "1");
+}
+
+// Less-than cases for string, date, time and timestamp columns. The skip
+// flags are left at the testVarOpConst defaults (shouldSkip1 = false,
+// shouldSkip2 = true).
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstString) {
+ testVarOpConst("s", "TestFilterPushDown_VarLqConstString",
+ dbcommon::STRING_LESS_THAN_STRING, dbcommon::STRINGID, "1");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstDate) {
+ testVarOpConst("D", "TestFilterPushDown_VarLqConstDate",
+ dbcommon::INT_LESS_THAN_INT, dbcommon::DATEID, "2018-01-11");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstTime) {
+ testVarOpConst("T", "TestFilterPushDown_VarLqConstTime",
+ dbcommon::BIGINT_LESS_THAN_BIGINT, dbcommon::TIMEID,
+ "00:00:00.1");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarLqConstTimestamp) {
+ testVarOpConst("S", "TestFilterPushDown_VarLqConstTimestamp",
+ dbcommon::TIMESTAMP_LESS_THAN_TIMESTAMP, dbcommon::TIMESTAMPID,
+ "2018-01-11 15:16:01.123");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstSmallint) {
+ testVarOpConst("h", "TestFilterPushDown_VarGqConstSmallint",
+ dbcommon::SMALLINT_GREATER_THAN_INT, dbcommon::SMALLINTID,
+ "30");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstInt) {
+ testVarOpConst("i", "TestFilterPushDown_VarGqConstInt",
+ dbcommon::INT_GREATER_THAN_INT, dbcommon::INTID, "30");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstBigint) {
+ testVarOpConst("l", "TestFilterPushDown_VarGqConstBigint",
+ dbcommon::BIGINT_GREATER_THAN_INT, dbcommon::BIGINTID, "30");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstDouble) {
+ testVarOpConst("d", "TestFilterPushDown_VarGqConstDouble",
+ dbcommon::DOUBLE_GREATER_THAN_DOUBLE, dbcommon::DOUBLEID,
+ "30");
+}
+
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstFloat) {
+ testVarOpConst("f", "TestFilterPushDown_VarGqConstFloat",
+ dbcommon::FLOAT_GREATER_THAN_DOUBLE, dbcommon::FLOATID, "30");
+}
+
+// Push-down case: STRING_GREATER_THAN_STRING with the literal "90". Both
+// trailing flags are passed as true; their meaning is defined by
+// testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstString) {
+  const char *caseName = "TestFilterPushDown_VarGqConstString";
+  const char *literal = "90";
+  testVarOpConst("s", caseName, dbcommon::STRING_GREATER_THAN_STRING,
+                 dbcommon::STRINGID, literal, true, true);
+}
+
+// Push-down case: INT_GREATER_THAN_INT on a DATEID value with the literal
+// "2018-01-16"; the arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstDate) {
+  const char *caseName = "TestFilterPushDown_VarGqConstDate";
+  const char *literal = "2018-01-16";
+  testVarOpConst("D", caseName, dbcommon::INT_GREATER_THAN_INT,
+                 dbcommon::DATEID, literal);
+}
+
+// Push-down case: BIGINT_GREATER_THAN_BIGINT on a TIMEID value with the
+// literal "00:00:00.9". Both trailing flags are passed as true; their meaning
+// is defined by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstTime) {
+  const char *caseName = "TestFilterPushDown_VarGqConstTime";
+  const char *literal = "00:00:00.9";
+  testVarOpConst("T", caseName, dbcommon::BIGINT_GREATER_THAN_BIGINT,
+                 dbcommon::TIMEID, literal, true, true);
+}
+
+// Push-down case: TIMESTAMP_GREATER_THAN_TIMESTAMP with the literal
+// "2018-01-16 15:16:01.123999999"; the arguments are interpreted by
+// testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGqConstTimestamp) {
+  const char *caseName = "TestFilterPushDown_VarGqConstTimestamp";
+  const char *literal = "2018-01-16 15:16:01.123999999";
+  testVarOpConst("S", caseName, dbcommon::TIMESTAMP_GREATER_THAN_TIMESTAMP,
+                 dbcommon::TIMESTAMPID, literal);
+}
+
+// Push-down case: SMALLINT_LESS_EQ_INT with the literal "0"; the arguments
+// are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstSmallint) {
+  const char *caseName = "TestFilterPushDown_VarLeConstSmallint";
+  const char *literal = "0";
+  testVarOpConst("h", caseName, dbcommon::SMALLINT_LESS_EQ_INT,
+                 dbcommon::SMALLINTID, literal);
+}
+
+// Push-down case: INT_LESS_EQ_INT with the literal "0"; the arguments are
+// interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstInt) {
+  const char *caseName = "TestFilterPushDown_VarLeConstInt";
+  const char *literal = "0";
+  testVarOpConst("i", caseName, dbcommon::INT_LESS_EQ_INT, dbcommon::INTID,
+                 literal);
+}
+
+// Push-down case: BIGINT_LESS_EQ_INT with the literal "0"; the arguments are
+// interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstBigint) {
+  const char *caseName = "TestFilterPushDown_VarLeConstBigint";
+  const char *literal = "0";
+  testVarOpConst("l", caseName, dbcommon::BIGINT_LESS_EQ_INT,
+                 dbcommon::BIGINTID, literal);
+}
+
+// Push-down case: DOUBLE_LESS_EQ_DOUBLE with the literal "0"; the arguments
+// are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstDouble) {
+  const char *caseName = "TestFilterPushDown_VarLeConstDouble";
+  const char *literal = "0";
+  testVarOpConst("d", caseName, dbcommon::DOUBLE_LESS_EQ_DOUBLE,
+                 dbcommon::DOUBLEID, literal);
+}
+
+// Push-down case: FLOAT_LESS_EQ_DOUBLE with the literal "0"; the arguments
+// are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstFloat) {
+  const char *caseName = "TestFilterPushDown_VarLeConstFloat";
+  const char *literal = "0";
+  testVarOpConst("f", caseName, dbcommon::FLOAT_LESS_EQ_DOUBLE,
+                 dbcommon::FLOATID, literal);
+}
+
+// Push-down case: STRING_LESS_EQ_STRING with the literal "0"; the arguments
+// are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstString) {
+  const char *caseName = "TestFilterPushDown_VarLeConstString";
+  const char *literal = "0";
+  testVarOpConst("s", caseName, dbcommon::STRING_LESS_EQ_STRING,
+                 dbcommon::STRINGID, literal);
+}
+
+// Push-down case: INT_LESS_EQ_INT on a DATEID value with the literal
+// "2018-01-10"; the arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstDate) {
+  const char *caseName = "TestFilterPushDown_VarLeConstDate";
+  const char *literal = "2018-01-10";
+  testVarOpConst("D", caseName, dbcommon::INT_LESS_EQ_INT, dbcommon::DATEID,
+                 literal);
+}
+
+// Push-down case: BIGINT_LESS_EQ_BIGINT on a TIMEID value with the literal
+// "00:00:00"; the arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstTime) {
+  const char *caseName = "TestFilterPushDown_VarLeConstTime";
+  const char *literal = "00:00:00";
+  testVarOpConst("T", caseName, dbcommon::BIGINT_LESS_EQ_BIGINT,
+                 dbcommon::TIMEID, literal);
+}
+
+// Push-down case: TIMESTAMP_LESS_EQ_TIMESTAMP with the literal
+// "2018-01-10 15:16:01.123"; the arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarLeConstTimestamp) {
+  const char *caseName = "TestFilterPushDown_VarLeConstTimestamp";
+  const char *literal = "2018-01-10 15:16:01.123";
+  testVarOpConst("S", caseName, dbcommon::TIMESTAMP_LESS_EQ_TIMESTAMP,
+                 dbcommon::TIMESTAMPID, literal);
+}
+
+// Push-down case: SMALLINT_GREATER_EQ_INT with the literal "31"; the
+// arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstSmallint) {
+  const char *caseName = "TestFilterPushDown_VarGeConstSmallint";
+  const char *literal = "31";
+  testVarOpConst("h", caseName, dbcommon::SMALLINT_GREATER_EQ_INT,
+                 dbcommon::SMALLINTID, literal);
+}
+
+// Push-down case: INT_GREATER_EQ_INT with the literal "31"; the arguments
+// are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstInt) {
+  const char *caseName = "TestFilterPushDown_VarGeConstInt";
+  const char *literal = "31";
+  testVarOpConst("i", caseName, dbcommon::INT_GREATER_EQ_INT, dbcommon::INTID,
+                 literal);
+}
+
+// Push-down case: BIGINT_GREATER_EQ_INT with the literal "31"; the arguments
+// are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstBigint) {
+  const char *caseName = "TestFilterPushDown_VarGeConstBigint";
+  const char *literal = "31";
+  testVarOpConst("l", caseName, dbcommon::BIGINT_GREATER_EQ_INT,
+                 dbcommon::BIGINTID, literal);
+}
+
+// Push-down case: DOUBLE_GREATER_EQ_DOUBLE with the literal "31"; the
+// arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstDouble) {
+  const char *caseName = "TestFilterPushDown_VarGeConstDouble";
+  const char *literal = "31";
+  testVarOpConst("d", caseName, dbcommon::DOUBLE_GREATER_EQ_DOUBLE,
+                 dbcommon::DOUBLEID, literal);
+}
+
+// Push-down case: FLOAT_GREATER_EQ_DOUBLE with the literal "31"; the
+// arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstFloat) {
+  const char *caseName = "TestFilterPushDown_VarGeConstFloat";
+  const char *literal = "31";
+  testVarOpConst("f", caseName, dbcommon::FLOAT_GREATER_EQ_DOUBLE,
+                 dbcommon::FLOATID, literal);
+}
+
+// Push-down case: STRING_GREATER_EQ_STRING with the literal "90". Both
+// trailing flags are passed as true; their meaning is defined by
+// testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstString) {
+  const char *caseName = "TestFilterPushDown_VarGeConstString";
+  const char *literal = "90";
+  testVarOpConst("s", caseName, dbcommon::STRING_GREATER_EQ_STRING,
+                 dbcommon::STRINGID, literal, true, true);
+}
+
+// Push-down case: INT_GREATER_EQ_INT on a DATEID value with the literal
+// "2018-01-17"; the arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstDate) {
+  const char *caseName = "TestFilterPushDown_VarGeConstDate";
+  const char *literal = "2018-01-17";
+  testVarOpConst("D", caseName, dbcommon::INT_GREATER_EQ_INT, dbcommon::DATEID,
+                 literal);
+}
+
+// Push-down case: BIGINT_GREATER_EQ_BIGINT on a TIMEID value with the
+// literal "00:00:00.91". Both trailing flags are passed as true; their
+// meaning is defined by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstTime) {
+  const char *caseName = "TestFilterPushDown_VarGeConstTime";
+  const char *literal = "00:00:00.91";
+  testVarOpConst("T", caseName, dbcommon::BIGINT_GREATER_EQ_BIGINT,
+                 dbcommon::TIMEID, literal, true, true);
+}
+
+// Push-down case: TIMESTAMP_GREATER_EQ_TIMESTAMP with the literal
+// "2018-01-17 15:16:01.123"; the arguments are interpreted by testVarOpConst.
+TEST(TestORCFormat, TestFilterPushDown_VarGeConstTimestamp) {
+  const char *caseName = "TestFilterPushDown_VarGeConstTimestamp";
+  const char *literal = "2018-01-17 15:16:01.123";
+  testVarOpConst("S", caseName, dbcommon::TIMESTAMP_GREATER_EQ_TIMESTAMP,
+                 dbcommon::TIMESTAMPID, literal);
+}
+
+// Function-vs-variable push-down case: builds a qual list with
+// constructFuncOpVarQualList (DOUBLE_LESS_THAN_DOUBLE, RANDOMF) and runs it
+// through generateTest on the "hildfs" pattern.
+// Currently switched off through gtest's DISABLED_ name prefix; remove the
+// prefix to run it again.
+TEST(TestORCFormat, DISABLED_TestFilterPushDown_FuncLqVar) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructFuncOpVarQualList(-1, dbcommon::DOUBLE_LESS_THAN_DOUBLE, 1, 4,
+                                 dbcommon::DOUBLEID, dbcommon::RANDOMF);
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  // Case name now matches this test (it previously said "...FuncEqVar").
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_FuncLqVar");
+}
+
+// Builds three boolean qualifiers with constructBoolQualList (no explicit
+// bool-type argument, so its default applies — presumably OR, confirm against
+// the helper), every comparison using the literal "0", then runs the scan's
+// qual list through generateTest on the "hildfs" pattern with default flags.
+TEST(TestORCFormat, TestFilterPushDown_TestOr1) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructBoolQualList(-1, dbcommon::SMALLINT_EQUAL_INT, 1, 1,
+                            dbcommon::SMALLINTID, dbcommon::SMALLINTID, "0",
+                            dbcommon::INT_EQUAL_INT, 1, 2, dbcommon::INTID,
+                            dbcommon::INTID, "0");
+  upu.constructBoolQualList(-1, dbcommon::BIGINT_EQUAL_INT, 1, 3,
+                            dbcommon::BIGINTID, dbcommon::BIGINTID, "0",
+                            dbcommon::DOUBLE_EQUAL_DOUBLE, 1, 4,
+                            dbcommon::DOUBLEID, dbcommon::DOUBLEID, "0");
+  // NOTE(review): the last operand pairs BIGINT_EQUAL_INT with STRINGID —
+  // looks like a copy-paste; confirm whether a string operator was intended.
+  upu.constructBoolQualList(-1, dbcommon::FLOAT_EQUAL_DOUBLE, 1, 5,
+                            dbcommon::FLOATID, dbcommon::FLOATID, "0",
+                            dbcommon::BIGINT_EQUAL_INT, 1, 6,
+                            dbcommon::STRINGID, dbcommon::STRINGID, "0");
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_TestOr1");
+}
+
+// Same shape as the Or1 case, but the first comparison of each qualifier uses
+// the literal "1" and the second uses "0"; generateTest is called with both
+// trailing flags set to false.
+TEST(TestORCFormat, TestFilterPushDown_TestOr2) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructBoolQualList(-1, dbcommon::SMALLINT_EQUAL_INT, 1, 1,
+                            dbcommon::SMALLINTID, dbcommon::SMALLINTID, "1",
+                            dbcommon::INT_EQUAL_INT, 1, 2, dbcommon::INTID,
+                            dbcommon::INTID, "0");
+  upu.constructBoolQualList(-1, dbcommon::BIGINT_EQUAL_INT, 1, 3,
+                            dbcommon::BIGINTID, dbcommon::BIGINTID, "1",
+                            dbcommon::DOUBLE_EQUAL_DOUBLE, 1, 4,
+                            dbcommon::DOUBLEID, dbcommon::DOUBLEID, "0");
+  // NOTE(review): the last operand pairs BIGINT_EQUAL_INT with STRINGID —
+  // looks like a copy-paste; confirm whether a string operator was intended.
+  upu.constructBoolQualList(-1, dbcommon::FLOAT_EQUAL_DOUBLE, 1, 5,
+                            dbcommon::FLOATID, dbcommon::FLOATID, "1",
+                            dbcommon::BIGINT_EQUAL_INT, 1, 6,
+                            dbcommon::STRINGID, dbcommon::STRINGID, "0");
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_TestOr2", false,
+               false);
+}
+
+// Builds three boolean qualifiers with constructBoolQualList, each passing a
+// trailing 0 (presumably the AND bool type — confirm against the helper), and
+// runs the scan's qual list through generateTest with default flags.
+TEST(TestORCFormat, TestFilterPushDown_TestAnd) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructBoolQualList(-1, dbcommon::SMALLINT_EQUAL_INT, 1, 1,
+                            dbcommon::SMALLINTID, dbcommon::SMALLINTID, "1",
+                            dbcommon::INT_EQUAL_INT, 1, 2, dbcommon::INTID,
+                            dbcommon::INTID, "0", 0);
+  upu.constructBoolQualList(-1, dbcommon::BIGINT_EQUAL_INT, 1, 3,
+                            dbcommon::BIGINTID, dbcommon::BIGINTID, "1",
+                            dbcommon::DOUBLE_EQUAL_DOUBLE, 1, 4,
+                            dbcommon::DOUBLEID, dbcommon::DOUBLEID, "0", 0);
+  // NOTE(review): the last operand pairs BIGINT_EQUAL_INT with STRINGID —
+  // looks like a copy-paste; confirm whether a string operator was intended.
+  upu.constructBoolQualList(-1, dbcommon::FLOAT_EQUAL_DOUBLE, 1, 5,
+                            dbcommon::FLOATID, dbcommon::FLOATID, "0",
+                            dbcommon::BIGINT_EQUAL_INT, 1, 6,
+                            dbcommon::STRINGID, dbcommon::STRINGID, "0", 0);
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_TestAnd");
+}
+
+// Builds three boolean qualifiers with constructBoolQualList, each passing a
+// trailing 2 (presumably the NOT bool type — confirm against the helper), and
+// runs the scan's qual list through generateTest with both trailing flags
+// set to false.
+TEST(TestORCFormat, TestFilterPushDown_TestNot) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructBoolQualList(-1, dbcommon::SMALLINT_EQUAL_INT, 1, 1,
+                            dbcommon::SMALLINTID, dbcommon::SMALLINTID, "1",
+                            dbcommon::INT_EQUAL_INT, 1, 2, dbcommon::INTID,
+                            dbcommon::INTID, "0", 2);
+  upu.constructBoolQualList(-1, dbcommon::BIGINT_EQUAL_INT, 1, 3,
+                            dbcommon::BIGINTID, dbcommon::BIGINTID, "1",
+                            dbcommon::DOUBLE_EQUAL_DOUBLE, 1, 4,
+                            dbcommon::DOUBLEID, dbcommon::DOUBLEID, "0", 2);
+  // NOTE(review): the last operand pairs BIGINT_EQUAL_INT with STRINGID —
+  // looks like a copy-paste; confirm whether a string operator was intended.
+  upu.constructBoolQualList(-1, dbcommon::FLOAT_EQUAL_DOUBLE, 1, 5,
+                            dbcommon::FLOATID, dbcommon::FLOATID, "0",
+                            dbcommon::BIGINT_EQUAL_INT, 1, 6,
+                            dbcommon::STRINGID, dbcommon::STRINGID, "0", 2);
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_TestNot", false,
+               false);
+}
+
+// Adds one null-test qualifier (type 0, NULLTESTTYPE_IS_NULL) per column
+// 1..6 and runs the scan's qual list through generateTest on the "hildfs"
+// pattern with trailing flags (true, false).
+TEST(TestORCFormat, TestFilterPushDown_IsNull) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructNullTestQualList(-1, 0, 1, 1,
+                                dbcommon::SMALLINTID);  // NULLTESTTYPE_IS_NULL
+  upu.constructNullTestQualList(-1, 0, 1, 2,
+                                dbcommon::INTID);  // NULLTESTTYPE_IS_NULL
+  upu.constructNullTestQualList(-1, 0, 1, 3,
+                                dbcommon::BIGINTID);  // NULLTESTTYPE_IS_NULL
+  upu.constructNullTestQualList(-1, 0, 1, 4,
+                                dbcommon::DOUBLEID);  // NULLTESTTYPE_IS_NULL
+  upu.constructNullTestQualList(-1, 0, 1, 5,
+                                dbcommon::FLOATID);  // NULLTESTTYPE_IS_NULL
+  upu.constructNullTestQualList(-1, 0, 1, 6,
+                                dbcommon::STRINGID);  // NULLTESTTYPE_IS_NULL
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_IsNull", true,
+               false);
+}
+
+// Adds one null-test qualifier (type 1, NULLTESTTYPE_IS_NOT_NULL) per column
+// 1..6 and runs the scan's qual list through generateTest on the "hildfs"
+// pattern with both trailing flags set to false.
+TEST(TestORCFormat, TestFilterPushDown_IsNotNull) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructNullTestQualList(
+      -1, 1, 1, 1, dbcommon::SMALLINTID);  // NULLTESTTYPE_IS_NOT_NULL
+  upu.constructNullTestQualList(-1, 1, 1, 2,
+                                dbcommon::INTID);  // NULLTESTTYPE_IS_NOT_NULL
+  upu.constructNullTestQualList(
+      -1, 1, 1, 3, dbcommon::BIGINTID);  // NULLTESTTYPE_IS_NOT_NULL
+  upu.constructNullTestQualList(
+      -1, 1, 1, 4, dbcommon::DOUBLEID);  // NULLTESTTYPE_IS_NOT_NULL
+  upu.constructNullTestQualList(-1, 1, 1, 5,
+                                dbcommon::FLOATID);  // NULLTESTTYPE_IS_NOT_NULL
+  upu.constructNullTestQualList(
+      -1, 1, 1, 6, dbcommon::STRINGID);  // NULLTESTTYPE_IS_NOT_NULL
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest("hildfs", predicateExprs, "TestFilterPushDown_IsNotNull", false,
+               false);
+}
+
+// Shared driver for the "(var op const) OP (var op const)" push-down cases.
+//
+// Builds a seq-scan plan whose qual list comes from
+// constructVarOpConstThenOpVarOpConstQualList, logs the plan as JSON and hands
+// the qual list to generateTest.
+//
+// pattern              forwarded to generateTest as its first argument
+// casename             forwarded to generateTest as the case name
+// opFuncId             function id of the outer operator
+// opFuncId1/2          function ids of the two inner operators
+// varType1/2           type of each inner variable; also passed as the type of
+//                      the matching constant
+// buffer1/2            textual constants of the two inner expressions
+// shouldSkip1/2        forwarded to generateTest (they were previously
+//                      accepted but silently dropped)
+void testVarOpConstThenOpVarOpConst(
+    const std::string &pattern, const char *casename, int32_t opFuncId,
+    int32_t opFuncId1, dbcommon::TypeKind varType1, const char *buffer1,
+    int32_t opFuncId2, dbcommon::TypeKind varType2, const char *buffer2,
+    bool shouldSkip1 = false, bool shouldSkip2 = true) {
+  univplan::UnivPlanProtoUtility upu;
+  // The id returned here was never read, so it is not stored.
+  upu.univPlanSeqScanNewInstance(-1);
+  upu.constructVarOpConstThenOpVarOpConstQualList(
+      -1, opFuncId, opFuncId1, 1, 1, varType1, varType1, buffer1, opFuncId2, 1,
+      2, varType2, varType2, buffer2);
+  upu.univPlanAddToPlanNodeTest(true);
+  const univplan::UnivPlanPlanNodePoly *planNode =
+      &upu.getUnivPlan()->upb.get()->getPlanBuilderPlan()->getPlan()->plan();
+  const univplan::UnivPlanScanSeq &ss = planNode->scanseq();
+  const univplan::UnivPlanExprPolyList *predicateExprs = &ss.super().quallist();
+  LOG_INFO("plan=%s", upu.univPlanGetJsonFormatedPlan());
+  generateTest(pattern, predicateExprs, casename, shouldSkip1, shouldSkip2);
+}
+
+// Smallint case: outer SMALLINT_LESS_THAN_SMALLINT over an ADD with "29" and
+// a SUB with "1", on pattern "hh".
+TEST(TestORCFormat, TestFilterPushDown_VarOpConstThenOpVarOpConst1) {
+  const char *caseName = "TestFilterPushDown_VarOpConstThenOpVarOpConst1";
+  const char *leftConst = "29";
+  const char *rightConst = "1";
+  testVarOpConstThenOpVarOpConst(
+      "hh", caseName, dbcommon::SMALLINT_LESS_THAN_SMALLINT,
+      dbcommon::SMALLINT_ADD_SMALLINT, dbcommon::SMALLINTID, leftConst,
+      dbcommon::SMALLINT_SUB_SMALLINT, dbcommon::SMALLINTID, rightConst);
+}
+
+// Int case: outer INT_LESS_THAN_INT over a MUL with "100" and an ADD with
+// "22", on pattern "ii".
+TEST(TestORCFormat, TestFilterPushDown_VarOpConstThenOpVarOpConst2) {
+  const char *caseName = "TestFilterPushDown_VarOpConstThenOpVarOpConst2";
+  const char *leftConst = "100";
+  const char *rightConst = "22";
+  testVarOpConstThenOpVarOpConst("ii", caseName, dbcommon::INT_LESS_THAN_INT,
+                                 dbcommon::INT_MUL_INT, dbcommon::INTID,
+                                 leftConst, dbcommon::INT_ADD_INT,
+                                 dbcommon::INTID, rightConst);
+}
+
+// Bigint case: outer BIGINT_GREATER_THAN_BIGINT over a SUB with "29" and a
+// MUL with "1", on pattern "ll".
+TEST(TestORCFormat, TestFilterPushDown_VarOpConstThenOpVarOpConst3) {
+  const char *caseName = "TestFilterPushDown_VarOpConstThenOpVarOpConst3";
+  const char *leftConst = "29";
+  const char *rightConst = "1";
+  testVarOpConstThenOpVarOpConst(
+      "ll", caseName, dbcommon::BIGINT_GREATER_THAN_BIGINT,
+      dbcommon::BIGINT_SUB_BIGINT, dbcommon::BIGINTID, leftConst,
+      dbcommon::BIGINT_MUL_BIGINT, dbcommon::BIGINTID, rightConst);
+}
+
+// Double case: outer DOUBLE_GREATER_THAN_DOUBLE over a DIV with "6" and a MUL
+// with "5", on pattern "dd".
+TEST(TestORCFormat, TestFilterPushDown_VarOpConstThenOpVarOpConst4) {
+  const char *caseName = "TestFilterPushDown_VarOpConstThenOpVarOpConst4";
+  const char *leftConst = "6";
+  const char *rightConst = "5";
+  testVarOpConstThenOpVarOpConst(
+      "dd", caseName, dbcommon::DOUBLE_GREATER_THAN_DOUBLE,
+      dbcommon::DOUBLE_DIV_DOUBLE, dbcommon::DOUBLEID, leftConst,
+      dbcommon::DOUBLE_MUL_DOUBLE, dbcommon::DOUBLEID, rightConst);
+}
+
+// Float case: outer FLOAT_EQUAL_FLOAT over a DIV with "3" and an ADD with
+// "10", on pattern "ff".
+TEST(TestORCFormat, TestFilterPushDown_VarOpConstThenOpVarOpConst5) {
+  const char *caseName = "TestFilterPushDown_VarOpConstThenOpVarOpConst5";
+  const char *leftConst = "3";
+  const char *rightConst = "10";
+  testVarOpConstThenOpVarOpConst("ff", caseName, dbcommon::FLOAT_EQUAL_FLOAT,
+                                 dbcommon::FLOAT_DIV_FLOAT, dbcommon::FLOATID,
+                                 leftConst, dbcommon::FLOAT_ADD_FLOAT,
+                                 dbcommon::FLOATID, rightConst);
+}
+
+} // namespace storage
diff --git a/depends/storage/test/unit/format/test-orc-byte-rle-encoder.cc b/depends/storage/test/unit/format/test-orc-byte-rle-encoder.cc
new file mode 100644
index 0000000..35018c6
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-byte-rle-encoder.cc
@@ -0,0 +1,165 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <cstdint>
+#include <cstdlib>
+#include <cstring>
+#include <memory>
+#include <utility>
+
+#include "gtest/gtest.h"
+
+#include "storage/format/orc/byte-rle.h"
+#include "storage/format/orc/output-stream.h"
+
+namespace orc {
+
+// Size constant equal to 1024 * 1024. Nothing in this test file reads it.
+const int DEFAULT_MEM_STREAM_SIZE = 1024 * 1024; // 1M
+
+// Fills `notNull[0..numValues)` with 1 and then clears `numNulls` randomly
+// chosen entries to 0 (positions drawn with std::rand()).
+//
+// Does nothing when `numNulls` is 0 or `notNull` is nullptr. A request for
+// more nulls than there are values is capped at `numValues`; without the cap
+// the loop below could never finish, and `numValues == 0` would divide by
+// zero.
+void generateNotNull(uint64_t numValues, uint64_t numNulls, char* notNull) {
+  if (numNulls == 0 || notNull == nullptr) {
+    return;
+  }
+  std::memset(notNull, 1, numValues);
+  if (numNulls > numValues) {
+    numNulls = numValues;
+  }
+  while (numNulls > 0) {
+    uint64_t pos = static_cast<uint64_t>(std::rand()) % numValues;
+    // Only count positions that were still marked non-null.
+    if (notNull[pos]) {
+      notNull[pos] = static_cast<char>(0);
+      --numNulls;
+    }
+  }
+}
+
+// Prepares the optional null mask via generateNotNull, then writes
+// `numValues` pseudo-random bytes (std::rand() % 256) into `data`.
+void generateData(uint64_t numValues, char* data, uint64_t numNulls = 0,
+                  char* notNull = nullptr) {
+  generateNotNull(numValues, numNulls, notNull);
+  char* const end = data + numValues;
+  for (char* cur = data; cur != end; ++cur) {
+    *cur = static_cast<char>(std::rand() % 256);
+  }
+}
+
+// Prepares the optional null mask via generateNotNull, then writes
+// `numValues` pseudo-random 0/1 values (std::rand() % 2) into `data`.
+void generateBoolData(uint64_t numValues, char* data, uint64_t numNulls = 0,
+                      char* notNull = nullptr) {
+  generateNotNull(numValues, numNulls, notNull);
+  char* const end = data + numValues;
+  for (char* cur = data; cur != end; ++cur) {
+    *cur = static_cast<char>(std::rand() % 2);
+  }
+}
+
+// Decodes `numValues` bytes from the encoder's plain buffer with a byte RLE
+// decoder and expects every non-null position to equal `data`.
+//
+// outStream  stream whose plainBuffer holds the encoded bytes
+// data       values that were originally encoded
+// numValues  number of values to decode and compare
+// notNull    optional mask; positions holding 0 are not compared. nullptr
+//            means every position is compared.
+void decodeAndVerify(const std::unique_ptr<SeekableOutputStream>& outStream,
+                     char* data, uint64_t numValues, char* notNull) {
+  std::unique_ptr<SeekableInputStream> inStream(new SeekableArrayInputStream(
+      outStream->plainBuffer.data(), outStream->plainBuffer.size()));
+
+  auto decoder = createByteRleDecoder(std::move(inStream));
+
+  // Owned by unique_ptr so the scratch buffer is released even if decoding
+  // throws.
+  std::unique_ptr<char[]> decodedData(new char[numValues]);
+  decoder->next(decodedData.get(), numValues, notNull);
+
+  for (uint64_t i = 0; i < numValues; ++i) {
+    if (!notNull || notNull[i]) {
+      EXPECT_EQ(data[i], decodedData[i]);
+    }
+  }
+}
+
+// Decodes `numValues` booleans from the encoder's plain buffer with a boolean
+// RLE decoder and expects every non-null position to have the same truth
+// value as `data`.
+//
+// outStream  stream whose plainBuffer holds the encoded bytes
+// data       values that were originally encoded (compared as != 0)
+// numValues  number of values to decode and compare
+// notNull    optional mask; positions holding 0 are not compared. nullptr
+//            means every position is compared.
+void decodeAndVerifyBoolean(
+    const std::unique_ptr<SeekableOutputStream>& outStream, char* data,
+    uint64_t numValues, char* notNull) {
+  std::unique_ptr<SeekableInputStream> inStream(new SeekableArrayInputStream(
+      outStream->plainBuffer.data(), outStream->plainBuffer.size()));
+
+  auto decoder = createBooleanRleDecoder(std::move(inStream));
+
+  // Owned by unique_ptr so the scratch buffer is released even if decoding
+  // throws.
+  std::unique_ptr<char[]> decodedData(new char[numValues]);
+  decoder->next(decodedData.get(), numValues, notNull);
+
+  for (uint64_t i = 0; i < numValues; ++i) {
+    if (!notNull || notNull[i]) {
+      bool expect = data[i] != 0;
+      bool actual = decodedData[i] != 0;
+      EXPECT_EQ(expect, actual);
+    }
+  }
+}
+
+// Round-trips 102400 pseudo-random bytes (no nulls) through the byte RLE
+// encoder and verifies the decoded output.
+TEST(ByteRleEncoder, random_chars) {
+  const int numValues = 102400;
+  auto encoder = createByteRleCoder(CompressionKind_NONE);
+
+  // unique_ptr releases the buffer even if one of the calls below throws.
+  std::unique_ptr<char[]> data(new char[numValues]);
+  generateData(numValues, data.get());
+  encoder->write(data.get(), numValues, nullptr);
+  encoder->flush();
+
+  decodeAndVerify(encoder->output, data.get(), numValues, nullptr);
+}
+
+// Round-trips 102400 pseudo-random bytes, 377 of them marked null, through
+// the byte RLE encoder and verifies the non-null positions.
+TEST(ByteRleEncoder, random_chars_with_null) {
+  const int numValues = 102400;
+  const int numNulls = 377;
+  auto encoder = createByteRleCoder(CompressionKind_NONE);
+
+  // unique_ptr releases both buffers even if one of the calls below throws.
+  std::unique_ptr<char[]> notNull(new char[numValues]);
+  std::unique_ptr<char[]> data(new char[numValues]);
+  generateData(numValues, data.get(), numNulls, notNull.get());
+  encoder->write(data.get(), numValues, notNull.get());
+  encoder->flush();
+
+  decodeAndVerify(encoder->output, data.get(), numValues, notNull.get());
+}
+
+// Round-trips 1779 pseudo-random booleans (a count that is not a multiple of
+// 8) through the boolean RLE encoder and verifies the decoded output.
+TEST(BooleanRleEncoder, random_bits_not_aligned) {
+  const int numValues = 1779;
+  auto encoder = createBooleanRleEncoderImpl(CompressionKind_NONE);
+
+  // unique_ptr releases the buffer even if one of the calls below throws.
+  std::unique_ptr<char[]> data(new char[numValues]);
+  generateBoolData(numValues, data.get());
+  encoder->write(data.get(), numValues, nullptr);
+  encoder->flush();
+
+  decodeAndVerifyBoolean(encoder->output, data.get(), numValues, nullptr);
+}
+
+// Round-trips 8000 pseudo-random booleans (a multiple of 8) through the
+// boolean RLE encoder and verifies the decoded output.
+TEST(BooleanRleEncoder, random_bits_aligned) {
+  const int numValues = 8000;
+  auto encoder = createBooleanRleEncoderImpl(CompressionKind_NONE);
+
+  // unique_ptr releases the buffer even if one of the calls below throws.
+  std::unique_ptr<char[]> data(new char[numValues]);
+  generateBoolData(numValues, data.get());
+  encoder->write(data.get(), numValues, nullptr);
+  encoder->flush();
+
+  decodeAndVerifyBoolean(encoder->output, data.get(), numValues, nullptr);
+}
+
+// Round-trips 8000 pseudo-random booleans, 515 of them marked null, through
+// the boolean RLE encoder and verifies the non-null positions.
+TEST(BooleanRleEncoder, random_bits_aligned_with_null) {
+  const int numValues = 8000;
+  const int numNulls = 515;
+  auto encoder = createBooleanRleEncoderImpl(CompressionKind_NONE);
+
+  // unique_ptr releases both buffers even if one of the calls below throws.
+  std::unique_ptr<char[]> notNull(new char[numValues]);
+  std::unique_ptr<char[]> data(new char[numValues]);
+  generateBoolData(numValues, data.get(), numNulls, notNull.get());
+  encoder->write(data.get(), numValues, notNull.get());
+  encoder->flush();
+
+  decodeAndVerifyBoolean(encoder->output, data.get(), numValues,
+                         notNull.get());
+}
+} // namespace orc
diff --git a/depends/storage/test/unit/format/test-orc-byte-rle.cc b/depends/storage/test/unit/format/test-orc-byte-rle.cc
new file mode 100644
index 0000000..09ec4c8
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-byte-rle.cc
@@ -0,0 +1,1445 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include <vector>
+#include "gtest/gtest.h"
+#include "storage/format/orc/byte-rle.h"
+
+// Number of elements in a built-in array. The argument is parenthesised so
+// that an expression argument is not re-associated inside the sizeof operand.
+// Must not be used on a pointer.
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof((array)[0]))
+namespace orc {
+
+// Decodes a 6-byte stream and expects 103 output bytes: 100 zeros followed by
+// 0x44, 0x45 and 0x46.
+TEST(ByteRle, simpleTest) {
+  const unsigned char encoded[] = {0x61, 0x00, 0xfd, 0x44, 0x45, 0x46};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded))));
+  std::vector<char> decoded(103);
+  decoder->next(decoded.data(), decoded.size(), nullptr);
+
+  size_t pos = 0;
+  while (pos < 100) {
+    EXPECT_EQ(0, decoded[pos]) << "Output wrong at " << pos;
+    ++pos;
+  }
+  EXPECT_EQ(0x44, decoded[100]);
+  EXPECT_EQ(0x45, decoded[101]);
+  EXPECT_EQ(0x46, decoded[102]);
+}
+
+// Decodes 266 slots where the first 10 are flagged as null; the remaining 256
+// slots must hold the byte values 0..255 in order.
+TEST(ByteRle, nullTest) {
+  char encoded[258];
+  char present[266];
+  char decoded[266];
+  // Two groups, each a -128 header byte followed by 128 payload bytes.
+  encoded[0] = -128;
+  encoded[129] = -128;
+  for (int v = 0; v < 128; ++v) {
+    encoded[1 + v] = static_cast<char>(v);
+    encoded[130 + v] = static_cast<char>(128 + v);
+  }
+  for (int k = 0; k < 266; ++k) {
+    present[k] = static_cast<char>(k >= 10);
+  }
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, sizeof(encoded))));
+  decoder->next(decoded, sizeof(decoded), present);
+  // Slots 0..9 are null and therefore not checked.
+  for (size_t k = 10; k < sizeof(decoded); ++k) {
+    EXPECT_EQ((k - 10) & 0xff, static_cast<int>(decoded[k]) & 0xff)
+        << "Output wrong at " << k;
+  }
+}
+
+// Reads 20 bytes from a stream created with a third constructor argument of
+// 6: the first 10 outputs must be 0..9 and the next 10 must all be 16.
+TEST(ByteRle, literalCrossBuffer) {
+  const unsigned char encoded[] = {0xf6, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                   0x06, 0x07, 0x08, 0x09, 0x07, 0x10};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded), 6)));
+  std::vector<char> decoded(20);
+  decoder->next(decoded.data(), decoded.size(), nullptr);
+
+  for (size_t pos = 0; pos < 20; ++pos) {
+    if (pos < 10) {
+      EXPECT_EQ(pos, decoded[pos]) << "Output wrong at " << pos;
+    } else {
+      EXPECT_EQ(16, decoded[pos]) << "Output wrong at " << pos;
+    }
+  }
+}
+
+// Reads three bytes, skips two, then reads three more from a stream created
+// with a third constructor argument of 4; expects 0,1,2 and then 5,6,7.
+TEST(ByteRle, skipLiteralBufferUnderflowTest) {
+  const unsigned char encoded[] = {0xf8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded), 4)));
+  std::vector<char> decoded(8);
+
+  // First three values.
+  decoder->next(decoded.data(), 3, nullptr);
+  EXPECT_EQ(0x0, decoded[0]);
+  EXPECT_EQ(0x1, decoded[1]);
+  EXPECT_EQ(0x2, decoded[2]);
+
+  // Drop the values 3 and 4, then read the tail.
+  decoder->skip(2);
+  decoder->next(decoded.data(), 3, nullptr);
+  EXPECT_EQ(0x5, decoded[0]);
+  EXPECT_EQ(0x6, decoded[1]);
+  EXPECT_EQ(0x7, decoded[2]);
+}
+
+// Reads three batches of 16 bytes; batch k must consist solely of the byte
+// value -1 - k.
+TEST(ByteRle, simpleRuns) {
+  const unsigned char encoded[] = {0x0d, 0xff, 0x0d, 0xfe, 0x0d, 0xfd};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded))));
+  std::vector<char> batch(16);
+  for (size_t run = 0; run < 3; ++run) {
+    decoder->next(batch.data(), batch.size(), nullptr);
+    for (size_t k = 0; k < batch.size(); ++k) {
+      EXPECT_EQ(static_cast<char>(-1 - static_cast<int>(run)), batch[k])
+          << "Output wrong at " << (16 * run + k);
+    }
+  }
+}
+
+// Reads 35 bytes from a stream created with a third constructor argument of
+// 1: three 1s followed by the values 1..32.
+TEST(ByteRle, splitHeader) {
+  const unsigned char encoded[] = {
+      0x00, 0x01, 0xe0, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x08, 0x09,
+      0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15,
+      0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<orc::SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded), 1)));
+  std::vector<char> decoded(35);
+  decoder->next(decoded.data(), decoded.size(), nullptr);
+
+  size_t pos = 0;
+  for (; pos < 3; ++pos) {
+    EXPECT_EQ(1, decoded[pos]) << "Output wrong at " << pos;
+  }
+  for (; pos < decoded.size(); ++pos) {
+    EXPECT_EQ(pos - 2, decoded[pos]) << "Output wrong at " << pos;
+  }
+}
+
+// Consumes a stream of sixteen 2s followed by the values 1..16 in batches of
+// five, so that batches straddle the boundary between the two groups.
+TEST(ByteRle, splitRuns) {
+  const unsigned char encoded[] = {0x0d, 0x02, 0xf0, 0x01, 0x02, 0x03, 0x04,
+                                   0x05, 0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b,
+                                   0x0c, 0x0d, 0x0e, 0x0f, 0x10};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<orc::SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded))));
+  std::vector<char> batch(5);
+
+  // Outputs 0..14: all 2.
+  for (size_t round = 0; round < 3; ++round) {
+    decoder->next(batch.data(), batch.size(), nullptr);
+    for (size_t k = 0; k < batch.size(); ++k) {
+      EXPECT_EQ(2, batch[k]) << "Output wrong at "
+                             << (round * batch.size() + k);
+    }
+  }
+
+  // Outputs 15..19: one more 2, then 1..4.
+  decoder->next(batch.data(), batch.size(), nullptr);
+  EXPECT_EQ(2, batch[0]) << "Output wrong at 15";
+  for (size_t k = 1; k < batch.size(); ++k) {
+    EXPECT_EQ(k, batch[k]) << "Output wrong at " << (15 + k);
+  }
+
+  // Outputs 20..29: 5..14.
+  for (size_t round = 0; round < 2; ++round) {
+    decoder->next(batch.data(), batch.size(), nullptr);
+    for (size_t k = 0; k < batch.size(); ++k) {
+      EXPECT_EQ(5 * round + k + batch.size(), batch[k])
+          << "Output wrong at " << (20 + batch.size() * round + k);
+    }
+  }
+
+  // Outputs 30..31: 15 and 16.
+  decoder->next(batch.data(), 2, nullptr);
+  EXPECT_EQ(15, batch[0]) << "Output wrong at 30";
+  EXPECT_EQ(16, batch[1]) << "Output wrong at 31";
+}
+
+// Decodes with a mask that marks every odd slot as null. Even slots receive
+// the next decoded value; odd slots must keep their initial -1.
+TEST(ByteRle, testNulls) {
+  const unsigned char encoded[] = {0xf0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                   0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c,
+                                   0x0d, 0x0e, 0x0f, 0x3d, 0xdc};
+  std::unique_ptr<ByteRleDecoder> decoder =
+      createByteRleDecoder(std::unique_ptr<orc::SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded), 3)));
+  std::vector<char> batch(16, -1);
+  std::vector<char> present(batch.size());
+  for (size_t k = 0; k < batch.size(); ++k) {
+    present[k] = (k + 1) % 2;
+  }
+
+  // First two batches: even slots hold 0..15 in order.
+  for (size_t round = 0; round < 2; ++round) {
+    decoder->next(batch.data(), batch.size(), present.data());
+    for (size_t k = 0; k < batch.size(); ++k) {
+      if (k % 2 != 0) {
+        EXPECT_EQ(-1, batch[k]) << "Output wrong at "
+                                << (round * batch.size() + k);
+      } else {
+        EXPECT_EQ((round * batch.size() + k) / 2, batch[k])
+            << "Output wrong at " << (round * batch.size() + k);
+      }
+    }
+  }
+
+  // Remaining eight batches: even slots hold -36.
+  for (size_t round = 0; round < 8; ++round) {
+    decoder->next(batch.data(), batch.size(), present.data());
+    for (size_t k = 0; k < batch.size(); ++k) {
+      EXPECT_EQ(k % 2 == 0 ? -36 : -1, batch[k])
+          << "Output wrong at " << (round * batch.size() + k + 32);
+    }
+  }
+}
+
+// Alternates all-null and no-null batches of 16. An all-null batch must leave
+// the output buffer untouched; a no-null batch must deliver the next 16
+// decoded values (0..15 first, then -36 for four batches).
+TEST(ByteRle, testAllNulls) {
+  const unsigned char buffer[] = {0xf0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                  0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c,
+                                  0x0d, 0x0e, 0x0f, 0x3d, 0xdc};
+  std::unique_ptr<ByteRleDecoder> rle =
+      createByteRleDecoder(std::unique_ptr<orc::SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> data(16, -1);
+  std::vector<char> allNull(data.size(), 0);
+  std::vector<char> noNull(data.size(), 1);
+  rle->next(data.data(), data.size(), allNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(-1, data[i]) << "Output wrong at " << i;
+  }
+  rle->next(data.data(), data.size(), noNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(i, data[i]) << "Output wrong at " << i;
+    // Reset so the following all-null batch can be checked for no writes.
+    data[i] = -1;
+  }
+  rle->next(data.data(), data.size(), allNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(-1, data[i]) << "Output wrong at " << i;
+  }
+  for (size_t i = 0; i < 4; ++i) {
+    rle->next(data.data(), data.size(), noNull.data());
+    for (size_t j = 0; j < data.size(); ++j) {
+      // Report the failing output slot (three batches of 16 precede this
+      // loop) instead of only the batch counter.
+      EXPECT_EQ(-36, data[j]) << "Output wrong at "
+                              << (48 + i * data.size() + j);
+    }
+  }
+  // Final all-null read after the encoded data is exhausted; no assertion
+  // follows it.
+  rle->next(data.data(), data.size(), allNull.data());
+}
+
+TEST(ByteRle, testSkip) {
+ // the stream generated by Java's TestRunLengthByteReader.testSkips
+ // for (int i = 0; i < 2048; ++i) {
+ // if (i < 1024) {
+ // out.write(i / 16);
+ // } else {
+ // out.write(i % 256);
+ // }
+ // }
+ const unsigned char buffer[] = {
+ 0xd, 0x0, 0xd, 0x1, 0xd, 0x2, 0xd, 0x3, 0xd, 0x4, 0xd, 0x5,
+ 0xd, 0x6, 0xd, 0x7, 0xd, 0x8, 0xd, 0x9, 0xd, 0xa, 0xd, 0xb,
+ 0xd, 0xc, 0xd, 0xd, 0xd, 0xe, 0xd, 0xf, 0xd, 0x10, 0xd, 0x11,
+ 0xd, 0x12, 0xd, 0x13, 0xd, 0x14, 0xd, 0x15, 0xd, 0x16, 0xd, 0x17,
+ 0xd, 0x18, 0xd, 0x19, 0xd, 0x1a, 0xd, 0x1b, 0xd, 0x1c, 0xd, 0x1d,
+ 0xd, 0x1e, 0xd, 0x1f, 0xd, 0x20, 0xd, 0x21, 0xd, 0x22, 0xd, 0x23,
+ 0xd, 0x24, 0xd, 0x25, 0xd, 0x26, 0xd, 0x27, 0xd, 0x28, 0xd, 0x29,
+ 0xd, 0x2a, 0xd, 0x2b, 0xd, 0x2c, 0xd, 0x2d, 0xd, 0x2e, 0xd, 0x2f,
+ 0xd, 0x30, 0xd, 0x31, 0xd, 0x32, 0xd, 0x33, 0xd, 0x34, 0xd, 0x35,
+ 0xd, 0x36, 0xd, 0x37, 0xd, 0x38, 0xd, 0x39, 0xd, 0x3a, 0xd, 0x3b,
+ 0xd, 0x3c, 0xd, 0x3d, 0xd, 0x3e, 0xd, 0x3f, 0x80, 0x0, 0x1, 0x2,
+ 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe,
+ 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a,
+ 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e,
+ 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a,
+ 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56,
+ 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
+ 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
+ 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
+ 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+ 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91,
+ 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d,
+ 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+ 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5,
+ 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1,
+ 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
+ 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
+ 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
+ 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
+ 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd,
+ 0xfe, 0xff, 0x80, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+ 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+ 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b,
+ 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3,
+ 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
+ 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3,
+ 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
+ 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x80, 0x0, 0x1, 0x2,
+ 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe,
+ 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a,
+ 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e,
+ 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a,
+ 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56,
+ 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
+ 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
+ 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
+ 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+ 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91,
+ 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d,
+ 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+ 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5,
+ 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1,
+ 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
+ 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
+ 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
+ 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
+ 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd,
+ 0xfe, 0xff, 0x80, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+ 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+ 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b,
+ 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3,
+ 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
+ 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3,
+ 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
+ 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ };
+ SeekableInputStream* const stream =
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer));
+ std::unique_ptr<ByteRleDecoder> rle =
+ createByteRleDecoder(std::unique_ptr<orc::SeekableInputStream>(stream));
+ std::vector<char> data(1);
+ for (size_t i = 0; i < 2048; i += 10) {
+ rle->next(data.data(), data.size(), nullptr);
+ EXPECT_EQ(static_cast<char>(i < 1024 ? i / 16 : i & 0xff), data[0])
+ << "Output wrong at " << i;
+ if (i < 2038) {
+ rle->skip(9);
+ }
+ rle->skip(0);
+ }
+}
+
+TEST(ByteRle, testSeek) {
+ // Decodes all 2048 values in order, then seeks to the recorded position of
+ // every value, from the last one back to the first, and re-checks it.
+ // positions[i] holds fileLocs[i] followed by rleLocs[i] (presumably the byte
+ // offset of a run header and an offset within that run -- TODO confirm).
+ // The stream was generated by Java's
+ // TestRunLengthByteReader.testUncompressedSeek:
+ // for (int i = 0; i < 2048; ++i) {
+ // if (i < 1024) { out.write(i / 4); } else { out.write(i % 256); }
+ // }
+ const unsigned char buffer[] = {
+ 0x1, 0x0, 0x1, 0x1, 0x1, 0x2, 0x1, 0x3, 0x1, 0x4, 0x1, 0x5,
+ 0x1, 0x6, 0x1, 0x7, 0x1, 0x8, 0x1, 0x9, 0x1, 0xa, 0x1, 0xb,
+ 0x1, 0xc, 0x1, 0xd, 0x1, 0xe, 0x1, 0xf, 0x1, 0x10, 0x1, 0x11,
+ 0x1, 0x12, 0x1, 0x13, 0x1, 0x14, 0x1, 0x15, 0x1, 0x16, 0x1, 0x17,
+ 0x1, 0x18, 0x1, 0x19, 0x1, 0x1a, 0x1, 0x1b, 0x1, 0x1c, 0x1, 0x1d,
+ 0x1, 0x1e, 0x1, 0x1f, 0x1, 0x20, 0x1, 0x21, 0x1, 0x22, 0x1, 0x23,
+ 0x1, 0x24, 0x1, 0x25, 0x1, 0x26, 0x1, 0x27, 0x1, 0x28, 0x1, 0x29,
+ 0x1, 0x2a, 0x1, 0x2b, 0x1, 0x2c, 0x1, 0x2d, 0x1, 0x2e, 0x1, 0x2f,
+ 0x1, 0x30, 0x1, 0x31, 0x1, 0x32, 0x1, 0x33, 0x1, 0x34, 0x1, 0x35,
+ 0x1, 0x36, 0x1, 0x37, 0x1, 0x38, 0x1, 0x39, 0x1, 0x3a, 0x1, 0x3b,
+ 0x1, 0x3c, 0x1, 0x3d, 0x1, 0x3e, 0x1, 0x3f, 0x1, 0x40, 0x1, 0x41,
+ 0x1, 0x42, 0x1, 0x43, 0x1, 0x44, 0x1, 0x45, 0x1, 0x46, 0x1, 0x47,
+ 0x1, 0x48, 0x1, 0x49, 0x1, 0x4a, 0x1, 0x4b, 0x1, 0x4c, 0x1, 0x4d,
+ 0x1, 0x4e, 0x1, 0x4f, 0x1, 0x50, 0x1, 0x51, 0x1, 0x52, 0x1, 0x53,
+ 0x1, 0x54, 0x1, 0x55, 0x1, 0x56, 0x1, 0x57, 0x1, 0x58, 0x1, 0x59,
+ 0x1, 0x5a, 0x1, 0x5b, 0x1, 0x5c, 0x1, 0x5d, 0x1, 0x5e, 0x1, 0x5f,
+ 0x1, 0x60, 0x1, 0x61, 0x1, 0x62, 0x1, 0x63, 0x1, 0x64, 0x1, 0x65,
+ 0x1, 0x66, 0x1, 0x67, 0x1, 0x68, 0x1, 0x69, 0x1, 0x6a, 0x1, 0x6b,
+ 0x1, 0x6c, 0x1, 0x6d, 0x1, 0x6e, 0x1, 0x6f, 0x1, 0x70, 0x1, 0x71,
+ 0x1, 0x72, 0x1, 0x73, 0x1, 0x74, 0x1, 0x75, 0x1, 0x76, 0x1, 0x77,
+ 0x1, 0x78, 0x1, 0x79, 0x1, 0x7a, 0x1, 0x7b, 0x1, 0x7c, 0x1, 0x7d,
+ 0x1, 0x7e, 0x1, 0x7f, 0x1, 0x80, 0x1, 0x81, 0x1, 0x82, 0x1, 0x83,
+ 0x1, 0x84, 0x1, 0x85, 0x1, 0x86, 0x1, 0x87, 0x1, 0x88, 0x1, 0x89,
+ 0x1, 0x8a, 0x1, 0x8b, 0x1, 0x8c, 0x1, 0x8d, 0x1, 0x8e, 0x1, 0x8f,
+ 0x1, 0x90, 0x1, 0x91, 0x1, 0x92, 0x1, 0x93, 0x1, 0x94, 0x1, 0x95,
+ 0x1, 0x96, 0x1, 0x97, 0x1, 0x98, 0x1, 0x99, 0x1, 0x9a, 0x1, 0x9b,
+ 0x1, 0x9c, 0x1, 0x9d, 0x1, 0x9e, 0x1, 0x9f, 0x1, 0xa0, 0x1, 0xa1,
+ 0x1, 0xa2, 0x1, 0xa3, 0x1, 0xa4, 0x1, 0xa5, 0x1, 0xa6, 0x1, 0xa7,
+ 0x1, 0xa8, 0x1, 0xa9, 0x1, 0xaa, 0x1, 0xab, 0x1, 0xac, 0x1, 0xad,
+ 0x1, 0xae, 0x1, 0xaf, 0x1, 0xb0, 0x1, 0xb1, 0x1, 0xb2, 0x1, 0xb3,
+ 0x1, 0xb4, 0x1, 0xb5, 0x1, 0xb6, 0x1, 0xb7, 0x1, 0xb8, 0x1, 0xb9,
+ 0x1, 0xba, 0x1, 0xbb, 0x1, 0xbc, 0x1, 0xbd, 0x1, 0xbe, 0x1, 0xbf,
+ 0x1, 0xc0, 0x1, 0xc1, 0x1, 0xc2, 0x1, 0xc3, 0x1, 0xc4, 0x1, 0xc5,
+ 0x1, 0xc6, 0x1, 0xc7, 0x1, 0xc8, 0x1, 0xc9, 0x1, 0xca, 0x1, 0xcb,
+ 0x1, 0xcc, 0x1, 0xcd, 0x1, 0xce, 0x1, 0xcf, 0x1, 0xd0, 0x1, 0xd1,
+ 0x1, 0xd2, 0x1, 0xd3, 0x1, 0xd4, 0x1, 0xd5, 0x1, 0xd6, 0x1, 0xd7,
+ 0x1, 0xd8, 0x1, 0xd9, 0x1, 0xda, 0x1, 0xdb, 0x1, 0xdc, 0x1, 0xdd,
+ 0x1, 0xde, 0x1, 0xdf, 0x1, 0xe0, 0x1, 0xe1, 0x1, 0xe2, 0x1, 0xe3,
+ 0x1, 0xe4, 0x1, 0xe5, 0x1, 0xe6, 0x1, 0xe7, 0x1, 0xe8, 0x1, 0xe9,
+ 0x1, 0xea, 0x1, 0xeb, 0x1, 0xec, 0x1, 0xed, 0x1, 0xee, 0x1, 0xef,
+ 0x1, 0xf0, 0x1, 0xf1, 0x1, 0xf2, 0x1, 0xf3, 0x1, 0xf4, 0x1, 0xf5,
+ 0x1, 0xf6, 0x1, 0xf7, 0x1, 0xf8, 0x1, 0xf9, 0x1, 0xfa, 0x1, 0xfb,
+ 0x1, 0xfc, 0x1, 0xfd, 0x1, 0xfe, 0x1, 0xff, 0x80, 0x0, 0x1, 0x2,
+ 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe,
+ 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a,
+ 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e,
+ 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a,
+ 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56,
+ 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
+ 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
+ 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
+ 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+ 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91,
+ 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d,
+ 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+ 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5,
+ 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1,
+ 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
+ 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
+ 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
+ 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
+ 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd,
+ 0xfe, 0xff, 0x80, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+ 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+ 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b,
+ 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3,
+ 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
+ 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3,
+ 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
+ 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff, 0x80, 0x0, 0x1, 0x2,
+ 0x3, 0x4, 0x5, 0x6, 0x7, 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe,
+ 0xf, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a,
+ 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20, 0x21, 0x22, 0x23, 0x24, 0x25, 0x26,
+ 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c, 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32,
+ 0x33, 0x34, 0x35, 0x36, 0x37, 0x38, 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e,
+ 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a,
+ 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56,
+ 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c, 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62,
+ 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e,
+ 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a,
+ 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80, 0x80, 0x81, 0x82, 0x83, 0x84, 0x85,
+ 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91,
+ 0x92, 0x93, 0x94, 0x95, 0x96, 0x97, 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d,
+ 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3, 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9,
+ 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf, 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5,
+ 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb, 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1,
+ 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7, 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd,
+ 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3, 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9,
+ 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf, 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5,
+ 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb, 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1,
+ 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7, 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd,
+ 0xfe, 0xff, 0x80, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7, 0x8,
+ 0x9, 0xa, 0xb, 0xc, 0xd, 0xe, 0xf, 0x10, 0x11, 0x12, 0x13, 0x14,
+ 0x15, 0x16, 0x17, 0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f, 0x20,
+ 0x21, 0x22, 0x23, 0x24, 0x25, 0x26, 0x27, 0x28, 0x29, 0x2a, 0x2b, 0x2c,
+ 0x2d, 0x2e, 0x2f, 0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37, 0x38,
+ 0x39, 0x3a, 0x3b, 0x3c, 0x3d, 0x3e, 0x3f, 0x40, 0x41, 0x42, 0x43, 0x44,
+ 0x45, 0x46, 0x47, 0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f, 0x50,
+ 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57, 0x58, 0x59, 0x5a, 0x5b, 0x5c,
+ 0x5d, 0x5e, 0x5f, 0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68,
+ 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f, 0x70, 0x71, 0x72, 0x73, 0x74,
+ 0x75, 0x76, 0x77, 0x78, 0x79, 0x7a, 0x7b, 0x7c, 0x7d, 0x7e, 0x7f, 0x80,
+ 0x80, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87, 0x88, 0x89, 0x8a, 0x8b,
+ 0x8c, 0x8d, 0x8e, 0x8f, 0x90, 0x91, 0x92, 0x93, 0x94, 0x95, 0x96, 0x97,
+ 0x98, 0x99, 0x9a, 0x9b, 0x9c, 0x9d, 0x9e, 0x9f, 0xa0, 0xa1, 0xa2, 0xa3,
+ 0xa4, 0xa5, 0xa6, 0xa7, 0xa8, 0xa9, 0xaa, 0xab, 0xac, 0xad, 0xae, 0xaf,
+ 0xb0, 0xb1, 0xb2, 0xb3, 0xb4, 0xb5, 0xb6, 0xb7, 0xb8, 0xb9, 0xba, 0xbb,
+ 0xbc, 0xbd, 0xbe, 0xbf, 0xc0, 0xc1, 0xc2, 0xc3, 0xc4, 0xc5, 0xc6, 0xc7,
+ 0xc8, 0xc9, 0xca, 0xcb, 0xcc, 0xcd, 0xce, 0xcf, 0xd0, 0xd1, 0xd2, 0xd3,
+ 0xd4, 0xd5, 0xd6, 0xd7, 0xd8, 0xd9, 0xda, 0xdb, 0xdc, 0xdd, 0xde, 0xdf,
+ 0xe0, 0xe1, 0xe2, 0xe3, 0xe4, 0xe5, 0xe6, 0xe7, 0xe8, 0xe9, 0xea, 0xeb,
+ 0xec, 0xed, 0xee, 0xef, 0xf0, 0xf1, 0xf2, 0xf3, 0xf4, 0xf5, 0xf6, 0xf7,
+ 0xf8, 0xf9, 0xfa, 0xfb, 0xfc, 0xfd, 0xfe, 0xff,
+ };
+ std::unique_ptr<SeekableInputStream> stream(
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer)));
+ const uint64_t fileLocs[] = {
+ 0, 0, 0, 0, 0, 2, 2, 2, 2, 4, 4, 4,
+ 4, 6, 6, 6, 6, 8, 8, 8, 8, 10, 10, 10,
+ 10, 12, 12, 12, 12, 14, 14, 14, 14, 16, 16, 16,
+ 16, 18, 18, 18, 18, 20, 20, 20, 20, 22, 22, 22,
+ 22, 24, 24, 24, 24, 26, 26, 26, 26, 28, 28, 28,
+ 28, 30, 30, 30, 30, 32, 32, 32, 32, 34, 34, 34,
+ 34, 36, 36, 36, 36, 38, 38, 38, 38, 40, 40, 40,
+ 40, 42, 42, 42, 42, 44, 44, 44, 44, 46, 46, 46,
+ 46, 48, 48, 48, 48, 50, 50, 50, 50, 52, 52, 52,
+ 52, 54, 54, 54, 54, 56, 56, 56, 56, 58, 58, 58,
+ 58, 60, 60, 60, 60, 62, 62, 62, 62, 64, 64, 64,
+ 64, 66, 66, 66, 66, 68, 68, 68, 68, 70, 70, 70,
+ 70, 72, 72, 72, 72, 74, 74, 74, 74, 76, 76, 76,
+ 76, 78, 78, 78, 78, 80, 80, 80, 80, 82, 82, 82,
+ 82, 84, 84, 84, 84, 86, 86, 86, 86, 88, 88, 88,
+ 88, 90, 90, 90, 90, 92, 92, 92, 92, 94, 94, 94,
+ 94, 96, 96, 96, 96, 98, 98, 98, 98, 100, 100, 100,
+ 100, 102, 102, 102, 102, 104, 104, 104, 104, 106, 106, 106,
+ 106, 108, 108, 108, 108, 110, 110, 110, 110, 112, 112, 112,
+ 112, 114, 114, 114, 114, 116, 116, 116, 116, 118, 118, 118,
+ 118, 120, 120, 120, 120, 122, 122, 122, 122, 124, 124, 124,
+ 124, 126, 126, 126, 126, 128, 128, 128, 128, 130, 130, 130,
+ 130, 132, 132, 132, 132, 134, 134, 134, 134, 136, 136, 136,
+ 136, 138, 138, 138, 138, 140, 140, 140, 140, 142, 142, 142,
+ 142, 144, 144, 144, 144, 146, 146, 146, 146, 148, 148, 148,
+ 148, 150, 150, 150, 150, 152, 152, 152, 152, 154, 154, 154,
+ 154, 156, 156, 156, 156, 158, 158, 158, 158, 160, 160, 160,
+ 160, 162, 162, 162, 162, 164, 164, 164, 164, 166, 166, 166,
+ 166, 168, 168, 168, 168, 170, 170, 170, 170, 172, 172, 172,
+ 172, 174, 174, 174, 174, 176, 176, 176, 176, 178, 178, 178,
+ 178, 180, 180, 180, 180, 182, 182, 182, 182, 184, 184, 184,
+ 184, 186, 186, 186, 186, 188, 188, 188, 188, 190, 190, 190,
+ 190, 192, 192, 192, 192, 194, 194, 194, 194, 196, 196, 196,
+ 196, 198, 198, 198, 198, 200, 200, 200, 200, 202, 202, 202,
+ 202, 204, 204, 204, 204, 206, 206, 206, 206, 208, 208, 208,
+ 208, 210, 210, 210, 210, 212, 212, 212, 212, 214, 214, 214,
+ 214, 216, 216, 216, 216, 218, 218, 218, 218, 220, 220, 220,
+ 220, 222, 222, 222, 222, 224, 224, 224, 224, 226, 226, 226,
+ 226, 228, 228, 228, 228, 230, 230, 230, 230, 232, 232, 232,
+ 232, 234, 234, 234, 234, 236, 236, 236, 236, 238, 238, 238,
+ 238, 240, 240, 240, 240, 242, 242, 242, 242, 244, 244, 244,
+ 244, 246, 246, 246, 246, 248, 248, 248, 248, 250, 250, 250,
+ 250, 252, 252, 252, 252, 254, 254, 254, 254, 256, 256, 256,
+ 256, 258, 258, 258, 258, 260, 260, 260, 260, 262, 262, 262,
+ 262, 264, 264, 264, 264, 266, 266, 266, 266, 268, 268, 268,
+ 268, 270, 270, 270, 270, 272, 272, 272, 272, 274, 274, 274,
+ 274, 276, 276, 276, 276, 278, 278, 278, 278, 280, 280, 280,
+ 280, 282, 282, 282, 282, 284, 284, 284, 284, 286, 286, 286,
+ 286, 288, 288, 288, 288, 290, 290, 290, 290, 292, 292, 292,
+ 292, 294, 294, 294, 294, 296, 296, 296, 296, 298, 298, 298,
+ 298, 300, 300, 300, 300, 302, 302, 302, 302, 304, 304, 304,
+ 304, 306, 306, 306, 306, 308, 308, 308, 308, 310, 310, 310,
+ 310, 312, 312, 312, 312, 314, 314, 314, 314, 316, 316, 316,
+ 316, 318, 318, 318, 318, 320, 320, 320, 320, 322, 322, 322,
+ 322, 324, 324, 324, 324, 326, 326, 326, 326, 328, 328, 328,
+ 328, 330, 330, 330, 330, 332, 332, 332, 332, 334, 334, 334,
+ 334, 336, 336, 336, 336, 338, 338, 338, 338, 340, 340, 340,
+ 340, 342, 342, 342, 342, 344, 344, 344, 344, 346, 346, 346,
+ 346, 348, 348, 348, 348, 350, 350, 350, 350, 352, 352, 352,
+ 352, 354, 354, 354, 354, 356, 356, 356, 356, 358, 358, 358,
+ 358, 360, 360, 360, 360, 362, 362, 362, 362, 364, 364, 364,
+ 364, 366, 366, 366, 366, 368, 368, 368, 368, 370, 370, 370,
+ 370, 372, 372, 372, 372, 374, 374, 374, 374, 376, 376, 376,
+ 376, 378, 378, 378, 378, 380, 380, 380, 380, 382, 382, 382,
+ 382, 384, 384, 384, 384, 386, 386, 386, 386, 388, 388, 388,
+ 388, 390, 390, 390, 390, 392, 392, 392, 392, 394, 394, 394,
+ 394, 396, 396, 396, 396, 398, 398, 398, 398, 400, 400, 400,
+ 400, 402, 402, 402, 402, 404, 404, 404, 404, 406, 406, 406,
+ 406, 408, 408, 408, 408, 410, 410, 410, 410, 412, 412, 412,
+ 412, 414, 414, 414, 414, 416, 416, 416, 416, 418, 418, 418,
+ 418, 420, 420, 420, 420, 422, 422, 422, 422, 424, 424, 424,
+ 424, 426, 426, 426, 426, 428, 428, 428, 428, 430, 430, 430,
+ 430, 432, 432, 432, 432, 434, 434, 434, 434, 436, 436, 436,
+ 436, 438, 438, 438, 438, 440, 440, 440, 440, 442, 442, 442,
+ 442, 444, 444, 444, 444, 446, 446, 446, 446, 448, 448, 448,
+ 448, 450, 450, 450, 450, 452, 452, 452, 452, 454, 454, 454,
+ 454, 456, 456, 456, 456, 458, 458, 458, 458, 460, 460, 460,
+ 460, 462, 462, 462, 462, 464, 464, 464, 464, 466, 466, 466,
+ 466, 468, 468, 468, 468, 470, 470, 470, 470, 472, 472, 472,
+ 472, 474, 474, 474, 474, 476, 476, 476, 476, 478, 478, 478,
+ 478, 480, 480, 480, 480, 482, 482, 482, 482, 484, 484, 484,
+ 484, 486, 486, 486, 486, 488, 488, 488, 488, 490, 490, 490,
+ 490, 492, 492, 492, 492, 494, 494, 494, 494, 496, 496, 496,
+ 496, 498, 498, 498, 498, 500, 500, 500, 500, 502, 502, 502,
+ 502, 504, 504, 504, 504, 506, 506, 506, 506, 508, 508, 508,
+ 508, 510, 510, 510, 510, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512, 512,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641, 641,
+ 641, 641, 641, 641, 641, 641, 641, 641, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770, 770,
+ 770, 770, 770, 770, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899, 899,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028,
+ 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1028, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157, 1157,
+ 1157, 1157, 1157, 1157, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286, 1286,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ 1415, 1415, 1415, 1415, 1415, 1415, 1415, 1415,
+ };
+ const uint64_t rleLocs[] = {
+ 0, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1, 2,
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
+ 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
+ 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
+ 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
+ 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
+ 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+ 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+ 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
+ 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
+ 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
+ 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127,
+ };
+ std::list<uint64_t> positions[2048];
+ for (size_t i = 0; i < 2048; ++i) {
+ positions[i].push_back(fileLocs[i]);
+ positions[i].push_back(rleLocs[i]);
+ }
+ std::unique_ptr<ByteRleDecoder> rle = createByteRleDecoder(std::move(stream));
+ std::vector<char> data(1);
+ for (size_t i = 0; i < 2048; ++i) {
+ rle->next(data.data(), 1, nullptr);
+ EXPECT_EQ(static_cast<char>(i < 1024 ? i / 4 : i & 0xff), data[0])
+ << "Output wrong at " << i;
+ }
+ size_t i = 2048; // the loop below visits indices 2047 down to 0
+ do {
+ --i;
+ PositionProvider location(positions[i]);
+ rle->seek(location);
+ rle->next(data.data(), 1, nullptr);
+ EXPECT_EQ(static_cast<char>(i < 1024 ? i / 4 : i & 0xff), data[0])
+ << "Output wrong at " << i;
+ } while (i != 0);
+}
+
+// Reads 16 batches of 50 values, expecting 1 whenever bit 2 of the running
+// value index is clear, then 24 more values that alternate per byte.
+TEST(BooleanRle, simpleTest) {
+  const unsigned char buffer[] = {0x61, 0xf0, 0xfd, 0x55, 0xAA, 0x55};
+  std::unique_ptr<ByteRleDecoder> decoder = createBooleanRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> values(50);
+  for (size_t batch = 0; batch < 16; ++batch) {
+    decoder->next(values.data(), values.size(), nullptr);
+    for (size_t k = 0; k < values.size(); ++k) {
+      const int bitPosn = static_cast<int>(50 * batch + k);
+      const int want = (bitPosn & 0x4) == 0 ? 1 : 0;
+      EXPECT_EQ(want, values[k]) << "Output wrong at " << batch << ", " << k;
+    }
+  }
+  decoder->next(values.data(), 24, nullptr);
+  for (size_t idx = 0; idx < 24; ++idx) {
+    const size_t byteNo = idx / 8, bitNo = idx % 8;
+    EXPECT_EQ((byteNo % 2) == (bitNo % 2) ? 0 : 1, values[idx])
+        << "Output wrong at " << byteNo << "," << bitNo;
+  }
+}
+
+// Bulk-decodes 72 values, then seeks to {0, 0, 0} and re-reads them singly.
+TEST(BooleanRle, runsTest) {
+  const unsigned char buffer[] = {0xf7, 0xff, 0x80, 0x3f, 0xe0,
+                                  0x0f, 0xf8, 0x03, 0xfe, 0x00};
+  std::unique_ptr<ByteRleDecoder> decoder = createBooleanRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> values(72);
+  decoder->next(values.data(), values.size(), nullptr);
+  for (size_t n = 0; n < values.size(); ++n) {  // nine 1s, nine 0s, repeat
+    EXPECT_EQ(n % 18 < 9 ? 1 : 0, values[n]) << "Output wrong at " << n;
+  }
+  std::list<uint64_t> startOfStream(3, 0);
+  PositionProvider origin(startOfStream);
+  decoder->seek(origin);
+  for (size_t n = 0; n < values.size(); ++n) {
+    decoder->next(values.data(), 1, nullptr);
+    EXPECT_EQ(n % 18 < 9 ? 1 : 0, values[0]) << "Output wrong at " << n;
+  }
+}
+
+// Repeats the bulk-read / seek-back / single-read checks with an all-1 mask.
+TEST(BooleanRle, runsTestWithNull) {
+  const unsigned char buffer[] = {0xf7, 0xff, 0x80, 0x3f, 0xe0,
+                                  0x0f, 0xf8, 0x03, 0xfe, 0x00};
+  std::unique_ptr<ByteRleDecoder> decoder = createBooleanRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> values(72);
+  std::vector<char> present(values.size(), 1);
+  decoder->next(values.data(), values.size(), present.data());
+  for (size_t n = 0; n < values.size(); ++n) {  // nine 1s, nine 0s, repeat
+    EXPECT_EQ(n % 18 < 9 ? 1 : 0, values[n]) << "Output wrong at " << n;
+  }
+  std::list<uint64_t> startOfStream(3, 0);
+  PositionProvider origin(startOfStream);
+  decoder->seek(origin);
+  for (size_t n = 0; n < values.size(); ++n) {
+    decoder->next(values.data(), 1, present.data());
+    EXPECT_EQ(n % 18 < 9 ? 1 : 0, values[0]) << "Output wrong at " << n;
+  }
+}
+
+TEST(BooleanRle, skipTest) {
+  // Fixture copied from Java's TestBitFieldReader.testSkips, which wrote
+  //   for i in 0..16383: out.write(i < 8192 ? i & 1 : (i / 3) & 1)
+  // The loop below reads one value and then skips four, so only every
+  // fifth value is compared. The last skip(4) is left out because only
+  // three values remain after index 16380 has been read. skip(0) is issued
+  // every round; the reads that follow must still line up.
+  const unsigned char buffer[] = {
+      0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55,
+      0x7f, 0x55, 0x6f, 0x55, 0x80, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0x80, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0x80, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71};
+  std::unique_ptr<ByteRleDecoder> decoder = createBooleanRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> one(1);
+  for (size_t pos = 0; pos < 16384; pos += 5) {
+    decoder->next(one.data(), one.size(), nullptr);
+    const size_t want = pos < 8192 ? pos & 1 : (pos / 3) & 1;
+    EXPECT_EQ(want, one[0]) << "Output wrong at " << pos;
+    if (pos < 16379) {
+      decoder->skip(4);
+    }
+    // zero-length skip; the next round's read must still line up
+    decoder->skip(0);
+  }
+}
+
+TEST(BooleanRle, skipTestWithNulls) {
+  // Fixture copied from Java's TestBitFieldReader.testSkips, which wrote
+  //   for i in 0..16383: out.write(i < 8192 ? i & 1 : (i / 3) & 1)
+  // Every round reads three slots twice: first with a mask that flags only
+  // the middle slot, then with an all-zero mask. Slots whose mask entry is
+  // 0 must come back as 0, and the all-zero read must not consume any
+  // encoded value.
+  const unsigned char buffer[] = {
+      0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55,
+      0x7f, 0x55, 0x6f, 0x55, 0x80, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0x80, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0x80, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71};
+  std::unique_ptr<ByteRleDecoder> decoder = createBooleanRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> out(3);
+  // only the middle slot is flagged, so each masked read takes one value
+  std::vector<char> middleOnly(out.size(), 0);
+  middleOnly[1] = 1;
+  std::vector<char> noneSet(out.size(), 0);
+  for (size_t pos = 0; pos < 16384; pos += 5) {
+    out.assign(out.size(), -1);
+    decoder->next(out.data(), out.size(), middleOnly.data());
+    EXPECT_EQ(0, out[0]) << "Output wrong at " << pos;
+    EXPECT_EQ(0, out[2]) << "Output wrong at " << pos;
+    const size_t want = pos < 8192 ? pos & 1 : (pos / 3) & 1;
+    EXPECT_EQ(want, out[1]) << "Output wrong at " << pos;
+    if (pos < 16379) {
+      decoder->skip(4);
+    }
+    decoder->skip(0);
+    out.assign(out.size(), -1);
+    decoder->next(out.data(), out.size(), noneSet.data());
+    for (size_t slot = 0; slot < out.size(); ++slot) {
+      EXPECT_EQ(0, out[slot]) << "Output wrong at " << pos << ", " << slot;
+    }
+  }
+}
+
+TEST(BooleanRle, seekTest) {
+  // stream copied from Java's TestBitFieldReader.testUncompressedSeek
+  //   for i in 0..16383: out.write(i < 8192 ? i & 1 : (i / 3) & 1)
+  // The whole stream is decoded once, then every value is re-read
+  // individually (last to first) after seeking to its recorded position.
+  // Each single-value read lands in data[0], so that is the slot compared
+  // against the expected value.
+  const unsigned char buffer[] = {
+      0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55,
+      0x7f, 0x55, 0x6f, 0x55, 0x80, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0x80, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0x80, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71};
+  std::unique_ptr<SeekableInputStream> stream(
+      new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer)));
+  std::unique_ptr<ByteRleDecoder> rle =
+      createBooleanRleDecoder(std::move(stream));
+  std::vector<char> data(16384);
+  rle->next(data.data(), data.size(), nullptr);
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(i < 8192 ? i & 1 : (i / 3) & 1, data[i]) << "Output wrong at "
+                                                       << i;
+  }
+  // set up all of the positions: {stream, byte RLE, bit} for each value
+  std::list<uint64_t> positions[16384];
+  for (uint64_t i = 0; i < 16384; ++i) {
+    const uint64_t bytePosn = i / 8;
+    // add the stream position
+    positions[i].push_back(
+        bytePosn < 1025
+            ? 2 * (bytePosn / 130)
+            : (bytePosn < 1152 ? 16 : 145 + 129 * ((bytePosn - 1152) / 128)));
+    // add the byte RLE position
+    positions[i].push_back(bytePosn < 1025 ? bytePosn % 130
+                                           : (bytePosn - 1024) % 128);
+    // add the bit position
+    positions[i].push_back(i % 8);
+  }
+  size_t i = 16384;
+  do {
+    --i;
+    PositionProvider location(positions[i]);
+    rle->seek(location);
+    rle->next(data.data(), 1, nullptr);
+    EXPECT_EQ(i < 8192 ? i & 1 : (i / 3) & 1, data[0]) << "Output wrong at "
+                                                       << i;
+  } while (i != 0);
+}
+
+TEST(BooleanRle, seekTestWithNulls) {
+  // stream copied from Java's TestBitFieldReader.testUncompressedSeek
+  //   for i in 0..16383: out.write(i < 8192 ? i & 1 : (i / 3) & 1)
+  // A read with the all-zero mask must yield 0s without advancing the
+  // stream; a read with the all-one mask must yield the encoded values.
+  // After each seek the single value read lands in data[0], so data[0] is
+  // the slot compared against the expected value.
+  const unsigned char buffer[] = {
+      0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55, 0x7f, 0x55,
+      0x7f, 0x55, 0x6f, 0x55, 0x80, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0x80, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0x80, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7,
+      0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x80, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71,
+      0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x80, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c, 0x71, 0xc7, 0x1c,
+      0x71, 0xc7, 0x1c, 0x71};
+  std::unique_ptr<SeekableInputStream> stream(
+      new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer)));
+  std::unique_ptr<ByteRleDecoder> rle =
+      createBooleanRleDecoder(std::move(stream));
+  std::vector<char> data(16384);
+  std::vector<char> allNull(data.size(), 0);
+  std::vector<char> noNull(data.size(), 1);
+  rle->next(data.data(), data.size(), allNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(0, data[i]) << "Output wrong at " << i;
+  }
+  rle->next(data.data(), data.size(), noNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(i < 8192 ? i & 1 : (i / 3) & 1, data[i]) << "Output wrong at "
+                                                       << i;
+  }
+  // set up all of the positions
+  std::list<uint64_t> positions[16384];
+  for (uint64_t i = 0; i < 16384; ++i) {
+    const uint64_t bytePosn = i / 8;
+    // each entry is {stream position, byte RLE position, bit position}
+    // add the stream position
+    positions[i].push_back(
+        bytePosn < 1025
+            ? 2 * (bytePosn / 130)
+            : (bytePosn < 1152 ? 16 : 145 + 129 * ((bytePosn - 1152) / 128)));
+    // add the byte RLE position
+    positions[i].push_back(bytePosn < 1025 ? bytePosn % 130
+                                           : (bytePosn - 1024) % 128);
+    // add the bit position
+    positions[i].push_back(i % 8);
+  }
+  size_t i = 16384;
+  do {
+    --i;
+    PositionProvider location(positions[i]);
+    rle->seek(location);
+    rle->next(data.data(), 1, noNull.data());
+    EXPECT_EQ(i < 8192 ? i & 1 : (i / 3) & 1, data[0]) << "Output wrong at "
+                                                       << i;
+    data[0] = -1;
+    rle->next(data.data(), 1, allNull.data());
+    EXPECT_EQ(0, data[0]) << "Output wrong at " << i;
+  } while (i != 0);
+}
+
+TEST(BooleanRle, seekBoolAndByteRLE) {
+  // Regression test for ORC-181.
+  // The 49 booleans in `num` (1 is true and 0 is false) encode to
+  //   0xf9, 0xf0, 0xf0, 0xf7, 0x1c, 0x71, 0xc1, 0x80
+  // Counting from 0, value 21 sits at position [0, 2, 5] of that encoding
+  // and value 45 at [0, 5, 5]. After a full sequential decode, the test
+  // seeks to both positions and expects the matching element of `num`.
+  const char num[] = {1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1,
+                      1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1,
+                      1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 1};
+  const unsigned char buffer[] = {0xf9, 0xf0, 0xf0, 0xf7,
+                                  0x1c, 0x71, 0xc1, 0x80};
+
+  std::unique_ptr<ByteRleDecoder> decoder = createBooleanRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))));
+  std::vector<char> decoded(sizeof(num));
+  decoder->next(decoded.data(), decoded.size(), nullptr);
+  for (size_t n = 0; n < decoded.size(); ++n) {
+    EXPECT_EQ(num[n], decoded[n]) << "Output wrong at " << n;
+  }
+
+  std::list<uint64_t> pos21st = {0, 2, 5};
+  std::list<uint64_t> pos45th = {0, 5, 5};
+  PositionProvider posProvider21st(pos21st);
+  PositionProvider posProvider45th(pos45th);
+  char value[1];
+  decoder->seek(posProvider21st);
+  decoder->next(value, 1, nullptr);
+  EXPECT_EQ(num[21], value[0]);
+  decoder->seek(posProvider45th);
+  decoder->next(value, 1, nullptr);
+  EXPECT_EQ(num[45], value[0]);
+}
+} // namespace orc
diff --git a/depends/storage/test/unit/format/test-orc-format.cc b/depends/storage/test/unit/format/test-orc-format.cc
new file mode 100644
index 0000000..a8a8188
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-format.cc
@@ -0,0 +1,529 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "gtest/gtest.h"
+
+#include "dbcommon/common/tuple-batch.h"
+#include "dbcommon/common/tuple-desc.h"
+#include "dbcommon/filesystem/file-system-manager.h"
+#include "dbcommon/filesystem/file-system.h"
+#include "dbcommon/filesystem/local/local-file-system.h"
+#include "dbcommon/log/logger.h"
+#include "dbcommon/testutil/tuple-batch-utils.h"
+#include "dbcommon/utils/parameters.h"
+
+#include "storage/format/orc/orc-format.h"
+#include "storage/testutil/format-util.h"
+
+using namespace testing; // NOLINT
+
+namespace storage {
+
+// Passes a batch built by generateTupleBatch(*desc, 0, 1) for descriptor
+// string "thil" to writeThenReadCompare with format "orc".
+// NOTE(review): the meaning of the descriptor letters and of the (0, 1)
+// arguments is defined by TupleBatchUtility, not visible here - confirm.
+TEST(TestORCFormat, TestORCFormatReadWrite_1IntColumn1Row) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("thil");
+  dbcommon::TupleBatch::uptr rows = gen.generateTupleBatch(*schema, 0, 1);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_1IntColumn1Row";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Passes a random batch for descriptor string "il", generated with
+// arguments (666, 7, true), to writeThenReadCompare with format "orc".
+// NOTE(review): the trailing `true` presumably enables null values, as the
+// test name suggests - confirm against TupleBatchUtility.
+TEST(TestORCFormat, TestORCFormatReadWrite_NullValue) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+  dbcommon::TupleBatch::uptr rows =
+      gen.generateTupleBatchRandom(*schema, 666, 7, true);
+  const char* file = "/tmp/TestORCFormatReadWrite_NullValue";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+  LOG_INFO("OK with noavx");
+}
+
+// Runs writeThenReadCompare twice on random batches for descriptor string
+// "il": once generated with a trailing `false`, once with `true`. The log
+// lines label the runs "without nulls" and "with nulls". Both runs use the
+// same output path.
+TEST(TestORCFormat, TestORCFormatReadWrite_1IntColumnDirect) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+  const char* file = "/tmp/TestORCFormatReadWrite_1IntColumnDirect";
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchRandom(*schema, 0, 2005, false);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK without nulls");
+  }
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchRandom(*schema, 6666666, 2003, true);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK with nulls");
+  }
+}
+
+// Runs writeThenReadCompare twice on batches from
+// generateTupleBatchDuplicate: descriptor "hil" with a trailing `false`,
+// then descriptor "il" with a trailing `true`. The log lines label the
+// runs "without nulls" and "with nulls". Both runs use the same path.
+TEST(TestORCFormat, TestORCFormatReadWrite_1IntColumnRepeat) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+  const char* file = "/tmp/TestORCFormatReadWrite_1IntColumnRepeat";
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("hil");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchDuplicate(*schema, 0, 2005, false);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK without nulls");
+  }
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchDuplicate(*schema, 6666666, 2003, true);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK with nulls");
+  }
+}
+
+// Runs writeThenReadCompare twice on batches from generateTupleBatch:
+// descriptor "hil" with arguments (666, 2005, false), then descriptor "il"
+// with arguments (6666666, 2003, true). The log lines label the runs
+// "without nulls" and "with nulls". Both runs use the same path.
+TEST(TestORCFormat, TestORCFormatReadWrite_1IntColumnDelta) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+  const char* file = "/tmp/TestORCFormatReadWrite_1IntColumnDelta";
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("hil");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatch(*schema, 666, 2005, false);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK without nulls");
+  }
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatch(*schema, 6666666, 2003, true);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK with nulls");
+  }
+}
+
+// Runs writeThenReadCompare twice on batches from generateTupleBatchPatch
+// for descriptor "il" with arguments (65536, 200, flag): flag false first,
+// then true. The log lines label the runs "without nulls" and "with
+// nulls". Both runs use the same path.
+TEST(TestORCFormat, TestORCFormatReadWrite_1IntColumnPatched) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+  const char* file = "/tmp/TestORCFormatReadWrite_1IntColumnPatched";
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchPatch(*schema, 65536, 200, false);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK without nulls");
+  }
+  {
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("il");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchPatch(*schema, 65536, 200, true);
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+    LOG_INFO("OK with nulls");
+  }
+}
+
+// Passes a batch built by generateTupleBatch(*desc, 32767, 1) for
+// descriptor string "h" to writeThenReadCompare with format "orc".
+// 32767 is the largest value a signed 16-bit integer can hold.
+TEST(TestORCFormat, TestORCFormatReadWrite_1SmallIntColumnMaxValueRow) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("h");
+  dbcommon::TupleBatch::uptr rows = gen.generateTupleBatch(*schema, 32767, 1);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_1SmallIntColumnMaxValueRow";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Passes a batch built by generateTupleBatch(*desc, 0, 1) for descriptor
+// string "fd" to writeThenReadCompare with format "orc".
+TEST(TestORCFormat, TestORCFormatReadWrite_1DoubleColumn1Row) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("fd");
+  dbcommon::TupleBatch::uptr rows = gen.generateTupleBatch(*schema, 0, 1);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_1DoubleColumn1Row";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Runs writeThenReadCompare twice for descriptor string "fd", each time
+// with freshly constructed utilities: first on generateTupleBatch(0, 2003),
+// then on generateTupleBatchPatch(0, 2003, true). Both runs use the same
+// output path.
+// NOTE(review): the test name says 1024 rows but both calls pass 2003.
+TEST(TestORCFormat, TestORCFormatReadWrite_1DoubleColumn1024Row) {
+  const char* file = "/tmp/TestORCFormatReadWrite_1DoubleColumn1024Row";
+  {
+    dbcommon::TupleBatchUtility gen;
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("fd");
+    dbcommon::TupleBatch::uptr rows = gen.generateTupleBatch(*schema, 0, 2003);
+    FormatUtility checker;
+
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+  }
+  {
+    dbcommon::TupleBatchUtility gen;
+    dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("fd");
+    dbcommon::TupleBatch::uptr rows =
+        gen.generateTupleBatchPatch(*schema, 0, 2003, true);
+    FormatUtility checker;
+
+    checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+  }
+}
+
+// Passes a batch built by generateTupleBatch(*desc, 0, 1) for descriptor
+// string "s" to writeThenReadCompare with format "orc".
+TEST(TestORCFormat, TestORCFormatReadWrite_1StringColumn1Row) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("s");
+  dbcommon::TupleBatch::uptr rows = gen.generateTupleBatch(*schema, 0, 1);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_1StringColumn1Row";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Runs writeThenReadCompare twice for descriptor string "s" with the
+// option string {"dicthreshold":"1"}: first on generateTupleBatch(0, 1024),
+// then on generateTupleBatch(0, 1024, true).
+// NOTE(review): COMPRESS_BLOCK_SIZE is a static that is set to 1024 here
+// and not restored by this test.
+TEST(TestORCFormat, TestORCFormatReadWrite_1StringColumnDirectEnc1024Row) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+  orc::SeekableOutputStream::COMPRESS_BLOCK_SIZE = 1024;
+  dbcommon::TupleDesc::uptr schema;
+  dbcommon::TupleBatch::uptr rows;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_1StringColumnDirectEnc1024Row";
+  const char* options = "{\"dicthreshold\":\"1\"}";
+
+  // direct encoding
+  schema = gen.generateTupleDesc("s");
+  rows = gen.generateTupleBatch(*schema, 0, 1024);
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file,
+                               options);
+  LOG_INFO("OK without nulls");
+
+  schema = gen.generateTupleDesc("s");
+  rows = gen.generateTupleBatch(*schema, 0, 1024, true);
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file,
+                               options);
+  LOG_INFO("OK with nulls");
+}
+
+// Runs writeThenReadCompare twice for descriptor string "s" with the
+// option string {"dicthreshold":"0"}, on batches from
+// generateTupleBatchDuplicate(0, 1024, flag): flag false first, then true.
+// Each run writes to its own path.
+// NOTE(review): COMPRESS_BLOCK_SIZE is a static that is set to 1024 here
+// and not restored by this test.
+TEST(TestORCFormat, TestORCFormatReadWrite_1StringColumnDictEnc1024Row) {
+  dbcommon::TupleBatchUtility gen;
+  FormatUtility checker;
+  orc::SeekableOutputStream::COMPRESS_BLOCK_SIZE = 1024;
+  dbcommon::TupleDesc::uptr schema;
+  dbcommon::TupleBatch::uptr rows;
+  const char* options = "{\"dicthreshold\":\"0\"}";
+
+  // dictionary encoding
+  schema = gen.generateTupleDesc("s");
+  rows = gen.generateTupleBatchDuplicate(*schema, 0, 1024, false);
+  checker.writeThenReadCompare(
+      "orc", schema.get(), std::move(rows),
+      "/tmp/TestORCFormatReadWrite_1StringColumn1024Row_dictEnc_noNulls",
+      options);
+  LOG_INFO("OK without nulls");
+
+  schema = gen.generateTupleDesc("s");
+  rows = gen.generateTupleBatchDuplicate(*schema, 0, 1024, true);
+  checker.writeThenReadCompare(
+      "orc", schema.get(), std::move(rows),
+      "/tmp/TestORCFormatReadWrite_1StringColumn1024Row_dictEnc_withNulls",
+      options);
+  LOG_INFO("OK with nulls");
+}
+
+// Passes a random batch for the ten-letter descriptor string "btilhdfsvc",
+// generated with arguments (0, 2003, true), to writeThenReadCompare with
+// format "orc".
+// NOTE(review): the test name says 1024 rows but the call passes 2003.
+TEST(TestORCFormat, TestORCFormatReadWrite_MixColumn1024Row) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("btilhdfsvc");
+  dbcommon::TupleBatch::uptr rows =
+      gen.generateTupleBatchRandom(*schema, 0, 2003, true);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_MixColumn1024Row";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "i" with arguments (0, 8, 256, 8, 128); the test name labels them
+// start, step, num, nt and align.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1i_start0_step8_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1i_start0_step8_num256_nt8_align128";
+  checker.multiBlockTest("orc", "i", 0, 8, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "i" with arguments (0, 1024, 1024, 8, 128) and expects it to throw
+// dbcommon::TransactionAbortException.
+// The output path now matches this test's own name; it previously reused
+// the path of the step8_num256 test, so both tests targeted the same file.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1i_start0_step1024_num1024_nt8_align128) {  // NOLINT
+  FormatUtility fmtu;
+
+  // throw too big tuple batch exception
+  EXPECT_THROW(fmtu.multiBlockTest(
+                   "orc", "i", 0, 1024, 1024, 8, 128,
+                   "/tmp/"
+                   "TestORCFormatReadWrite_MultiBlock_1i_start0_step1024_"
+                   "num1024_nt8_align128"),
+               dbcommon::TransactionAbortException);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 8, 18, 8, 256).
+// NOTE(review): the third number is 18 although the test name and path
+// say num256 - confirm this is intentional.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step8_num256_nt8_align256) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step8_num256_nt8_align256";
+  checker.multiBlockTest("orc", "s", 0, 8, 18, 8, 256, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "i" with arguments (0, 8, 256, 8, 256).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1i_start0_step8_num256_nt8_align256) {  // NOLINT
+  FormatUtility checker;
+
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1i_start0_step8_num256_nt8_align256";
+  checker.multiBlockTest("orc", "i", 0, 8, 256, 8, 256, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "i" with arguments (0, 1, 256, 8, 128).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1i_start0_step1_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1i_start0_step1_num256_nt8_align128";
+  checker.multiBlockTest("orc", "i", 0, 1, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 1, 256, 8, 128).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step1_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step1_num256_nt8_align128";
+  checker.multiBlockTest("orc", "s", 0, 1, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 2, 256, 8, 128).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step2_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step2_num256_nt8_align128";
+  checker.multiBlockTest("orc", "s", 0, 2, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 3, 256, 8, 128).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step3_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step3_num256_nt8_align128";
+  checker.multiBlockTest("orc", "s", 0, 3, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 4, 256, 8, 128).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step4_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step4_num256_nt8_align128";
+  checker.multiBlockTest("orc", "s", 0, 4, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 5, 256, 8, 128).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step5_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step5_num256_nt8_align128";
+  checker.multiBlockTest("orc", "s", 0, 5, 256, 8, 128, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 6, 256, 8, 128) and expects it to throw
+// dbcommon::TransactionAbortException.
+// The output path now says step6, matching the test name and the step
+// argument; it previously said step9.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step6_num256_nt8_align128) {  // NOLINT
+  FormatUtility fmtu;
+  EXPECT_THROW(fmtu.multiBlockTest(
+                   "orc", "s", 0, 6, 256, 8, 128,
+                   "/tmp/"
+                   "TestORCFormatReadWrite_MultiBlock_1s_start0_step6_num256_"
+                   "nt8_align128"),
+               dbcommon::TransactionAbortException);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 7, 256, 8, 128) and expects it to throw
+// dbcommon::TransactionAbortException.
+// The output path now says step7, matching the test name and the step
+// argument; it previously said step10.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step7_num256_nt8_align128) {  // NOLINT
+  FormatUtility fmtu;
+  EXPECT_THROW(fmtu.multiBlockTest(
+                   "orc", "s", 0, 7, 256, 8, 128,
+                   "/tmp/"
+                   "TestORCFormatReadWrite_MultiBlock_1s_start0_step7_num256_"
+                   "nt8_align128"),
+               dbcommon::TransactionAbortException);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "s" with arguments (0, 13, 256, 8, 128) and expects it to throw
+// dbcommon::TransactionAbortException.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1s_start0_step13_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1s_start0_step13_num256_nt8_align128";
+  EXPECT_THROW(checker.multiBlockTest("orc", "s", 0, 13, 256, 8, 128, file),
+               dbcommon::TransactionAbortException);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "is" with arguments (0, 8, 256, 8, 128) and expects it to throw
+// dbcommon::TransactionAbortException.
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_1i1s_start0_step8_num256_nt8_align128) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "1i1s_start0_step8_num256_nt8_align128";
+  EXPECT_THROW(checker.multiBlockTest("orc", "is", 0, 8, 256, 8, 128, file),
+               dbcommon::TransactionAbortException);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "iiisss" with arguments (0, 1, 1024, 16, 512).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_3i3s_start0_step1_num1024_nt16_align512) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "3i3s_start0_step1_num1024_nt16_align512";
+  checker.multiBlockTest("orc", "iiisss", 0, 1, 1024, 16, 512, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "iiisss" with arguments (0, 1, 1024, 32, 512).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_3i3s_start0_step1_num1024_nt32_align512) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "3i3s_start0_step1_num1024_nt32_align512";
+  checker.multiBlockTest("orc", "iiisss", 0, 1, 1024, 32, 512, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "iiisss" with arguments (0, 1, 2048, 512, 2048).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_3i3s_start0_step1_num2048_nt512_align2048) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "3i3s_start0_step1_num2048_nt512_align2048";
+  checker.multiBlockTest("orc", "iiisss", 0, 1, 2048, 512, 2048, file);
+}
+
+// Disabled via the DISABLED_ name prefix. Calls multiBlockTest for type
+// string "btilhdfsvc" with arguments (0, 1, 127, 8, 512).
+TEST(
+    TestORCFormat,
+    DISABLED_TestORCFormatReadWrite_MultiBlock_btilhdfsvc_start0_step1_num127_nt8_align512) {  // NOLINT
+  FormatUtility checker;
+  const char* file =
+      "/tmp/TestORCFormatReadWrite_MultiBlock_"
+      "btilhdfsvc_start0_step1_num127_nt8_align512";
+  checker.multiBlockTest("orc", "btilhdfsvc", 0, 1, 127, 8, 512, file);
+}
+
+// Negative-path checks for the "orc" format:
+//  - a scan begun with all-null arguments returns nullptr from next();
+//  - doUpdate(nullptr) and doDelete(nullptr) throw
+//    dbcommon::TransactionAbortException;
+//  - createFormat throws the same exception when "number.tuples.per.batch"
+//    or "format.block.align.size" is set to "17".
+// NOTE(review): presumably 17 is rejected as an invalid size for both
+// parameters - the validation rule is not visible here.
+TEST(TestORCFormat, TestORCFormatReadWrite_NegativeCase) {
+  FormatUtility fu;
+
+  std::unique_ptr<Format> format = fu.createFormat("orc");
+  format->beginScan(nullptr, nullptr, nullptr, nullptr, nullptr, false);
+  EXPECT_EQ(format->next(), nullptr);
+
+  EXPECT_THROW(format->doUpdate(nullptr), dbcommon::TransactionAbortException);
+  EXPECT_THROW(format->doDelete(nullptr), dbcommon::TransactionAbortException);
+
+  // Pass the parameter objects by address; the original text here was a
+  // mis-encoded "&params1" / "&params2".
+  dbcommon::Parameters params1;
+  params1.set("number.tuples.per.batch", std::to_string(17));
+  EXPECT_THROW(fu.createFormat("orc", &params1),
+               dbcommon::TransactionAbortException);
+
+  dbcommon::Parameters params2;
+  params2.set("format.block.align.size", std::to_string(17));
+  EXPECT_THROW(fu.createFormat("orc", &params2),
+               dbcommon::TransactionAbortException);
+}
+
+// Builds a three-row batch by hand for descriptor string "B", appending
+// "t", "t" and "f" to column 0, and passes it to writeThenReadCompare with
+// format "orc".
+// NOTE(review): the second append() argument is presumably the null flag,
+// which would make the middle row null - confirm.
+TEST(TestORCFormat, TestORCFormatReadWrite_BooleanType) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("B");
+  dbcommon::TupleBatch::uptr rows(new dbcommon::TupleBatch(*schema, true));
+  dbcommon::TupleBatchWriter& cols = rows->getTupleBatchWriter();
+  cols[0]->append("t", false);
+  cols[0]->append("t", true);
+  cols[0]->append("f", false);
+  rows->incNumOfRows(3);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_BooleanType";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Builds a three-row batch by hand for descriptor string "D", appending
+// "2014-01-01", "1900-12-22" and "1960-10-12 BC" to column 0, and passes
+// it to writeThenReadCompare with format "orc".
+// NOTE(review): the second append() argument is presumably the null flag,
+// which would make the middle row null - confirm.
+TEST(TestORCFormat, TestORCFormatReadWrite_DateType) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("D");
+  dbcommon::TupleBatch::uptr rows(new dbcommon::TupleBatch(*schema, true));
+  dbcommon::TupleBatchWriter& cols = rows->getTupleBatchWriter();
+  cols[0]->append("2014-01-01", false);
+  cols[0]->append("1900-12-22", true);
+  cols[0]->append("1960-10-12 BC", false);
+  rows->incNumOfRows(3);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_DateType";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+// Passes a random batch for descriptor string "b", generated with
+// arguments (0, 2048, true), to writeThenReadCompare with format "orc".
+TEST(TestORCFormat, TestORCFormatReadWrite_Binary) {
+  dbcommon::TupleBatchUtility gen;
+  dbcommon::TupleDesc::uptr schema = gen.generateTupleDesc("b");
+  dbcommon::TupleBatch::uptr rows =
+      gen.generateTupleBatchRandom(*schema, 0, 2048, true);
+  FormatUtility checker;
+
+  const char* file = "/tmp/TestORCFormatReadWrite_Binary";
+  checker.writeThenReadCompare("orc", schema.get(), std::move(rows), file);
+}
+
+} // namespace storage
diff --git a/depends/storage/test/unit/format/test-orc-int128.cc b/depends/storage/test/unit/format/test-orc-int128.cc
new file mode 100644
index 0000000..f2043a4
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-int128.cc
@@ -0,0 +1,620 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include "gtest/gtest.h"
+#include "storage/format/orc/int128.h"
+
+namespace orc {
+
+// Smoke test: += and -= on small values, checked through toLong(),
+// toHexString(), getHighBits() and getLowBits(); a default-constructed
+// Int128 reads back as 0.
+TEST(Int128, simpleTest) {
+  Int128 lhs = 12;
+  Int128 rhs = 13;
+
+  lhs += rhs;
+  EXPECT_EQ(25, lhs.toLong());
+  EXPECT_EQ("0x00000000000000000000000000000019", lhs.toHexString());
+
+  rhs -= 1;
+  EXPECT_EQ("0x0000000000000000000000000000000c", rhs.toHexString());
+  EXPECT_EQ(12, rhs.toLong());
+  EXPECT_EQ(0, rhs.getHighBits());
+  EXPECT_EQ(12, rhs.getLowBits());
+
+  // Going below zero fills the high word with ones.
+  rhs -= 20;
+  EXPECT_EQ("0xfffffffffffffffffffffffffffffff8", rhs.toHexString());
+  EXPECT_EQ(-8, rhs.toLong());
+  EXPECT_EQ(-1, rhs.getHighBits());
+  EXPECT_EQ(static_cast<uint64_t>(-8), rhs.getLowBits());
+
+  Int128 zero;
+  EXPECT_EQ(0, zero.toLong());
+}
+
+// Exercises negate(), abs() and invert(), checking results through
+// toHexString(), toLong(), getHighBits() and getLowBits().
+TEST(Int128, testNegate) {
+ // negate() flips the sign; abs() leaves a positive value unchanged and
+ // turns a negative one positive.
+ Int128 n = -1000000000000;
+ EXPECT_EQ("0xffffffffffffffffffffff172b5af000", n.toHexString());
+ n.negate();
+ EXPECT_EQ(1000000000000, n.toLong());
+ n.abs();
+ EXPECT_EQ(1000000000000, n.toLong());
+ n.negate();
+ EXPECT_EQ(-1000000000000, n.toLong());
+ n.abs();
+ EXPECT_EQ(1000000000000, n.toLong());
+
+ // Same operations on a value built from separate high and low words.
+ Int128 big(0x12345678, 0x9abcdef0);
+ EXPECT_EQ("0x0000000012345678000000009abcdef0", big.toHexString());
+ EXPECT_EQ(305419896, big.getHighBits());
+ EXPECT_EQ(2596069104, big.getLowBits());
+ big.negate();
+ EXPECT_EQ("0xffffffffedcba987ffffffff65432110", big.toHexString());
+ EXPECT_EQ(0xffffffffedcba987, big.getHighBits());
+ EXPECT_EQ(0xffffffff65432110, big.getLowBits());
+ big.negate();
+ EXPECT_EQ("0x0000000012345678000000009abcdef0", big.toHexString());
+ // invert() is a bitwise complement: its result is one less than the
+ // negate() result above, and applying it twice restores the value.
+ big.invert();
+ EXPECT_EQ("0xffffffffedcba987ffffffff6543210f", big.toHexString());
+ big.invert();
+ EXPECT_EQ("0x0000000012345678000000009abcdef0", big.toHexString());
+}
+
+// Checks += and -= with both integer and Int128 operands, including
+// carries and borrows that cross between the low and high 64-bit words.
+TEST(Int128, testPlus) {
+  Int128 acc(0x1000, 0xfffffffffffffff0);
+  EXPECT_EQ("0x0000000000001000fffffffffffffff0", acc.toHexString());
+
+  // Adding 0x20 overflows the low word and carries into the high word.
+  acc += 0x20;
+  EXPECT_EQ("0x00000000000010010000000000000010", acc.toHexString());
+  acc -= 0x20;
+  EXPECT_EQ("0x0000000000001000fffffffffffffff0", acc.toHexString());
+  acc += Int128(2, 3);
+  EXPECT_EQ("0x0000000000001002fffffffffffffff3", acc.toHexString());
+
+  // Same checks starting from a value with a negative high word.
+  Int128 neg(static_cast<int64_t>(0xffffffffffffff00), 0x200);
+  EXPECT_EQ("0xffffffffffffff000000000000000200", neg.toHexString());
+  neg -= 0x300;
+  EXPECT_EQ("0xfffffffffffffeffffffffffffffff00", neg.toHexString());
+  neg -= 0x100;
+  EXPECT_EQ("0xfffffffffffffefffffffffffffffe00", neg.toHexString());
+  neg += 0x400;
+  EXPECT_EQ("0xffffffffffffff000000000000000200", neg.toHexString());
+  neg -= Int128(1, 2);
+  EXPECT_EQ("0xfffffffffffffeff00000000000001fe", neg.toHexString());
+}
+
+// Checks the bitwise |= and &= operators across both 64-bit words.
+TEST(Int128, testLogic) {
+  Int128 bits = Int128(0x00000000100000002, 0x0000000400000008);
+  bits |= Int128(0x0000001000000020, 0x0000004000000080);
+  EXPECT_EQ("0x00000011000000220000004400000088", bits.toHexString());
+
+  bits = Int128(0x0000111100002222, 0x0000333300004444);
+  bits &= Int128(0x0000f00000000f00, 0x000000f00000000f);
+  EXPECT_EQ("0x00001000000002000000003000000004", bits.toHexString());
+}
+
+// Exercises <<= and >>= for shift counts of 0, below 64, exactly 64,
+// between 64 and 128, and 128 or more, on positive and negative values.
+TEST(Int128, testShift) {
+ // Left shifts; bits pushed past the top are dropped.
+ Int128 n(0x123456789abcdef0, 0xfedcba9876543210);
+ EXPECT_EQ("0x123456789abcdef0fedcba9876543210", n.toHexString());
+ n <<= 0;
+ EXPECT_EQ("0x123456789abcdef0fedcba9876543210", n.toHexString());
+ n <<= 4;
+ EXPECT_EQ("0x23456789abcdef0fedcba98765432100", n.toHexString());
+ n <<= 8;
+ EXPECT_EQ("0x456789abcdef0fedcba9876543210000", n.toHexString());
+ n += 0x99;
+ EXPECT_EQ("0x456789abcdef0fedcba9876543210099", n.toHexString());
+ n <<= 64;
+ EXPECT_EQ("0xcba98765432100990000000000000000", n.toHexString());
+ n += 0x312;
+ EXPECT_EQ("0xcba98765432100990000000000000312", n.toHexString());
+ n <<= 120;
+ EXPECT_EQ("0x12000000000000000000000000000000", n.toHexString());
+ n += 0x411;
+ EXPECT_EQ("0x12000000000000000000000000000411", n.toHexString());
+ // A left shift by the full width clears the value.
+ n <<= 128;
+ EXPECT_EQ(0, n.toLong());
+
+ // Right shifts of a positive value fill with zeros.
+ n = Int128(0x123456789abcdef0, 0xfedcba9876543210);
+ EXPECT_EQ("0x123456789abcdef0fedcba9876543210", n.toHexString());
+ n >>= 0;
+ EXPECT_EQ("0x123456789abcdef0fedcba9876543210", n.toHexString());
+ n >>= 4;
+ EXPECT_EQ("0x0123456789abcdef0fedcba987654321", n.toHexString());
+ n >>= 8;
+ EXPECT_EQ("0x000123456789abcdef0fedcba9876543", n.toHexString());
+ n += Int128(0x2400000000000000, 0x0);
+ EXPECT_EQ("0x240123456789abcdef0fedcba9876543", n.toHexString());
+ n >>= 64;
+ EXPECT_EQ("0x0000000000000000240123456789abcd", n.toHexString());
+ n += Int128(0x2400000000000000, 0x0);
+ EXPECT_EQ("0x2400000000000000240123456789abcd", n.toHexString());
+ n >>= 129;
+ EXPECT_EQ(0, n.toLong());
+ // Right shifts of negative values. The expectations below are
+ // sign-filled for shift counts of 64, 68 and 129, but the `>>= 4` case
+ // expects a zero top nibble.
+ // NOTE(review): the expectations pin this asymmetry - confirm it is the
+ // intended contract of operator>>=.
+ n = Int128(static_cast<int64_t>(0xfedcba0987654321), 0x1234567890abcdef);
+ EXPECT_EQ("0xfedcba09876543211234567890abcdef", n.toHexString());
+ n >>= 64;
+ EXPECT_EQ("0xfffffffffffffffffedcba0987654321", n.toHexString());
+ n = Int128(static_cast<int64_t>(0xfedcba0987654321), 0x1234567890abcdef);
+ n >>= 129;
+ EXPECT_EQ("0xffffffffffffffffffffffffffffffff", n.toHexString());
+ n = Int128(-1, 0xffffffffffffffff);
+ n >>= 4;
+ EXPECT_EQ("0x0fffffffffffffffffffffffffffffff", n.toHexString());
+ n = Int128(-0x100, 0xffffffffffffffff);
+ n >>= 68;
+ EXPECT_EQ("0xfffffffffffffffffffffffffffffff0", n.toHexString());
+}
+
+// Exercises ==, !=, <, <=, > and >= against integers and Int128 values,
+// including operands that differ only in the high word or only in the low
+// word, negative values, and minimumValue() / maximumValue().
+TEST(Int128, testCompare) {
+ // Equality and inequality.
+ Int128 x = 123;
+ EXPECT_EQ(Int128(123), x);
+ EXPECT_EQ(true, x == 123);
+ EXPECT_EQ(true, !(x == 124));
+ EXPECT_EQ(true, !(x == -124));
+ EXPECT_EQ(true, !(x == Int128(2, 123)));
+ EXPECT_EQ(true, !(x != 123));
+ EXPECT_EQ(true, x != -123);
+ EXPECT_EQ(true, x != 124);
+ EXPECT_EQ(true, x != Int128(-1, 123));
+ // Ordering: each operator is tried with the low word one below, equal
+ // and one above, then with the high word one below and one above.
+ x = Int128(0x123, 0x456);
+ EXPECT_EQ(true, !(x < Int128(0x123, 0x455)));
+ EXPECT_EQ(true, !(x < Int128(0x123, 0x456)));
+ EXPECT_EQ(true, x < Int128(0x123, 0x457));
+ EXPECT_EQ(true, !(x < Int128(0x122, 0x456)));
+ EXPECT_EQ(true, x < Int128(0x124, 0x456));
+
+ EXPECT_EQ(true, !(x <= Int128(0x123, 0x455)));
+ EXPECT_EQ(true, x <= Int128(0x123, 0x456));
+ EXPECT_EQ(true, x <= Int128(0x123, 0x457));
+ EXPECT_EQ(true, !(x <= Int128(0x122, 0x456)));
+ EXPECT_EQ(true, x <= Int128(0x124, 0x456));
+
+ EXPECT_EQ(true, x > Int128(0x123, 0x455));
+ EXPECT_EQ(true, !(x > Int128(0x123, 0x456)));
+ EXPECT_EQ(true, !(x > Int128(0x123, 0x457)));
+ EXPECT_EQ(true, x > Int128(0x122, 0x456));
+ EXPECT_EQ(true, !(x > Int128(0x124, 0x456)));
+
+ EXPECT_EQ(true, x >= Int128(0x123, 0x455));
+ EXPECT_EQ(true, x >= Int128(0x123, 0x456));
+ EXPECT_EQ(true, !(x >= Int128(0x123, 0x457)));
+ EXPECT_EQ(true, x >= Int128(0x122, 0x456));
+ EXPECT_EQ(true, !(x >= Int128(0x124, 0x456)));
+
+ // Signed ordering and the extreme values.
+ EXPECT_EQ(true, Int128(-3) < Int128(-2));
+ EXPECT_EQ(true, Int128(-3) < Int128(0));
+ EXPECT_EQ(true, Int128(-3) < Int128(3));
+ EXPECT_EQ(true, Int128(0) < Int128(5));
+ EXPECT_EQ(true, Int128::minimumValue() < 0);
+ EXPECT_EQ(true, Int128(0) < Int128::maximumValue());
+ EXPECT_EQ(true, Int128::minimumValue() < Int128::maximumValue());
+}
+
+// Pins hash() for a handful of values. The last four cases place the same
+// 0x12345678 pattern in each 32-bit quarter of the value and all expect
+// the same hash.
+TEST(Int128, testHash) {
+ EXPECT_EQ(0, Int128().hash());
+ EXPECT_EQ(0x123, Int128(0x123).hash());
+ EXPECT_EQ(0xc3c3c3c3, Int128(0x0101010102020202, 0x4040404080808080).hash());
+ EXPECT_EQ(0x122, Int128(-0x123).hash());
+ EXPECT_EQ(0x12345678, Int128(0x1234567800000000, 0x0).hash());
+ EXPECT_EQ(0x12345678, Int128(0x12345678, 0x0).hash());
+ EXPECT_EQ(0x12345678, Int128(0x0, 0x1234567800000000).hash());
+ EXPECT_EQ(0x12345678, Int128(0x0, 0x12345678).hash());
+}
+
+// Checks fitsInLong() at the edges of the 64-bit signed range, and that
+// toLong() returns the value when it fits and throws std::runtime_error
+// when it does not.
+TEST(Int128, testFitsInLong) {
+ // Fits only when the high word is the sign extension of the low word.
+ EXPECT_EQ(true, Int128(0x0, 0x7fffffffffffffff).fitsInLong());
+ EXPECT_EQ(true, !Int128(0x0, 0x8000000000000000).fitsInLong());
+ EXPECT_EQ(true, !Int128(-1, 0x7fffffffffffffff).fitsInLong());
+ EXPECT_EQ(true, Int128(-1, 0x8000000000000000).fitsInLong());
+ EXPECT_EQ(true, !Int128(1, 0x8000000000000000).fitsInLong());
+ EXPECT_EQ(true, !Int128(1, 0x7fffffffffffffff).fitsInLong());
+ EXPECT_EQ(true, !Int128(-2, 0x8000000000000000).fitsInLong());
+ EXPECT_EQ(true, !Int128(-2, 0x7fffffffffffffff).fitsInLong());
+
+ EXPECT_EQ(0x7fffffffffffffff, Int128(0x0, 0x7fffffffffffffff).toLong());
+ EXPECT_THROW(Int128(1, 1).toLong(), std::runtime_error);
+ // The expected literal 0x8000000000000000 is the bit pattern of the most
+ // negative 64-bit value.
+ EXPECT_EQ(0x8000000000000000, Int128(-1, 0x8000000000000000).toLong());
+}
+
+// Checks *= with small signed factors, with a value that fills both
+// 64-bit words, and with an Int128 right-hand operand.
+TEST(Int128, testMultiply) {
+  Int128 product = 2;
+
+  // The sign of the product follows the signs of the factors.
+  product *= 3;
+  EXPECT_EQ(6, product.toLong());
+  product *= -4;
+  EXPECT_EQ(-24, product.toLong());
+  product *= 5;
+  EXPECT_EQ(-120, product.toLong());
+  product *= -7;
+  EXPECT_EQ(840, product.toLong());
+
+  // Doubling a value that spans both words.
+  product = Int128(0x0123456776543210, 0x1111222233334444);
+  product *= 2;
+  EXPECT_EQ(0x02468aceeca86420, product.getHighBits());
+  EXPECT_EQ(0x2222444466668888, product.getLowBits());
+
+  product = Int128(0x0534AB4C, 0x59D109ADF9892FCA);
+  product *= Int128(0, 0x9033b8c7a);
+  EXPECT_EQ("0x2eead9afd0c6e0e929c18da753113e44", product.toHexString());
+}
+
+// Checks *= with plain integer factors, including products that spill
+// from the low word into the high word and past the top bit.
+TEST(Int128, testMultiplyInt) {
+  Int128 value = 2;
+  value *= 1;
+  EXPECT_EQ(2, value.toLong());
+  value *= 2;
+  EXPECT_EQ(4, value.toLong());
+
+  value = 5;
+  value *= 6432346;
+  EXPECT_EQ(6432346 * 5, value.toLong());
+
+  // Product no longer fits in the low 64 bits.
+  value = (1LL << 62) + (3LL << 34) + 3LL;
+  value *= 96;
+  EXPECT_EQ("0x00000000000000180000048000000120", value.toHexString());
+
+  // Doubling 2^126 sets the top bit; doubling again leaves zero.
+  value = 1;
+  value <<= 126;
+  EXPECT_EQ("0x40000000000000000000000000000000", value.toHexString());
+  value *= 2;
+  EXPECT_EQ("0x80000000000000000000000000000000", value.toHexString());
+  value *= 2;
+  EXPECT_EQ("0x00000000000000000000000000000000", value.toHexString());
+}
+
+// Exercises fillInArray(array, wasNegative). In every case below the
+// return value is the number of 32-bit words written, the words appear
+// most-significant first, and the sign is reported through wasNegative
+// while the array holds the magnitude.
+TEST(Int128, testFillInArray) {
+ // All four words in use.
+ Int128 x(0x123456789abcdef0, 0x23456789abcdef01);
+ uint32_t array[4];
+ bool wasNegative;
+ EXPECT_EQ(4, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, !wasNegative);
+ EXPECT_EQ(0x12345678, array[0]);
+ EXPECT_EQ(0x9abcdef0, array[1]);
+ EXPECT_EQ(0x23456789, array[2]);
+ EXPECT_EQ(0xabcdef01, array[3]);
+
+ // Zero occupies no words.
+ x = 0;
+ EXPECT_EQ(0, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, !wasNegative);
+
+ x = 1;
+ EXPECT_EQ(1, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, !wasNegative);
+ EXPECT_EQ(1, array[0]);
+
+ // A negative input yields its magnitude with wasNegative set.
+ x = -12345;
+ EXPECT_EQ(1, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, wasNegative);
+ EXPECT_EQ(12345, array[0]);
+
+ // Values whose top bit within a word is set still use that many words.
+ x = 0x80000000;
+ EXPECT_EQ(1, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, !wasNegative);
+ EXPECT_EQ(0x80000000, array[0]);
+
+ x = Int128(0, 0x8000000000000000);
+ EXPECT_EQ(2, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, !wasNegative);
+ EXPECT_EQ(0x80000000, array[0]);
+ EXPECT_EQ(0x0, array[1]);
+
+ x = Int128(0x80000000, 0x123456789abcdef0);
+ EXPECT_EQ(3, x.fillInArray(array, wasNegative));
+ EXPECT_EQ(true, !wasNegative);
+ EXPECT_EQ(0x80000000, array[0]);
+ EXPECT_EQ(0x12345678, array[1]);
+ EXPECT_EQ(0x9abcdef0, array[2]);
+}
+
+// Declared here so the test can call it; the definition is not in this
+// file.
+int64_t fls(uint32_t x);
+
+// Checks fls(): the expectations give 0 for an input of 0 and otherwise
+// the 1-based position of the highest set bit (1 for 0x1, 32 for
+// 0x80000000).
+TEST(Int128, testFindLastSet) {
+ EXPECT_EQ(0, fls(0));
+ EXPECT_EQ(1, fls(1));
+ EXPECT_EQ(8, fls(0xff));
+ EXPECT_EQ(9, fls(0x100));
+ EXPECT_EQ(29, fls(0x12345678));
+ EXPECT_EQ(31, fls(0x40000000));
+ EXPECT_EQ(32, fls(0x80000000));
+}
+
+// Declared here so the test can call it; the definition is not in this
+// file.
+void shiftArrayLeft(uint32_t* array, int64_t length, int64_t bits);
+
+// Checks shiftArrayLeft() on arrays of 32-bit words. Per the expectations,
+// words are ordered most-significant first, bits shifted out of one word
+// move into the word before it, and words past `length` are untouched.
+TEST(Int128, testShiftArrayLeft) {
+  uint32_t words[5];
+
+  // make sure nothing blows up: a null array with zero length
+  words[0] = 0x12345678;
+  shiftArrayLeft(nullptr, 0, 30);
+  EXPECT_EQ(0x12345678, words[0]);
+
+  // A zero-bit shift is a no-op.
+  words[0] = 0x12345678;
+  shiftArrayLeft(words, 1, 0);
+  EXPECT_EQ(0x12345678, words[0]);
+
+  // Length 1: only the first word changes.
+  words[0] = 0x12345678;
+  words[1] = 0x9abcdef0;
+  shiftArrayLeft(words, 1, 3);
+  EXPECT_EQ(0x91a2b3c0, words[0]);
+  EXPECT_EQ(0x9abcdef0, words[1]);
+
+  words[0] = 0x12345678;
+  words[1] = 0x9abcdeff;
+  words[2] = 0xfedcba98;
+  words[3] = 0x76543210;
+  shiftArrayLeft(words, 4, 4);
+  EXPECT_EQ(0x23456789, words[0]);
+  EXPECT_EQ(0xabcdefff, words[1]);
+  EXPECT_EQ(0xedcba987, words[2]);
+  EXPECT_EQ(0x65432100, words[3]);
+
+  // A leading zero word receives the bits shifted out of the next word.
+  words[0] = 0;
+  words[1] = 0x12345678;
+  words[2] = 0x9abcdeff;
+  words[3] = 0xfedcba98;
+  words[4] = 0x76543210;
+  shiftArrayLeft(words, 5, 8);
+  EXPECT_EQ(0x00000012, words[0]);
+  EXPECT_EQ(0x3456789a, words[1]);
+  EXPECT_EQ(0xbcdefffe, words[2]);
+  EXPECT_EQ(0xdcba9876, words[3]);
+  EXPECT_EQ(0x54321000, words[4]);
+}
+
+// Declared here so the test can call it; the definition is not in this
+// file.
+void shiftArrayRight(uint32_t* array, int64_t length, int64_t bits);
+
+// Checks shiftArrayRight() on arrays of 32-bit words. Per the
+// expectations, words are ordered most-significant first, bits shifted
+// out of one word move into the word after it, and words past `length`
+// are untouched.
+TEST(Int128, testShiftArrayRight) {
+  uint32_t words[4];
+
+  // make sure nothing blows up: a null array with zero length
+  words[0] = 0x12345678;
+  shiftArrayRight(nullptr, 0, 30);
+  EXPECT_EQ(0x12345678, words[0]);
+
+  // Length 1: only the first word changes.
+  words[0] = 0x12345678;
+  words[1] = 0x9abcdef0;
+  shiftArrayRight(words, 1, 3);
+  EXPECT_EQ(0x2468acf, words[0]);
+  EXPECT_EQ(0x9abcdef0, words[1]);
+
+  words[0] = 0x12345678;
+  words[1] = 0x9abcdeff;
+  words[2] = 0xfedcba98;
+  words[3] = 0x76543210;
+  shiftArrayRight(words, 4, 4);
+  EXPECT_EQ(0x01234567, words[0]);
+  EXPECT_EQ(0x89abcdef, words[1]);
+  EXPECT_EQ(0xffedcba9, words[2]);
+  EXPECT_EQ(0x87654321, words[3]);
+}
+
+// Declared here so the test can call it; the definition is not in this
+// file.
+void fixDivisionSigns(Int128& result, Int128& remainder,
+                      bool dividendWasNegative, bool divisorWasNegative);
+
+// Checks fixDivisionSigns() for all four sign combinations. Per the
+// expectations, the result is negated when exactly one operand was
+// negative, and the remainder is negated when the dividend was negative.
+TEST(Int128, testFixDivisionSigns) {
+  Int128 quotient = 123;
+  Int128 rest = 456;
+
+  // Both operands positive: nothing changes.
+  fixDivisionSigns(quotient, rest, false, false);
+  EXPECT_EQ(123, quotient.toLong());
+  EXPECT_EQ(456, rest.toLong());
+
+  // Only the divisor negative.
+  quotient = 123;
+  rest = 456;
+  fixDivisionSigns(quotient, rest, false, true);
+  EXPECT_EQ(-123, quotient.toLong());
+  EXPECT_EQ(456, rest.toLong());
+
+  // Only the dividend negative.
+  quotient = 123;
+  rest = 456;
+  fixDivisionSigns(quotient, rest, true, false);
+  EXPECT_EQ(-123, quotient.toLong());
+  EXPECT_EQ(-456, rest.toLong());
+
+  // Both operands negative.
+  quotient = 123;
+  rest = 456;
+  fixDivisionSigns(quotient, rest, true, true);
+  EXPECT_EQ(123, quotient.toLong());
+  EXPECT_EQ(-456, rest.toLong());
+}
+
+// Declared here so the test can call it; the definition is not in this
+// file.
+void buildFromArray(Int128& value, uint32_t* array, int64_t length);
+
+// Checks buildFromArray() for lengths 0 through 4 of most-significant-first
+// 32-bit words, and that a length of 5 throws
+// dbcommon::TransactionAbortException.
+TEST(Int128, testBuildFromArray) {
+  Int128 built;
+  uint32_t words[5] = {0x12345678, 0x9abcdef0, 0xfedcba98, 0x76543210, 0};
+
+  buildFromArray(built, words, 0);
+  EXPECT_EQ(0, built.toLong());
+
+  buildFromArray(built, words, 1);
+  EXPECT_EQ(0x12345678, built.toLong());
+
+  buildFromArray(built, words, 2);
+  EXPECT_EQ(0x123456789abcdef0, built.toLong());
+
+  buildFromArray(built, words, 3);
+  EXPECT_EQ("0x00000000123456789abcdef0fedcba98", built.toHexString());
+
+  buildFromArray(built, words, 4);
+  EXPECT_EQ("0x123456789abcdef0fedcba9876543210", built.toHexString());
+
+  // Five words do not fit in 128 bits.
+  EXPECT_THROW(buildFromArray(built, words, 5),
+               dbcommon::TransactionAbortException);
+}
+
+Int128 singleDivide(uint32_t* dividend, int64_t dividendLength,
+ uint32_t divisor, Int128& remainder,
+ bool dividendWasNegative, bool divisorWasNegative);
+// Divides a multi-word dividend by one 32-bit divisor, with sign flags.
+TEST(Int128, testSingleDivide) {
+ Int128 remainder;
+ uint32_t dividend[4];
+ // 23 / 5 with the dividend flagged negative: quotient -4, remainder -3.
+ dividend[0] = 23;
+ Int128 result = singleDivide(dividend, 1, 5, remainder, true, false);
+ EXPECT_EQ(-4, result.toLong());
+ EXPECT_EQ(-3, remainder.toLong());
+ // Every word is a multiple of 0x20, so the division is exact.
+ dividend[0] = 0x100;
+ dividend[1] = 0x120;
+ dividend[2] = 0x140;
+ dividend[3] = 0x160;
+ result = singleDivide(dividend, 4, 0x20, remainder, false, false);
+ EXPECT_EQ("0x00000008000000090000000a0000000b", result.toHexString());
+ EXPECT_EQ(0, remainder.toLong());
+ // Words that are not multiples of 0x20: leftovers carry into later words.
+ dividend[0] = 0x101;
+ dividend[1] = 0x122;
+ dividend[2] = 0x143;
+ dividend[3] = 0x164;
+ result = singleDivide(dividend, 4, 0x20, remainder, false, false);
+ EXPECT_EQ("0x00000008080000091000000a1800000b", result.toHexString());
+ EXPECT_EQ(4, remainder.toLong());
+ // Full 128-bit dividend divided by 123.
+ dividend[0] = 0x12345678;
+ dividend[1] = 0x9abcdeff;
+ dividend[2] = 0xfedcba09;
+ dividend[3] = 0x87654321;
+ result = singleDivide(dividend, 4, 123, remainder, false, false);
+ EXPECT_EQ("0x0025e390971c97aaaaa84c7077bc23ed", result.toHexString());
+ EXPECT_EQ(0x42, remainder.toLong());
+}
+
+TEST(Int128, testDivide) {
+ Int128 dividend;
+ Int128 result;
+ Int128 remainder;
+ // Divisor larger than dividend: quotient 0, remainder equals the dividend.
+ dividend = 0x12345678;
+ result = dividend.divide(0x123456789abcdef0, remainder);
+ EXPECT_EQ(0, result.toLong());
+ EXPECT_EQ(0x12345678, remainder.toLong());
+ // Division by zero is reported as std::runtime_error.
+ EXPECT_THROW(dividend.divide(0, remainder), std::runtime_error);
+ // 128-bit dividend built from (high, low) halves, divided by 123.
+ dividend = Int128(0x123456789abcdeff, 0xfedcba0987654321);
+ result = dividend.divide(123, remainder);
+ EXPECT_EQ("0x0025e390971c97aaaaa84c7077bc23ed", result.toHexString());
+ EXPECT_EQ(0x42, remainder.toLong());
+ // 128-bit dividend with a divisor that needs more than 32 bits.
+ dividend = Int128(0x111111112fffffff, 0xeeeeeeeedddddddd);
+ result = dividend.divide(0x1111111123456789, remainder);
+ EXPECT_EQ("0x000000000000000100000000beeeeef7", result.toHexString());
+ EXPECT_EQ("0x0000000000000000037d3b3d60479aae", remainder.toHexString());
+ // Both operands fit in 64 bits.
+ dividend = 1234234662345;
+ result = dividend.divide(642337, remainder);
+ EXPECT_EQ(1921475, result.toLong());
+ EXPECT_EQ(175270, remainder.toLong());
+ // 128-bit dividend with a divisor wider than 32 bits.
+ dividend = Int128(0x42395ADC0534AB4C, 0x59D109ADF9892FCA);
+ result = dividend.divide(0x1234F09DC19A, remainder);
+ EXPECT_EQ("0x000000000003a327c1348bccd2f06c27", result.toHexString());
+ EXPECT_EQ("0x000000000000000000000cacef73b954", remainder.toHexString());
+ // Divisor passed as an Int128; the division is exact (remainder 0).
+ dividend = Int128(0xfffffffffffffff, 0xf000000000000000);
+ result = dividend.divide(Int128(0, 0x1000000000000000), remainder);
+ EXPECT_EQ("0x0000000000000000ffffffffffffffff", result.toHexString());
+ EXPECT_EQ(0, remainder.toLong());
+ // Int128 divisor that leaves a non-zero remainder.
+ dividend = Int128(0x4000000000000000, 0);
+ result = dividend.divide(Int128(0, 0x400000007fffffff), remainder);
+ EXPECT_EQ("0x0000000000000000fffffffe00000007", result.toHexString());
+ EXPECT_EQ("0x00000000000000003ffffffa80000007", remainder.toHexString());
+}
+
+TEST(Int128, testToString) {  // decimal rendering and string construction
+ Int128 num = Int128(0x123456789abcdef0, 0xfedcba0987654321);
+ EXPECT_EQ("24197857203266734881846307133640229665", num.toString());
+ // High half 0, low half with its top bit set.
+ num = Int128(0, 0xab54a98ceb1f0ad2);
+ EXPECT_EQ("12345678901234567890", num.toString());
+ // Small positive value assigned from an integer.
+ num = 12345678;
+ EXPECT_EQ("12345678", num.toString());
+ // Small negative value: the sign is printed.
+ num = -1234;
+ EXPECT_EQ("-1234", num.toString());
+ // Value wider than 64 bits, printed before and after negate().
+ num = Int128(0x13f20d9c2, 0xfff89d38e1c70cb1);
+ EXPECT_EQ("98765432109876543210987654321", num.toString());
+ num.negate();
+ EXPECT_EQ("-98765432109876543210987654321", num.toString());
+ // Round trip through the string constructor: 38-digit positive value.
+ num = Int128("10000000000000000000000000000000000000");
+ EXPECT_EQ("10000000000000000000000000000000000000", num.toString());
+ // Round trip of a short negative string.
+ num = Int128("-1234");
+ EXPECT_EQ("-1234", num.toString());
+ // Round trip of a 38-digit negative string.
+ num = Int128("-12345678901122334455667788990011122233");
+ EXPECT_EQ("-12345678901122334455667788990011122233", num.toString());
+}
+
+TEST(Int128, testToDecimalString) {  // argument = digits after the point
+ Int128 num = Int128("98765432109876543210987654321098765432");
+ EXPECT_EQ("98765432109876543210987654321098765432", num.toDecimalString(0));
+ EXPECT_EQ("987654321098765432109876543210987.65432", num.toDecimalString(5));
+ num.negate();
+ EXPECT_EQ("-98765432109876543210987654321098765432", num.toDecimalString(0));
+ EXPECT_EQ("-987654321098765432109876543210987.65432", num.toDecimalString(5));
+ num = 123;  // scales at or above the digit count pad with "0." and zeros
+ EXPECT_EQ("12.3", num.toDecimalString(1));
+ EXPECT_EQ("0.123", num.toDecimalString(3));
+ EXPECT_EQ("0.0123", num.toDecimalString(4));
+ EXPECT_EQ("0.00123", num.toDecimalString(5));
+ // Same scales on a negative value: the sign precedes the leading zero.
+ num = -123;
+ EXPECT_EQ("-123", num.toDecimalString(0));
+ EXPECT_EQ("-12.3", num.toDecimalString(1));
+ EXPECT_EQ("-0.123", num.toDecimalString(3));
+ EXPECT_EQ("-0.0123", num.toDecimalString(4));
+ EXPECT_EQ("-0.00123", num.toDecimalString(5));
+}
+
+TEST(Int128, testInt128Scale) {
+ Int128 num = Int128(10);
+ bool overflow = false;
+ // Power 0 leaves the value unchanged.
+ num = scaleUpInt128ByPowerOfTen(num, 0, overflow);
+ EXPECT_FALSE(overflow);
+ EXPECT_EQ(Int128(10), num);
+ // 10 * 10^5 = 10^6.
+ num = scaleUpInt128ByPowerOfTen(num, 5, overflow);
+ EXPECT_FALSE(overflow);
+ EXPECT_EQ(Int128(1000000), num);
+ // 10^6 * 10^5 = 10^11.
+ num = scaleUpInt128ByPowerOfTen(num, 5, overflow);
+ EXPECT_FALSE(overflow);
+ EXPECT_EQ(Int128(100000000000l), num);
+ // 10^11 * 10^20 = 10^31 still fits.
+ num = scaleUpInt128ByPowerOfTen(num, 20, overflow);
+ EXPECT_FALSE(overflow);
+ EXPECT_EQ(Int128("10000000000000000000000000000000"), num);
+ // 10^31 * 10^10 does not fit: the overflow flag must be raised.
+ scaleUpInt128ByPowerOfTen(num, 10, overflow);
+ EXPECT_TRUE(overflow);
+ // Maximum value times 10^0: the flag must read false again.
+ scaleUpInt128ByPowerOfTen(Int128::maximumValue(), 0, overflow);
+ EXPECT_FALSE(overflow);
+ // Maximum value times 10 overflows.
+ scaleUpInt128ByPowerOfTen(Int128::maximumValue(), 1, overflow);
+ EXPECT_TRUE(overflow);
+ // Scaling down by 10^0 is the identity.
+ num = scaleDownInt128ByPowerOfTen(Int128(10001), 0);
+ EXPECT_EQ(Int128(10001), num);
+ // 10001 scaled down by 10^2 gives 100 (the fractional part is dropped).
+ num = scaleDownInt128ByPowerOfTen(Int128(10001), 2);
+ EXPECT_EQ(Int128(100), num);
+ // 10000 scaled down by 10^5 is below 1, so the result is 0.
+ num = scaleDownInt128ByPowerOfTen(Int128(10000), 5);
+ EXPECT_EQ(Int128(0), num);
+}
+
+} // namespace orc
diff --git a/depends/storage/test/unit/format/test-orc-proto-definition.cc b/depends/storage/test/unit/format/test-orc-proto-definition.cc
new file mode 100644
index 0000000..51f1ed3
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-proto-definition.cc
@@ -0,0 +1,351 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "dbcommon/log/exception.h"
+#include "dbcommon/log/logger.h"
+#include "gtest/gtest.h"
+#include "storage/format/orc/orc-proto-definition.h"
+#include "storage/format/orc/type-impl.h"
+#include "storage/format/orc/vector.h"
+#include "storage/testutil/file-utils.h"
+
+namespace orc {
+
+TEST(TESTOrcProtoDefinition, TestColumnStatistics) {
+ proto::ColumnStatistics stats;  // default-constructed, no field set
+ ColumnStatisticsImpl cs(stats);
+ EXPECT_EQ(cs.getNumberOfValues(), stats.numberofvalues());  // count mirrored
+}
+
+TEST(TESTOrcProtoDefinition, TestBinaryColumnStatistics) {
+ proto::ColumnStatistics stats;
+ BinaryColumnStatisticsImpl bcs(stats, true);
+ // No binary statistics in the message: there is no total length to read.
+ EXPECT_EQ(bcs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(bcs.getTotalLength(), dbcommon::TransactionAbortException);
+ EXPECT_EQ(bcs.hasTotalLength(), false);
+ // Same empty message, second constructor argument false; check bcs1 itself.
+ proto::ColumnStatistics stats1;
+ BinaryColumnStatisticsImpl bcs1(stats1, false);
+ EXPECT_EQ(bcs1.getNumberOfValues(), stats1.numberofvalues());
+ EXPECT_THROW(bcs1.getTotalLength(), dbcommon::TransactionAbortException);
+ EXPECT_EQ(bcs1.hasTotalLength(), false);
+ // Sum present and second argument true: the total length is exposed.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_binarystatistics()->set_sum(1000);
+ BinaryColumnStatisticsImpl bcs2(stats2, true);
+
+ EXPECT_EQ(bcs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(bcs2.getTotalLength(), 1000);
+ EXPECT_EQ(bcs2.hasTotalLength(), true);
+ // Sum present but second argument false: the total length is not exposed.
+ proto::ColumnStatistics stats3;
+ stats3.mutable_binarystatistics()->set_sum(2000);
+ BinaryColumnStatisticsImpl bcs3(stats3, false);
+
+ EXPECT_EQ(bcs3.getNumberOfValues(), stats3.numberofvalues());
+ EXPECT_THROW(bcs3.getTotalLength(), dbcommon::TransactionAbortException);
+ EXPECT_EQ(bcs3.hasTotalLength(), false);
+}
+
+TEST(TESTOrcProtoDefinition, TestBoolearColumnStatistics) {
+ proto::ColumnStatistics stats;
+ BooleanColumnStatisticsImpl bcs(stats, true);
+ // NOTE(review): "Boolear" in the test name looks like a typo for "Boolean".
+ EXPECT_EQ(bcs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(bcs.getFalseCount(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(bcs.getTrueCount(), dbcommon::TransactionAbortException);
+ // Empty message with the second constructor argument false: still throws.
+ proto::ColumnStatistics stats1;
+ BooleanColumnStatisticsImpl bcs1(stats1, false);
+ EXPECT_EQ(bcs1.getNumberOfValues(), stats1.numberofvalues());
+ EXPECT_THROW(bcs1.getFalseCount(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(bcs1.getTrueCount(), dbcommon::TransactionAbortException);
+ // One bucket count present and second argument true: counts are readable.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_bucketstatistics()->add_count(1000);
+ BooleanColumnStatisticsImpl bcs2(stats2, true);
+
+ EXPECT_EQ(bcs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(bcs2.getFalseCount(),
+ bcs2.getNumberOfValues() - bcs2.getTrueCount());
+ EXPECT_EQ(bcs2.getTrueCount(), 1000);
+ EXPECT_EQ(bcs2.hasCount(), true);
+ // Bucket count present but second argument false: getters throw.
+ proto::ColumnStatistics stats3;
+ stats3.mutable_bucketstatistics()->add_count(2000);
+ BooleanColumnStatisticsImpl bcs3(stats3, false);
+
+ EXPECT_EQ(bcs3.getNumberOfValues(), stats3.numberofvalues());
+ EXPECT_THROW(bcs3.getFalseCount(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(bcs3.getTrueCount(), dbcommon::TransactionAbortException);
+}
+
+TEST(TESTOrcProtoDefinition, TestDateColumnStatistics) {
+ proto::ColumnStatistics stats;
+ DateColumnStatisticsImpl dcs(stats, true);
+ // No date statistics in the message: min/max getters throw.
+ EXPECT_EQ(dcs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(dcs.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs.getMinimum(), dbcommon::TransactionAbortException);
+ // Empty message with the second constructor argument false: still throws.
+ proto::ColumnStatistics stats1;
+ DateColumnStatisticsImpl dcs1(stats1, false);
+ EXPECT_EQ(dcs1.getNumberOfValues(), stats1.numberofvalues());
+ EXPECT_THROW(dcs1.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs1.getMinimum(), dbcommon::TransactionAbortException);
+ // Min/max present and second argument true: values are readable.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_datestatistics()->set_maximum(10000);
+ stats2.mutable_datestatistics()->set_minimum(1);
+ DateColumnStatisticsImpl dcs2(stats2, true);
+
+ EXPECT_EQ(dcs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(dcs2.getMaximum(), 10000);
+ EXPECT_EQ(dcs2.getMinimum(), 1);
+ EXPECT_EQ(dcs2.hasMaximum(), true);
+ EXPECT_EQ(dcs2.hasMinimum(), true);
+ // Min/max present but second argument false: getters throw.
+ proto::ColumnStatistics stats3;
+ stats3.mutable_datestatistics()->set_maximum(20000);
+ stats3.mutable_datestatistics()->set_minimum(2);
+ DateColumnStatisticsImpl dcs3(stats3, false);
+
+ EXPECT_EQ(dcs3.getNumberOfValues(), stats3.numberofvalues());
+ EXPECT_THROW(dcs3.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs3.getMinimum(), dbcommon::TransactionAbortException);
+}
+
+TEST(TESTOrcProtoDefinition, DecimalColumnStatisticsImpl) {
+ proto::ColumnStatistics stats;
+ DecimalColumnStatisticsImpl dcs(stats, true);
+ // No decimal statistics in the message: min/max/sum getters throw.
+ EXPECT_EQ(dcs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(dcs.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs.getSum(), dbcommon::TransactionAbortException);
+ // Empty message with the second constructor argument false: still throws.
+ proto::ColumnStatistics stats1;
+ DecimalColumnStatisticsImpl dcs1(stats1, false);
+ EXPECT_EQ(dcs1.getNumberOfValues(), stats1.numberofvalues());
+ EXPECT_THROW(dcs1.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs1.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs1.getSum(), dbcommon::TransactionAbortException);
+ // Values stored as strings, second argument true: they round-trip as text.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_decimalstatistics()->set_maximum("1000.0");
+ stats2.mutable_decimalstatistics()->set_minimum("1.0");
+ stats2.mutable_decimalstatistics()->set_sum("1000000.0");
+ DecimalColumnStatisticsImpl dcs2(stats2, true);
+
+ EXPECT_EQ(dcs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(dcs2.getMaximum().toString(), "1000.0");
+ EXPECT_EQ(dcs2.getMinimum().toString(), "1.0");
+ EXPECT_EQ(dcs2.getSum().toString(), "1000000.0");
+ EXPECT_EQ(dcs2.hasMaximum(), true);
+ EXPECT_EQ(dcs2.hasMinimum(), true);
+ EXPECT_EQ(dcs2.hasSum(), true);
+ // Same values but second argument false: getters throw.
+ proto::ColumnStatistics stats3;
+ stats3.mutable_decimalstatistics()->set_maximum("1000.0");
+ stats3.mutable_decimalstatistics()->set_minimum("1.0");
+ stats3.mutable_decimalstatistics()->set_sum("1000000.0");
+ DecimalColumnStatisticsImpl dcs3(stats3, false);
+
+ EXPECT_EQ(dcs3.getNumberOfValues(), stats3.numberofvalues());
+ EXPECT_THROW(dcs3.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs3.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs3.getSum(), dbcommon::TransactionAbortException);
+}
+
+TEST(TESTOrcProtoDefinition, TestDecimal) {
+ Decimal d("1000.0");  // built from text
+ std::string s = d.toString();
+ EXPECT_EQ(s, "1000.0");  // the same text must come back, trailing ".0" kept
+}
+
+TEST(TESTOrcProtoDefinition, DoubleColumnStatisticsImpl) {
+ proto::ColumnStatistics stats;
+ DoubleColumnStatisticsImpl dcs(stats);
+ // No double statistics in the message: min/max/sum getters throw.
+ EXPECT_EQ(dcs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(dcs.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(dcs.getSum(), dbcommon::TransactionAbortException);
+ // Populated message: every value is readable and the has* flags are true.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_doublestatistics()->set_maximum(1000.0);
+ stats2.mutable_doublestatistics()->set_minimum(1.0);
+ stats2.mutable_doublestatistics()->set_sum(1000000.0);
+ DoubleColumnStatisticsImpl dcs2(stats2);
+
+ EXPECT_EQ(dcs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(dcs2.getMaximum(), 1000.0);
+ EXPECT_EQ(dcs2.getMinimum(), 1.0);
+ EXPECT_EQ(dcs2.getSum(), 1000000.0);
+ EXPECT_EQ(dcs2.hasMaximum(), true);
+ EXPECT_EQ(dcs2.hasMinimum(), true);
+ EXPECT_EQ(dcs2.hasSum(), true);
+}
+
+TEST(TESTOrcProtoDefinition, IntegerColumnStatisticsImpl) {
+ proto::ColumnStatistics stats;
+ IntegerColumnStatisticsImpl ics(stats);
+ // No integer statistics in the message: min/max/sum getters throw.
+ EXPECT_EQ(ics.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(ics.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(ics.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(ics.getSum(), dbcommon::TransactionAbortException);
+ // Populated message: every value is readable and the has* flags are true.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_intstatistics()->set_maximum(1000);
+ stats2.mutable_intstatistics()->set_minimum(1);
+ stats2.mutable_intstatistics()->set_sum(1000000);
+ IntegerColumnStatisticsImpl ics2(stats2);
+
+ EXPECT_EQ(ics2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(ics2.getMaximum(), 1000);
+ EXPECT_EQ(ics2.getMinimum(), 1);
+ EXPECT_EQ(ics2.getSum(), 1000000);
+ EXPECT_EQ(ics2.hasMaximum(), true);
+ EXPECT_EQ(ics2.hasMinimum(), true);
+ EXPECT_EQ(ics2.hasSum(), true);
+}
+
+TEST(TESTOrcProtoDefinition, StringColumnStatisticsImpl) {
+ proto::ColumnStatistics stats;
+ StringColumnStatisticsImpl scs(stats, true);
+ // No string statistics in the message: min/max/length getters throw.
+ EXPECT_EQ(scs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(scs.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(scs.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(scs.getTotalLength(), dbcommon::TransactionAbortException);
+ // Second empty message; scs1 is built from stats1, which it is checked against.
+ proto::ColumnStatistics stats1;
+ StringColumnStatisticsImpl scs1(stats1, false);
+
+ EXPECT_EQ(scs1.getNumberOfValues(), stats1.numberofvalues());
+ EXPECT_THROW(scs1.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(scs1.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(scs1.getTotalLength(), dbcommon::TransactionAbortException);
+ // Values present and second argument true: everything is readable.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_stringstatistics()->set_maximum("1000");
+ stats2.mutable_stringstatistics()->set_minimum("1");
+ stats2.mutable_stringstatistics()->set_sum(1000000);
+ StringColumnStatisticsImpl scs2(stats2, true);
+
+ EXPECT_EQ(scs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_STREQ(scs2.getMaximum(), "1000");
+ EXPECT_STREQ(scs2.getMinimum(), "1");
+ EXPECT_EQ(scs2.getTotalLength(), 1000000);
+ EXPECT_EQ(scs2.hasMaximum(), true);
+ EXPECT_EQ(scs2.hasMinimum(), true);
+ EXPECT_EQ(scs2.hasTotalLength(), true);
+ // Same values but second argument false: getters throw.
+ proto::ColumnStatistics stats3;
+ stats3.mutable_stringstatistics()->set_maximum("1000");
+ stats3.mutable_stringstatistics()->set_minimum("1");
+ stats3.mutable_stringstatistics()->set_sum(1000000);
+ StringColumnStatisticsImpl scs3(stats3, false);
+
+ EXPECT_EQ(scs3.getNumberOfValues(), stats3.numberofvalues());
+ EXPECT_THROW(scs3.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(scs3.getMinimum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(scs3.getTotalLength(), dbcommon::TransactionAbortException);
+}
+
+TEST(TESTOrcProtoDefinition, TimestampColumnStatisticsImpl) {
+ proto::ColumnStatistics stats;
+ TimestampColumnStatisticsImpl tcs(stats, true);
+ // No timestamp statistics in the message: min/max getters throw.
+ EXPECT_EQ(tcs.getNumberOfValues(), stats.numberofvalues());
+ EXPECT_THROW(tcs.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(tcs.getMinimum(), dbcommon::TransactionAbortException);
+ // Second empty message; tcs1 is built from stats1, which it is checked against.
+ proto::ColumnStatistics stats1;
+ TimestampColumnStatisticsImpl tcs1(stats1, false);
+
+ EXPECT_EQ(tcs1.getNumberOfValues(), stats1.numberofvalues());
+ EXPECT_THROW(tcs1.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(tcs1.getMinimum(), dbcommon::TransactionAbortException);
+ // Min/max present and second argument true: values are readable.
+ proto::ColumnStatistics stats2;
+ stats2.mutable_timestampstatistics()->set_maximum(1000);
+ stats2.mutable_timestampstatistics()->set_minimum(1);
+ TimestampColumnStatisticsImpl tcs2(stats2, true);
+
+ EXPECT_EQ(tcs2.getNumberOfValues(), stats2.numberofvalues());
+ EXPECT_EQ(tcs2.getMaximum(), 1000);
+ EXPECT_EQ(tcs2.getMinimum(), 1);
+ EXPECT_EQ(tcs2.hasMaximum(), true);
+ EXPECT_EQ(tcs2.hasMinimum(), true);
+ // Min/max present but second argument false: getters throw.
+ proto::ColumnStatistics stats3;
+ stats3.mutable_timestampstatistics()->set_maximum(1000);
+ stats3.mutable_timestampstatistics()->set_minimum(1);
+ TimestampColumnStatisticsImpl tcs3(stats3, false);
+
+ EXPECT_EQ(tcs3.getNumberOfValues(), stats3.numberofvalues());
+ EXPECT_THROW(tcs3.getMaximum(), dbcommon::TransactionAbortException);
+ EXPECT_THROW(tcs3.getMinimum(), dbcommon::TransactionAbortException);
+}
+
+TEST(TESTOrcProtoDefinition, StreamInformationImpl) {
+ proto::Stream stream;
+ stream.set_column(0);
+ stream.set_kind(orc::proto::Stream_Kind::Stream_Kind_DATA);
+ stream.set_length(100);
+ StreamInformationImpl si(80, stream);
+ // 80 is reported as the offset; the other fields come from the message.
+ EXPECT_EQ(si.getColumnId(), 0);
+ EXPECT_EQ(si.getKind(), StreamKind::StreamKind_DATA);
+ EXPECT_EQ(si.getLength(), 100);
+ EXPECT_EQ(si.getOffset(), 80);
+}
+
+TEST(TESTOrcProtoDefinition, StripeInformationImpl) {
+ InputStream *input = nullptr;  // never dereferenced by the getters below
+ dbcommon::MemoryPool *pool = dbcommon::getDefaultPool();
+ StripeInformationImpl si(80, 0, 1, 2, 3, input, *pool,
+ CompressionKind::CompressionKind_LZ4, 88);
+ // First five arguments: offset, index/data/footer lengths, row count.
+ EXPECT_EQ(si.getOffset(), 80);
+ EXPECT_EQ(si.getIndexLength(), 0);
+ EXPECT_EQ(si.getDataLength(), 1);
+ EXPECT_EQ(si.getFooterLength(), 2);
+ EXPECT_EQ(si.getNumberOfRows(), 3);
+}
+
+TEST(TESTOrcProtoDefinition, StatisticsImpl) {
+ proto::StripeStatistics stripeStats;
+ orc::proto::ColumnStatistics *cs = stripeStats.add_colstats();
+ cs->set_hasnull(true);
+ cs->set_numberofvalues(100);
+ // One column entry was added above, so one column must be reported.
+ StatisticsImpl si(stripeStats, true);
+
+ EXPECT_EQ(si.getNumberOfColumns(), 1);
+ EXPECT_EQ(si.getColumnStatistics(0)->getNumberOfValues(), 100);
+}
+
+} // namespace orc
diff --git a/depends/storage/test/unit/format/test-orc-rle.cc b/depends/storage/test/unit/format/test-orc-rle.cc
new file mode 100644
index 0000000..e7d3d42
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-rle.cc
@@ -0,0 +1,2863 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <iostream>
+#include <vector>
+#include "gtest/gtest.h"
+#include "storage/format/orc/rle.h"
+
+#define ARRAY_SIZE(array) (sizeof(array) / sizeof(*(array)))  // element count
+namespace orc {
+
+std::vector<int64_t> decodeRLEv2(const unsigned char* bytes, uint32_t l,
+ size_t n, size_t count,
+ const char* notNull = nullptr) {  // decode count values, n per call
+ std::unique_ptr<RleDecoder> rle =
+ createRleDecoder(std::unique_ptr<SeekableInputStream>(
+ new SeekableArrayInputStream(bytes, l)),
+ true, RleVersion_2, *dbcommon::getDefaultPool());
+ std::vector<int64_t> results;  // everything decoded so far
+ for (size_t i = 0; i < count; i += n) {  // batches of at most n values
+ size_t remaining = count - i;
+ size_t nread = std::min(n, remaining);
+ std::vector<int64_t> data(nread);
+ rle->next(data.data(), nread, notNull);
+ if (notNull) {
+ notNull += nread;  // keep the mask aligned with the values read
+ }
+ results.insert(results.end(), data.begin(), data.end());
+ }
+ // results now holds count entries, gathered batch by batch.
+ return results;
+}
+
+void checkResults(const std::vector<int64_t>& e, const std::vector<int64_t>& a,
+ size_t n, const char* notNull = nullptr) {  // e: expected, a: actual
+ EXPECT_EQ(e.size(), a.size()) << "vectors differ in size";
+ for (size_t i = 0; i < e.size(); ++i) {
+ if (!notNull || notNull[i]) {  // positions with a zero mask are skipped
+ EXPECT_EQ(e[i], a[i]) << "Output wrong at " << i << ", n=" << n;
+ }
+ }
+}
+
+TEST(RLEv2, basicDelta0) {
+ const size_t count = 20;
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < count; ++i) {
+ values.push_back(static_cast<int64_t>(i));
+ }
+ // Encoded stream expected to decode to 0..19.
+ const unsigned char bytes[] = {0xc0, 0x13, 0x00, 0x02};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
+ checkResults(values, decodeRLEv2(bytes, l, count, count), count);
+}
+
+TEST(RLEv2, basicDelta1) {
+ std::vector<int64_t> values(5);
+ values[0] = -500;
+ values[1] = -400;
+ values[2] = -350;
+ values[3] = -325;
+ values[4] = -310;
+ // Negative values that increase by shrinking steps (100, 50, 25, 15).
+ const unsigned char bytes[] = {0xce, 0x04, 0xe7, 0x07, 0xc8,
+ 0x01, 0x32, 0x19, 0x0f};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, basicDelta2) {
+ std::vector<int64_t> values(5);
+ values[0] = -500;
+ values[1] = -600;
+ values[2] = -650;
+ values[3] = -675;
+ values[4] = -710;
+ // Negative values that decrease by steps of 100, 50, 25 and 35.
+ const unsigned char bytes[] = {0xce, 0x04, 0xe7, 0x07, 0xc7,
+ 0x01, 0x32, 0x19, 0x23};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, basicDelta3) {
+ std::vector<int64_t> values(5);
+ values[0] = 500;
+ values[1] = 400;
+ values[2] = 350;
+ values[3] = 325;
+ values[4] = 310;
+ // Positive values that decrease by shrinking steps (100, 50, 25, 15).
+ const unsigned char bytes[] = {0xce, 0x04, 0xe8, 0x07, 0xc7,
+ 0x01, 0x32, 0x19, 0x0f};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, basicDelta4) {
+ std::vector<int64_t> values(5);
+ values[0] = 500;
+ values[1] = 600;
+ values[2] = 650;
+ values[3] = 675;
+ values[4] = 710;
+ // Positive values that increase by steps of 100, 50, 25 and 35.
+ const unsigned char bytes[] = {0xce, 0x04, 0xe8, 0x07, 0xc8,
+ 0x01, 0x32, 0x19, 0x23};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, delta0Width) {  // six values read in a single next() call
+ const unsigned char buffer[] = {0x4e, 0x2, 0x0, 0x1, 0x2,
+ 0xc0, 0x2, 0x42, 0x0};
+ std::unique_ptr<RleDecoder> decoder = createRleDecoder(
+ std::unique_ptr<SeekableInputStream>(
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))),
+ false, RleVersion_2, *dbcommon::getDefaultPool());
+ int64_t values[6];
+ decoder->next(values, 6, 0);  // 0: no not-null mask is supplied
+ EXPECT_EQ(0, values[0]);
+ EXPECT_EQ(1, values[1]);
+ EXPECT_EQ(2, values[2]);
+ EXPECT_EQ(0x42, values[3]);  // the last three values are all 0x42
+ EXPECT_EQ(0x42, values[4]);
+ EXPECT_EQ(0x42, values[5]);
+}
+
+TEST(RLEv2, basicDelta0WithNulls) {
+ std::vector<int64_t> values;
+ std::vector<char> notNull;
+ for (size_t i = 0; i < 20; ++i) {
+ values.push_back(static_cast<int64_t>(i));
+ notNull.push_back(true);
+ // throw in a null every third value
+ bool addNull = (i % 3 == 0);
+ if (addNull) {
+ values.push_back(-1);
+ notNull.push_back(false);
+ }
+ }
+ // Same four bytes as basicDelta0; null slots exist only in the mask.
+ const unsigned char bytes[] = {0xc0, 0x13, 0x00, 0x02};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ const size_t count = values.size();
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, count, notNull.data()), 1,
+ notNull.data());
+ checkResults(values, decodeRLEv2(bytes, l, 3, count, notNull.data()), 3,
+ notNull.data());
+ checkResults(values, decodeRLEv2(bytes, l, 7, count, notNull.data()), 7,
+ notNull.data());
+ checkResults(values, decodeRLEv2(bytes, l, count, count, notNull.data()),
+ count, notNull.data());
+}
+
+TEST(RLEv2, shortRepeats) {
+ const size_t runLength = 7;
+ const size_t nVals = 10;
+ const size_t count = nVals * runLength;
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < nVals; ++i) {
+ for (size_t j = 0; j < runLength; ++j) {
+ values.push_back(static_cast<int64_t>(i));
+ }
+ }
+ // Ten two-byte groups, one per run of seven identical values.
+ const unsigned char bytes[] = {0x04, 0x00, 0x04, 0x02, 0x04, 0x04, 0x04,
+ 0x06, 0x04, 0x08, 0x04, 0x0a, 0x04, 0x0c,
+ 0x04, 0x0e, 0x04, 0x10, 0x04, 0x12};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
+ checkResults(values, decodeRLEv2(bytes, l, count, count), count);
+}
+
+TEST(RLEv2, multiByteShortRepeats) {
+ const size_t runLength = 7;
+ const size_t nVals = 3;
+ const size_t count = nVals * runLength;
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < nVals; ++i) {
+ for (size_t j = 0; j < runLength; ++j) {
+ values.push_back(static_cast<int64_t>(i) + (1LL << 62));
+ }
+ }
+ // Three nine-byte groups: the repeated values (2^62 + i) need eight bytes.
+ const unsigned char bytes[] = {0x3c, 0x80, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x3c, 0x80, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x02, 0x3c, 0x80, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x04};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
+ checkResults(values, decodeRLEv2(bytes, l, count, count), count);
+}
+
+TEST(RLEv2, 0to2Repeat1Direct) {  // four bytes must decode to 0, 1, 2
+ const unsigned char buffer[] = {0x46, 0x02, 0x02, 0x40};
+ std::unique_ptr<RleDecoder> rle = createRleDecoder(
+ std::unique_ptr<SeekableInputStream>(
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))),
+ true, RleVersion_2, *dbcommon::getDefaultPool());
+ std::vector<int64_t> data(3);
+ rle->next(data.data(), 3, nullptr);
+ // Cast the index so both EXPECT_EQ operands are int64_t (no sign-compare).
+ for (size_t i = 0; i < data.size(); ++i) {
+ EXPECT_EQ(static_cast<int64_t>(i), data[i]) << "Output wrong at " << i;
+ }
+}
+
+TEST(RLEv2, bitSize2Direct) {
+ // 0,1 repeated 10 times (signed ints)
+ const size_t count = 20;
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < count; ++i) {
+ values.push_back(i % 2);
+ }
+ // Encoded stream expected to decode to the alternating sequence above.
+ const unsigned char bytes[] = {0x42, 0x13, 0x22, 0x22, 0x22, 0x22, 0x22};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
+ checkResults(values, decodeRLEv2(bytes, l, count, count), count);
+}
+
+TEST(RLEv2, bitSize4Direct) {
+ // 0,2 repeated 10 times (signed ints)
+ const size_t count = 20;
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < count; ++i) {
+ values.push_back((i % 2) * 2);
+ }
+ // Encoded stream expected to decode to the alternating sequence above.
+ const unsigned char bytes[] = {0x46, 0x13, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x04, 0x04, 0x04, 0x04, 0x04};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, count), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, count), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, count), 7);
+ checkResults(values, decodeRLEv2(bytes, l, count, count), count);
+}
+
+TEST(RLEv2, multipleRunsDirect) {
+ std::vector<int64_t> values;
+ // 0,1 repeated 10 times (signed ints)
+ for (size_t i = 0; i < 20; ++i) {
+ values.push_back(i % 2);
+ }
+ // 0,2 repeated 10 times (signed ints)
+ for (size_t i = 0; i < 20; ++i) {
+ values.push_back((i % 2) * 2);
+ }
+ // The bitSize2Direct bytes followed directly by the bitSize4Direct bytes.
+ const unsigned char bytes[] = {0x42, 0x13, 0x22, 0x22, 0x22, 0x22, 0x22,
+ 0x46, 0x13, 0x04, 0x04, 0x04, 0x04, 0x04,
+ 0x04, 0x04, 0x04, 0x04, 0x04};
+ uint32_t l = sizeof(bytes) / sizeof(char);
+
+ // Read 1 at a time, then 3 at a time, etc.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, largeNegativesDirect) {  // five values read in one next() call
+ const unsigned char buffer[] = {
+ 0x7e, 0x04, 0xcf, 0xca, 0xcc, 0x91, 0xba, 0x38, 0x93, 0xab, 0x00,
+ 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x00, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00, 0x00,
+ 0x02, 0x99, 0xa5, 0xcc, 0x28, 0x03, 0xf7, 0xe0, 0xff};
+ std::unique_ptr<RleDecoder> rle = createRleDecoder(
+ std::unique_ptr<SeekableInputStream>(
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))),
+ true, RleVersion_2, *dbcommon::getDefaultPool());
+ std::vector<int64_t> data(5);
+ rle->next(data.data(), 5, nullptr);
+ // Two large negative values surround the small ones 0, 1, 1.
+ EXPECT_EQ(-7486502418706614742, data[0]) << "Output wrong at " << 0;
+ EXPECT_EQ(0, data[1]) << "Output wrong at " << 1;
+ EXPECT_EQ(1, data[2]) << "Output wrong at " << 2;
+ EXPECT_EQ(1, data[3]) << "Output wrong at " << 3;
+ EXPECT_EQ(-5535739865598783616, data[4]) << "Output wrong at " << 4;
+}
+
+TEST(RLEv2, overflowDirect) {  // 64-bit values, one of them near INT64_MIN
+  std::vector<int64_t> values(4);
+  values[0] = 4513343538618202719LL;  // LL: at least 64 bits on every ABI
+  values[1] = 4513343538618202711LL;
+  values[2] = 2911390882471569739LL;
+  values[3] = -9181829309989854913LL;
+
+  const unsigned char bytes[] = {
+      0x7e, 0x03, 0x7d, 0x45, 0x3c, 0x12, 0x41, 0x48, 0xf4, 0xbe, 0x7d, 0x45,
+      0x3c, 0x12, 0x41, 0x48, 0xf4, 0xae, 0x50, 0xce, 0xad, 0x2a, 0x30, 0x0e,
+      0xd2, 0x96, 0xfe, 0xd8, 0xd2, 0x38, 0x54, 0x6e, 0x3d, 0x81};
+  uint32_t len = sizeof(bytes) / sizeof(bytes[0]);  // encoded size in bytes
+  // Decode in batches of 1, 3 and 7 values, then all values in one call.
+  checkResults(values, decodeRLEv2(bytes, len, 1, values.size()), 1);
+  checkResults(values, decodeRLEv2(bytes, len, 3, values.size()), 3);
+  checkResults(values, decodeRLEv2(bytes, len, 7, values.size()), 7);
+  checkResults(values, decodeRLEv2(bytes, len, values.size(), values.size()),
+               values.size());
+}
+
+TEST(RLEv2, basicPatched0) {
+  // Nine values between 2000 and 2090 plus a single outlier (1000000).
+  const int32_t raw[] = {2030, 2000, 2020, 1000000, 2040,
+                         2050, 2060, 2070, 2080, 2090};
+  const size_t total = sizeof(raw) / sizeof(raw[0]);
+  std::vector<int64_t> expected(raw, raw + total);
+
+  const unsigned char encoded[] = {0x8e, 0x09, 0x2b, 0x21, 0x07, 0xd0,
+                                   0x1e, 0x00, 0x14, 0x70, 0x28, 0x32,
+                                   0x3c, 0x46, 0x50, 0x5a, 0xfc, 0xe8};
+  uint32_t len = sizeof(encoded);
+
+  // Decode in batches of 1, 3 and 7 values, then all values in one call.
+  checkResults(expected, decodeRLEv2(encoded, len, 1, total), 1);
+  checkResults(expected, decodeRLEv2(encoded, len, 3, total), 3);
+  checkResults(expected, decodeRLEv2(encoded, len, 7, total), 7);
+  checkResults(expected, decodeRLEv2(encoded, len, total, total), total);
+}
+
+TEST(RLEv2, basicPatched1) {  // 110 mostly small values plus a few outliers
+ int32_t v[] = {20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2,
+ 2, 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1,
+ 2, 3, 1, 8, 30, 1, 3, 414, 1, 1, 135, 3, 3,
+ 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1,
+ 2, 2, 1, 1, 52, 4, 1, 2, 7, 1, 17, 334, 1,
+ 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6, 2,
+ 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1,
+ 33, 2, -13, 1, 2, 3, 13, 1, 92, 3, 13, 5, 14,
+ 9, 141, 12, 6, 15, 25};
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < sizeof(v) / sizeof(int32_t); ++i) {
+ values.push_back(v[i]);  // widen each int32_t to int64_t for comparison
+ }
+
+ const unsigned char bytes[] = {
+ 0x90, 0x6d, 0x04, 0xa4, 0x8d, 0x10, 0x83, 0xc2, 0x00, 0xf0, 0x70, 0x40,
+ 0x3c, 0x54, 0x18, 0x03, 0xc1, 0xc9, 0x80, 0x78, 0x3c, 0x21, 0x04, 0xf4,
+ 0x03, 0xc1, 0xc0, 0xe0, 0x80, 0x38, 0x20, 0x0f, 0x16, 0x83, 0x81, 0xe1,
+ 0x00, 0x70, 0x54, 0x56, 0x0e, 0x08, 0x6a, 0xc1, 0xc0, 0xe4, 0xa0, 0x40,
+ 0x20, 0x0e, 0xd5, 0x83, 0xc1, 0xc0, 0xf0, 0x79, 0x7c, 0x1e, 0x12, 0x09,
+ 0x84, 0x43, 0x00, 0xe0, 0x78, 0x3c, 0x1c, 0x0e, 0x20, 0x84, 0x41, 0xc0,
+ 0xf0, 0xa0, 0x38, 0x3d, 0x5b, 0x07, 0x03, 0xc1, 0xc0, 0xf0, 0x78, 0x4c,
+ 0x1d, 0x17, 0x07, 0x03, 0xdc, 0xc0, 0xf0, 0x98, 0x3c, 0x34, 0x0f, 0x07,
+ 0x83, 0x81, 0xe1, 0x00, 0x90, 0x38, 0x1e, 0x0e, 0x2c, 0x8c, 0x81, 0xc2,
+ 0xe0, 0x78, 0x00, 0x1c, 0x0f, 0x08, 0x06, 0x81, 0xc6, 0x90, 0x80, 0x68,
+ 0x24, 0x1b, 0x0b, 0x26, 0x83, 0x21, 0x30, 0xe0, 0x98, 0x3c, 0x6f, 0x06,
+ 0xb7, 0x03, 0x70};
+ uint32_t l = sizeof(bytes) / sizeof(char);  // encoded length in bytes
+ // Decode in batches of 1, 3 and 7 values, then all values in one call.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, mixedPatchedAndShortRepeats) {
+ int32_t v[] = {20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139,
+ 2, 2, 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2,
+ 32, 1, 2, 3, 1, 8, 30, 1, 3, 414, 1, 1,
+ 135, 3, 3, 1, 414, 2, 1, 2, 2, 594, 2, 5,
+ 6, 4, 11, 1, 2, 2, 1, 1, 52, 4, 1, 2,
+ 7, 1, 17, 334, 1, 2, 1, 2, 2, 6, 1, 266,
+ 1, 2, 217, 2, 6, 2, 13, 2, 2, 1, 2, 3,
+ 5, 1, 2, 1, 7244, 11813, 1, 33, 2, -13, 1, 2,
+ 3, 13, 1, 92, 3, 13, 5, 14, 9, 141, 12, 6,
+ 15, 25, 1, 1, 1, 46, 2, 1, 1, 141, 3, 1,
+ 1, 1, 1, 2, 1, 4, 34, 5, 78, 8, 1, 2,
+ 2, 1, 9, 10, 2, 1, 4, 13, 1, 5, 4, 4,
+ 19, 5, 1, 1, 1, 68, 33, 399, 1, 1885, 25, 5,
+ 2, 4, 1, 1, 2, 16, 1, 2966, 3, 1, 1, 25501,
+ 1, 1, 1, 66, 1, 3, 8, 131, 14, 5, 1, 2,
+ 2, 1, 1, 8, 1, 1, 2, 1, 5, 9, 2, 3,
+ 112, 13, 2, 2, 1, 5, 10, 3, 1, 1, 13, 2,
+ 3, 4, 1, 3, 1, 1, 2, 1, 1, 2, 4, 2,
+ 207, 1, 1, 2, 4, 3, 3, 2, 2, 16};
+ std::vector<int64_t> values;
+ for (size_t i = 0; i < sizeof(v) / sizeof(int32_t); ++i) {
+ values.push_back(v[i]);  // widen each int32_t to int64_t for comparison
+ }
+
+ const unsigned char bytes[] = {
+ 0x90, 0x6d, 0x04, 0xa4, 0x8d, 0x10, 0x83, 0xc2, 0x00, 0xf0, 0x70, 0x40,
+ 0x3c, 0x54, 0x18, 0x03, 0xc1, 0xc9, 0x80, 0x78, 0x3c, 0x21, 0x04, 0xf4,
+ 0x03, 0xc1, 0xc0, 0xe0, 0x80, 0x38, 0x20, 0x0f, 0x16, 0x83, 0x81, 0xe1,
+ 0x00, 0x70, 0x54, 0x56, 0x0e, 0x08, 0x6a, 0xc1, 0xc0, 0xe4, 0xa0, 0x40,
+ 0x20, 0x0e, 0xd5, 0x83, 0xc1, 0xc0, 0xf0, 0x79, 0x7c, 0x1e, 0x12, 0x09,
+ 0x84, 0x43, 0x00, 0xe0, 0x78, 0x3c, 0x1c, 0x0e, 0x20, 0x84, 0x41, 0xc0,
+ 0xf0, 0xa0, 0x38, 0x3d, 0x5b, 0x07, 0x03, 0xc1, 0xc0, 0xf0, 0x78, 0x4c,
+ 0x1d, 0x17, 0x07, 0x03, 0xdc, 0xc0, 0xf0, 0x98, 0x3c, 0x34, 0x0f, 0x07,
+ 0x83, 0x81, 0xe1, 0x00, 0x90, 0x38, 0x1e, 0x0e, 0x2c, 0x8c, 0x81, 0xc2,
+ 0xe0, 0x78, 0x00, 0x1c, 0x0f, 0x08, 0x06, 0x81, 0xc6, 0x90, 0x80, 0x68,
+ 0x24, 0x1b, 0x0b, 0x26, 0x83, 0x21, 0x30, 0xe0, 0x98, 0x3c, 0x6f, 0x06,
+ 0xb7, 0x03, 0x70, 0x00, 0x02, 0x5e, 0x05, 0x00, 0x5c, 0x00, 0x04, 0x00,
+ 0x02, 0x00, 0x02, 0x01, 0x1a, 0x00, 0x06, 0x01, 0x02, 0x8a, 0x16, 0x00,
+ 0x41, 0x01, 0x04, 0x00, 0xe1, 0x10, 0xd1, 0xc0, 0x04, 0x10, 0x08, 0x24,
+ 0x10, 0x03, 0x30, 0x01, 0x03, 0x0d, 0x21, 0x00, 0xb0, 0x00, 0x02, 0x5e,
+ 0x12, 0x00, 0x88, 0x00, 0x42, 0x03, 0x1e, 0x00, 0x02, 0x0e, 0xba, 0x00,
+ 0x32, 0x00, 0x0a, 0x00, 0x04, 0x00, 0x08, 0x00, 0x02, 0x00, 0x02, 0x00,
+ 0x04, 0x00, 0x20, 0x00, 0x02, 0x17, 0x2c, 0x00, 0x06, 0x00, 0x02, 0x00,
+ 0x02, 0xc7, 0x3a, 0x00, 0x02, 0x8c, 0x36, 0x00, 0xa2, 0x01, 0x82, 0x00,
+ 0x10, 0x70, 0x43, 0x42, 0x00, 0x02, 0x04, 0x00, 0x00, 0xe0, 0x00, 0x01,
+ 0x00, 0x10, 0x40, 0x10, 0x5b, 0xc6, 0x01, 0x02, 0x00, 0x20, 0x90, 0x40,
+ 0x00, 0x0c, 0x02, 0x08, 0x18, 0x00, 0x40, 0x00, 0x01, 0x00, 0x00, 0x08,
+ 0x30, 0x33, 0x80, 0x00, 0x02, 0x0c, 0x10, 0x20, 0x20, 0x47, 0x80, 0x13,
+ 0x4c};
+ uint32_t l = sizeof(bytes) / sizeof(char);  // encoded length in bytes
+ // Decode in batches of 1, 3 and 7 values, then all values in one call.
+ checkResults(values, decodeRLEv2(bytes, l, 1, values.size()), 1);
+ checkResults(values, decodeRLEv2(bytes, l, 3, values.size()), 3);
+ checkResults(values, decodeRLEv2(bytes, l, 7, values.size()), 7);
+ checkResults(values, decodeRLEv2(bytes, l, values.size(), values.size()),
+ values.size());
+}
+
+TEST(RLEv2, basicDirectSeek) {
+  // Stream layout: 0,1 repeated 10 times (signed ints), directly followed
+  // by 0,2 repeated 10 times (signed ints).
+  const unsigned char encoded[] = {0x42, 0x13, 0x22, 0x22, 0x22, 0x22, 0x22,
+                                   0x46, 0x13, 0x04, 0x04, 0x04, 0x04, 0x04,
+                                   0x04, 0x04, 0x04, 0x04, 0x04};
+  uint32_t len = sizeof(encoded);
+
+  SeekableInputStream* const in = new SeekableArrayInputStream(encoded, len);
+  std::unique_ptr<RleDecoder> decoder =
+      createRleDecoder(std::unique_ptr<SeekableInputStream>(in), true,
+                       RleVersion_2, *dbcommon::getDefaultPool());
+  std::list<uint64_t> positions;
+  positions.push_back(7);   // byte offset; jumps over the first 20 [0 to 19]
+  positions.push_back(13);  // value offset; skips 13 more [20 to 32]
+
+  PositionProvider target(positions);
+  decoder->seek(target);
+  std::vector<int64_t> out(3);
+  decoder->next(out.data(), 3, nullptr);
+  EXPECT_EQ(2, out[0]);
+  EXPECT_EQ(0, out[1]);
+  EXPECT_EQ(2, out[2]);
+  decoder->next(out.data(), 3, nullptr);
+  EXPECT_EQ(0, out[0]);
+  EXPECT_EQ(2, out[1]);
+  EXPECT_EQ(0, out[2]);
+  decoder->next(out.data(), 1, nullptr);
+  EXPECT_EQ(2, out[0]);
+}
+
+TEST(RLEv2, bitsLeftByPreviousStream) {
+ // test for #109; presumably stale bit state after a DIRECT run - TODO confirm
+ // 118 DIRECT values, followed by PATCHED values
+ const unsigned char bytes[] = {
+ 0x5a, 0x75, 0x92, 0x42, 0x49, 0x09, 0x2b, 0xa4, 0xae, 0x92, 0xc2, 0x4b,
+ 0x89, 0x2f, 0x24, 0xbc, 0x93, 0x2a, 0x4c, 0xa9, 0x34, 0x24, 0xe0, 0x93,
+ 0x92, 0x4e, 0xe9, 0x40, 0xa5, 0x04, 0x94, 0x12, 0x62, 0xa9, 0xc9, 0xa7,
+ 0x26, 0x9c, 0xaa, 0x73, 0x09, 0xcd, 0x27, 0x34, 0x9c, 0xf2, 0x74, 0x49,
+ 0xd3, 0x27, 0x50, 0x9d, 0x42, 0x75, 0x29, 0xd4, 0xa7, 0x5a, 0x9d, 0xaa,
+ 0x79, 0x89, 0xe9, 0x27, 0xa4, 0x9e, 0xea, 0x7c, 0x29, 0xf6, 0x27, 0xdc,
+ 0x9f, 0xb2, 0x7f, 0x4a, 0x00, 0xa8, 0x14, 0xa0, 0x72, 0x82, 0x8a, 0x19,
+ 0x28, 0x6e, 0xa2, 0x52, 0x89, 0x4a, 0x28, 0x28, 0xa6, 0xa2, 0x9a, 0x8b,
+ 0x6a, 0x2d, 0xa8, 0xb8, 0xa2, 0xe2, 0x8b, 0xaa, 0x53, 0xa9, 0x54, 0xa5,
+ 0x92, 0x98, 0x6a, 0x62, 0xa9, 0x9c, 0xa6, 0x8a, 0x9b, 0xea, 0x70, 0x29,
+ 0xd2, 0xa7, 0x52, 0x9d, 0x4a, 0x77, 0x29, 0xe0, 0xa7, 0xa2, 0x9e, 0xaa,
+ 0x7b, 0x29, 0xf0, 0xa7, 0xd2, 0xa0, 0x0a, 0x84, 0x2a, 0x18, 0xa8, 0x72,
+ 0xa1, 0xca, 0x89, 0x2a, 0x30, 0xa9, 0x4a, 0xa5, 0x4a, 0x96, 0x2a, 0xae,
+ 0xab, 0x02, 0xac, 0x2b, 0x8d, 0x2e, 0x60, 0xb9, 0x82, 0xe7, 0x2b, 0x9f,
+ 0xae, 0x84, 0xba, 0x52, 0xe9, 0xeb, 0xad, 0x2e, 0xb6, 0xbc, 0x32, 0xf1,
+ 0xcb, 0xcc, 0x2f, 0x42, 0xbd, 0x8a, 0xf7, 0xcb, 0xe1, 0xaf, 0xa4, 0xbe,
+ 0x9a, 0xfa, 0x6b, 0xeb, 0xaf, 0xba, 0xbe, 0xea, 0xfd, 0x2b, 0xf4, 0xaf,
+ 0xd8, 0xbf, 0xfb, 0x00,
+ 0x80, // <= end of DIRECT, start of PATCHED =>
+ 0x90, 0x6d, 0x04, 0xa4, 0x8d, 0x10, 0x83, 0xc2, 0x00, 0xf0, 0x70, 0x40,
+ 0x3c, 0x54, 0x18, 0x03, 0xc1, 0xc9, 0x80, 0x78, 0x3c, 0x21, 0x04, 0xf4,
+ 0x03, 0xc1, 0xc0, 0xe0, 0x80, 0x38, 0x20, 0x0f, 0x16, 0x83, 0x81, 0xe1,
+ 0x00, 0x70, 0x54, 0x56, 0x0e, 0x08, 0x6a, 0xc1, 0xc0, 0xe4, 0xa0, 0x40,
+ 0x20, 0x0e, 0xd5, 0x83, 0xc1, 0xc0, 0xf0, 0x79, 0x7c, 0x1e, 0x12, 0x09,
+ 0x84, 0x43, 0x00, 0xe0, 0x78, 0x3c, 0x1c, 0x0e, 0x20, 0x84, 0x41, 0xc0,
+ 0xf0, 0xa0, 0x38, 0x3d, 0x5b, 0x07, 0x03, 0xc1, 0xc0, 0xf0, 0x78, 0x4c,
+ 0x1d, 0x17, 0x07, 0x03, 0xdc, 0xc0, 0xf0, 0x98, 0x3c, 0x34, 0x0f, 0x07,
+ 0x83, 0x81, 0xe1, 0x00, 0x90, 0x38, 0x1e, 0x0e, 0x2c, 0x8c, 0x81, 0xc2,
+ 0xe0, 0x78, 0x00, 0x1c, 0x0f, 0x08, 0x06, 0x81, 0xc6, 0x90, 0x80, 0x68,
+ 0x24, 0x1b, 0x0b, 0x26, 0x83, 0x21, 0x30, 0xe0, 0x98, 0x3c, 0x6f, 0x06,
+ 0xb7, 0x03, 0x70};
+ uint32_t l = sizeof(bytes) / sizeof(unsigned char);  // encoded length in bytes
+
+ // Expected PATCHED values (the tail of the stream).
+ int32_t v[] = {20, 2, 3, 2, 1, 3, 17, 71, 35, 2, 1, 139, 2,
+ 2, 3, 1783, 475, 2, 1, 1, 3, 1, 3, 2, 32, 1,
+ 2, 3, 1, 8, 30, 1, 3, 414, 1, 1, 135, 3, 3,
+ 1, 414, 2, 1, 2, 2, 594, 2, 5, 6, 4, 11, 1,
+ 2, 2, 1, 1, 52, 4, 1, 2, 7, 1, 17, 334, 1,
+ 2, 1, 2, 2, 6, 1, 266, 1, 2, 217, 2, 6, 2,
+ 13, 2, 2, 1, 2, 3, 5, 1, 2, 1, 7244, 11813, 1,
+ 33, 2, -13, 1, 2, 3, 13, 1, 92, 3, 13, 5, 14,
+ 9, 141, 12, 6, 15, 25};
+ uint32_t D = 118, P = sizeof(v) / sizeof(int32_t), N = D + P;  // D direct, P patched
+
+ std::unique_ptr<RleDecoder> rle =
+ createRleDecoder(std::unique_ptr<SeekableInputStream>(
+ new SeekableArrayInputStream(bytes, l)),
+ true, RleVersion_2, *dbcommon::getDefaultPool());
+
+ std::vector<int64_t> data(N);
+ rle->next(data.data(), N, nullptr);  // read both runs with a single call
+ // Only the PATCHED tail is verified; the first D values are not checked.
+ for (size_t i = 0; i < P; ++i) {
+ EXPECT_EQ(v[i], data[i + D]);
+ }
+}
+
+TEST(RLEv1, simpleTest) {  // expects 100 down to 1, then 2, 3, 5, 7, 11
+  const unsigned char encoded[] = {0x61, 0xff, 0x64, 0xfb, 0x02,
+                                   0x03, 0x5, 0x7, 0xb};
+  SeekableInputStream* const in =
+      new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded));
+  std::unique_ptr<RleDecoder> decoder =
+      createRleDecoder(std::unique_ptr<SeekableInputStream>(in), false,
+                       RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> decoded(105);
+  decoder->next(decoded.data(), decoded.size(), nullptr);
+  for (size_t pos = 0; pos < 100; ++pos) {  // the descending part
+    EXPECT_EQ(100 - pos, decoded[pos]) << "Output wrong at " << pos;
+  }
+  EXPECT_EQ(2, decoded[100]);
+  EXPECT_EQ(3, decoded[101]);
+  EXPECT_EQ(5, decoded[102]);
+  EXPECT_EQ(7, decoded[103]);
+  EXPECT_EQ(11, decoded[104]);
+}
+
+TEST(RLEv1, signedNullLiteralTest) {  // signed literals with a not-null mask
+  const unsigned char buffer[] = {0xf8, 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7};
+  std::unique_ptr<RleDecoder> rle = createRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer))),
+      true, RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> data(8);
+  std::vector<char> notNull(8, 1);  // every slot is flagged as present
+  rle->next(data.data(), 8, notNull.data());
+  // Expect 0, -1, 1, -2, ...; a signed index yields real negatives, not
+  // wrapped size_t values that only compare equal where size_t is 64-bit.
+  for (int64_t i = 0; i < 8; ++i) {
+    EXPECT_EQ(i % 2 == 0 ? i / 2 : -((i + 1) / 2), data[i]);
+  }
+}
+
+TEST(RLEv1, splitHeader) {  // NOTE(review): 4 is presumably a read block size
+  const unsigned char encoded[] = {0x0, 0x00, 0xdc, 0xba, 0x98, 0x76};
+  std::unique_ptr<RleDecoder> decoder = createRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded), 4)),
+      false, RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> decoded(200);
+  decoder->next(decoded.data(), 3, nullptr);
+  // Only the three requested values are checked; each must be 247864668.
+  EXPECT_EQ(247864668, decoded[0]) << "Output wrong at " << 0;
+  EXPECT_EQ(247864668, decoded[1]) << "Output wrong at " << 1;
+  EXPECT_EQ(247864668, decoded[2]) << "Output wrong at " << 2;
+}
+
+TEST(RLEv1, splitRuns) {
+  const unsigned char encoded[] = {0x7d, 0x01, 0xff, 0x01, 0xfb,
+                                   0x01, 0x02, 0x03, 0x04, 0x05};
+  std::unique_ptr<RleDecoder> decoder = createRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded))),
+      false, RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> out(200);
+  // 42 reads of three values each are expected to yield 255 .. 380 in order.
+  for (size_t batch = 0; batch < 42; ++batch) {
+    decoder->next(out.data(), 3, nullptr);
+    for (size_t k = 0; k < 3; ++k) {
+      EXPECT_EQ(255 + batch * 3 + k, out[k])
+          << "Wrong output at " << batch << ", " << k;
+    }
+  }
+  decoder->next(out.data(), 3, nullptr);
+  EXPECT_EQ(381, out[0]);
+  EXPECT_EQ(382, out[1]);
+  EXPECT_EQ(1, out[2]);
+  decoder->next(out.data(), 3, nullptr);
+  EXPECT_EQ(2, out[0]);
+  EXPECT_EQ(3, out[1]);
+  EXPECT_EQ(4, out[2]);
+  decoder->next(out.data(), 1, nullptr);
+  EXPECT_EQ(5, out[0]);
+}
+
+TEST(RLEv1, testSigned) {  // descending signed values, read in two calls
+  const unsigned char buffer[] = {0x7f, 0xff, 0x20};
+  SeekableInputStream* const stream =
+      new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer));
+  std::unique_ptr<RleDecoder> rle =
+      createRleDecoder(std::unique_ptr<SeekableInputStream>(stream), true,
+                       RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> data(100);
+  rle->next(data.data(), data.size(), nullptr);
+  // Signed index: with size_t, 16 - i would wrap instead of going negative.
+  for (int64_t i = 0; i < 100; ++i) {
+    EXPECT_EQ(16 - i, data[i]) << "Wrong output at " << i;
+  }
+  rle->next(data.data(), 30, nullptr);  // continues after the first 100 values
+  for (int64_t i = 0; i < 30; ++i) {
+    EXPECT_EQ(16 - 100 - i, data[i]) << "Wrong output at " << (i + 100);
+  }
+}
+
+TEST(RLEv1, testNull) {
+  const unsigned char encoded[] = {0x75, 0x02, 0x00};
+  std::unique_ptr<RleDecoder> decoder = createRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded))),
+      true, RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> out(24);
+  std::vector<char> present(24);
+  // Alternate present / null slots, beginning with a present one.
+  for (size_t slot = 0; slot < present.size(); ++slot) {
+    present[slot] = (slot % 2 == 0) ? 1 : 0;
+  }
+
+  for (size_t round = 0; round < 10; ++round) {
+    // Sentinel fill, so slots the decoder leaves alone stay recognisable.
+    out.assign(out.size(), -1);
+    decoder->next(out.data(), 24, present.data());
+    for (size_t k = 0; k < 24; ++k) {
+      if (!present[k]) {
+        EXPECT_EQ(-1, out[k]);
+      } else {
+        EXPECT_EQ(round * 24 + k, out[k]);
+      }
+    }
+  }
+}
+
+TEST(RLEv1, testAllNulls) {  // interleaves all-null and no-null reads
+  const unsigned char buffer[] = {0xf0, 0x00, 0x01, 0x02, 0x03, 0x04, 0x05,
+                                  0x06, 0x07, 0x08, 0x09, 0x0a, 0x0b, 0x0c,
+                                  0x0d, 0x0e, 0x0f, 0x3d, 0x00, 0x12};
+  SeekableInputStream* const stream =
+      new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer));
+  std::unique_ptr<RleDecoder> rle =
+      createRleDecoder(std::unique_ptr<SeekableInputStream>(stream), false,
+                       RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> data(16, -1);  // -1 marks "not written by the decoder"
+  std::vector<char> allNull(16, 0);
+  std::vector<char> noNull(16, 1);
+  rle->next(data.data(), 16, allNull.data());  // all null: sentinels must stay
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(-1, data[i]) << "Output wrong at " << i;
+  }
+  rle->next(data.data(), data.size(), noNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(i, data[i]) << "Output wrong at " << i;
+    data[i] = -1;  // restore the sentinel for the next all-null read
+  }
+  rle->next(data.data(), data.size(), allNull.data());
+  for (size_t i = 0; i < data.size(); ++i) {
+    EXPECT_EQ(-1, data[i]) << "Output wrong at " << i;
+  }
+  for (size_t i = 0; i < 4; ++i) {  // four further reads, every value 18
+    rle->next(data.data(), data.size(), noNull.data());
+    for (size_t j = 0; j < data.size(); ++j) {
+      EXPECT_EQ(18, data[j]) << "Output wrong at " << i << ", " << j;
+    }
+  }
+  rle->next(data.data(), data.size(), allNull.data());  // unchecked final read
+}
+
+TEST(RLEv1, skipTest) {
+ // The RLE stream below comes from Java's
+ // TestRunLengthIntegerEncoding.testSkips, which writes:
+ // for (size_t i = 0; i < 1024; ++i) out.write(i);
+ // for (size_t i = 1024; i < 2048; ++i)
+ // out.write(i * 256);
+ // This causes the first half to be delta encoded and the second half to
+ // be literal encoded.
+ const unsigned char buffer[] = {
+ 127, 1, 0, 127, 1, 132, 2, 127, 1, 136, 4, 127, 1, 140, 6,
+ 127, 1, 144, 8, 127, 1, 148, 10, 127, 1, 152, 12, 111, 1, 156,
+ 14, 128, 128, 128, 32, 128, 132, 32, 128, 136, 32, 128, 140, 32, 128,
+ 144, 32, 128, 148, 32, 128, 152, 32, 128, 156, 32, 128, 160, 32, 128,
+ 164, 32, 128, 168, 32, 128, 172, 32, 128, 176, 32, 128, 180, 32, 128,
+ 184, 32, 128, 188, 32, 128, 192, 32, 128, 196, 32, 128, 200, 32, 128,
+ 204, 32, 128, 208, 32, 128, 212, 32, 128, 216, 32, 128, 220, 32, 128,
+ 224, 32, 128, 228, 32, 128, 232, 32, 128, 236, 32, 128, 240, 32, 128,
+ 244, 32, 128, 248, 32, 128, 252, 32, 128, 128, 33, 128, 132, 33, 128,
+ 136, 33, 128, 140, 33, 128, 144, 33, 128, 148, 33, 128, 152, 33, 128,
+ 156, 33, 128, 160, 33, 128, 164, 33, 128, 168, 33, 128, 172, 33, 128,
+ 176, 33, 128, 180, 33, 128, 184, 33, 128, 188, 33, 128, 192, 33, 128,
+ 196, 33, 128, 200, 33, 128, 204, 33, 128, 208, 33, 128, 212, 33, 128,
+ 216, 33, 128, 220, 33, 128, 224, 33, 128, 228, 33, 128, 232, 33, 128,
+ 236, 33, 128, 240, 33, 128, 244, 33, 128, 248, 33, 128, 252, 33, 128,
+ 128, 34, 128, 132, 34, 128, 136, 34, 128, 140, 34, 128, 144, 34, 128,
+ 148, 34, 128, 152, 34, 128, 156, 34, 128, 160, 34, 128, 164, 34, 128,
+ 168, 34, 128, 172, 34, 128, 176, 34, 128, 180, 34, 128, 184, 34, 128,
+ 188, 34, 128, 192, 34, 128, 196, 34, 128, 200, 34, 128, 204, 34, 128,
+ 208, 34, 128, 212, 34, 128, 216, 34, 128, 220, 34, 128, 224, 34, 128,
+ 228, 34, 128, 232, 34, 128, 236, 34, 128, 240, 34, 128, 244, 34, 128,
+ 248, 34, 128, 252, 34, 128, 128, 35, 128, 132, 35, 128, 136, 35, 128,
+ 140, 35, 128, 144, 35, 128, 148, 35, 128, 152, 35, 128, 156, 35, 128,
+ 160, 35, 128, 164, 35, 128, 168, 35, 128, 172, 35, 128, 176, 35, 128,
+ 180, 35, 128, 184, 35, 128, 188, 35, 128, 192, 35, 128, 196, 35, 128,
+ 200, 35, 128, 204, 35, 128, 208, 35, 128, 212, 35, 128, 216, 35, 128,
+ 220, 35, 128, 224, 35, 128, 228, 35, 128, 232, 35, 128, 236, 35, 128,
+ 240, 35, 128, 244, 35, 128, 248, 35, 128, 252, 35, 128, 128, 128, 36,
+ 128, 132, 36, 128, 136, 36, 128, 140, 36, 128, 144, 36, 128, 148, 36,
+ 128, 152, 36, 128, 156, 36, 128, 160, 36, 128, 164, 36, 128, 168, 36,
+ 128, 172, 36, 128, 176, 36, 128, 180, 36, 128, 184, 36, 128, 188, 36,
+ 128, 192, 36, 128, 196, 36, 128, 200, 36, 128, 204, 36, 128, 208, 36,
+ 128, 212, 36, 128, 216, 36, 128, 220, 36, 128, 224, 36, 128, 228, 36,
+ 128, 232, 36, 128, 236, 36, 128, 240, 36, 128, 244, 36, 128, 248, 36,
+ 128, 252, 36, 128, 128, 37, 128, 132, 37, 128, 136, 37, 128, 140, 37,
+ 128, 144, 37, 128, 148, 37, 128, 152, 37, 128, 156, 37, 128, 160, 37,
+ 128, 164, 37, 128, 168, 37, 128, 172, 37, 128, 176, 37, 128, 180, 37,
+ 128, 184, 37, 128, 188, 37, 128, 192, 37, 128, 196, 37, 128, 200, 37,
+ 128, 204, 37, 128, 208, 37, 128, 212, 37, 128, 216, 37, 128, 220, 37,
+ 128, 224, 37, 128, 228, 37, 128, 232, 37, 128, 236, 37, 128, 240, 37,
+ 128, 244, 37, 128, 248, 37, 128, 252, 37, 128, 128, 38, 128, 132, 38,
+ 128, 136, 38, 128, 140, 38, 128, 144, 38, 128, 148, 38, 128, 152, 38,
+ 128, 156, 38, 128, 160, 38, 128, 164, 38, 128, 168, 38, 128, 172, 38,
+ 128, 176, 38, 128, 180, 38, 128, 184, 38, 128, 188, 38, 128, 192, 38,
+ 128, 196, 38, 128, 200, 38, 128, 204, 38, 128, 208, 38, 128, 212, 38,
+ 128, 216, 38, 128, 220, 38, 128, 224, 38, 128, 228, 38, 128, 232, 38,
+ 128, 236, 38, 128, 240, 38, 128, 244, 38, 128, 248, 38, 128, 252, 38,
+ 128, 128, 39, 128, 132, 39, 128, 136, 39, 128, 140, 39, 128, 144, 39,
+ 128, 148, 39, 128, 152, 39, 128, 156, 39, 128, 160, 39, 128, 164, 39,
+ 128, 168, 39, 128, 172, 39, 128, 176, 39, 128, 180, 39, 128, 184, 39,
+ 128, 188, 39, 128, 192, 39, 128, 196, 39, 128, 200, 39, 128, 204, 39,
+ 128, 208, 39, 128, 212, 39, 128, 216, 39, 128, 220, 39, 128, 224, 39,
+ 128, 228, 39, 128, 232, 39, 128, 236, 39, 128, 240, 39, 128, 244, 39,
+ 128, 248, 39, 128, 252, 39, 128, 128, 128, 40, 128, 132, 40, 128, 136,
+ 40, 128, 140, 40, 128, 144, 40, 128, 148, 40, 128, 152, 40, 128, 156,
+ 40, 128, 160, 40, 128, 164, 40, 128, 168, 40, 128, 172, 40, 128, 176,
+ 40, 128, 180, 40, 128, 184, 40, 128, 188, 40, 128, 192, 40, 128, 196,
+ 40, 128, 200, 40, 128, 204, 40, 128, 208, 40, 128, 212, 40, 128, 216,
+ 40, 128, 220, 40, 128, 224, 40, 128, 228, 40, 128, 232, 40, 128, 236,
+ 40, 128, 240, 40, 128, 244, 40, 128, 248, 40, 128, 252, 40, 128, 128,
+ 41, 128, 132, 41, 128, 136, 41, 128, 140, 41, 128, 144, 41, 128, 148,
+ 41, 128, 152, 41, 128, 156, 41, 128, 160, 41, 128, 164, 41, 128, 168,
+ 41, 128, 172, 41, 128, 176, 41, 128, 180, 41, 128, 184, 41, 128, 188,
+ 41, 128, 192, 41, 128, 196, 41, 128, 200, 41, 128, 204, 41, 128, 208,
+ 41, 128, 212, 41, 128, 216, 41, 128, 220, 41, 128, 224, 41, 128, 228,
+ 41, 128, 232, 41, 128, 236, 41, 128, 240, 41, 128, 244, 41, 128, 248,
+ 41, 128, 252, 41, 128, 128, 42, 128, 132, 42, 128, 136, 42, 128, 140,
+ 42, 128, 144, 42, 128, 148, 42, 128, 152, 42, 128, 156, 42, 128, 160,
+ 42, 128, 164, 42, 128, 168, 42, 128, 172, 42, 128, 176, 42, 128, 180,
+ 42, 128, 184, 42, 128, 188, 42, 128, 192, 42, 128, 196, 42, 128, 200,
+ 42, 128, 204, 42, 128, 208, 42, 128, 212, 42, 128, 216, 42, 128, 220,
+ 42, 128, 224, 42, 128, 228, 42, 128, 232, 42, 128, 236, 42, 128, 240,
+ 42, 128, 244, 42, 128, 248, 42, 128, 252, 42, 128, 128, 43, 128, 132,
+ 43, 128, 136, 43, 128, 140, 43, 128, 144, 43, 128, 148, 43, 128, 152,
+ 43, 128, 156, 43, 128, 160, 43, 128, 164, 43, 128, 168, 43, 128, 172,
+ 43, 128, 176, 43, 128, 180, 43, 128, 184, 43, 128, 188, 43, 128, 192,
+ 43, 128, 196, 43, 128, 200, 43, 128, 204, 43, 128, 208, 43, 128, 212,
+ 43, 128, 216, 43, 128, 220, 43, 128, 224, 43, 128, 228, 43, 128, 232,
+ 43, 128, 236, 43, 128, 240, 43, 128, 244, 43, 128, 248, 43, 128, 252,
+ 43, 128, 128, 128, 44, 128, 132, 44, 128, 136, 44, 128, 140, 44, 128,
+ 144, 44, 128, 148, 44, 128, 152, 44, 128, 156, 44, 128, 160, 44, 128,
+ 164, 44, 128, 168, 44, 128, 172, 44, 128, 176, 44, 128, 180, 44, 128,
+ 184, 44, 128, 188, 44, 128, 192, 44, 128, 196, 44, 128, 200, 44, 128,
+ 204, 44, 128, 208, 44, 128, 212, 44, 128, 216, 44, 128, 220, 44, 128,
+ 224, 44, 128, 228, 44, 128, 232, 44, 128, 236, 44, 128, 240, 44, 128,
+ 244, 44, 128, 248, 44, 128, 252, 44, 128, 128, 45, 128, 132, 45, 128,
+ 136, 45, 128, 140, 45, 128, 144, 45, 128, 148, 45, 128, 152, 45, 128,
+ 156, 45, 128, 160, 45, 128, 164, 45, 128, 168, 45, 128, 172, 45, 128,
+ 176, 45, 128, 180, 45, 128, 184, 45, 128, 188, 45, 128, 192, 45, 128,
+ 196, 45, 128, 200, 45, 128, 204, 45, 128, 208, 45, 128, 212, 45, 128,
+ 216, 45, 128, 220, 45, 128, 224, 45, 128, 228, 45, 128, 232, 45, 128,
+ 236, 45, 128, 240, 45, 128, 244, 45, 128, 248, 45, 128, 252, 45, 128,
+ 128, 46, 128, 132, 46, 128, 136, 46, 128, 140, 46, 128, 144, 46, 128,
+ 148, 46, 128, 152, 46, 128, 156, 46, 128, 160, 46, 128, 164, 46, 128,
+ 168, 46, 128, 172, 46, 128, 176, 46, 128, 180, 46, 128, 184, 46, 128,
+ 188, 46, 128, 192, 46, 128, 196, 46, 128, 200, 46, 128, 204, 46, 128,
+ 208, 46, 128, 212, 46, 128, 216, 46, 128, 220, 46, 128, 224, 46, 128,
+ 228, 46, 128, 232, 46, 128, 236, 46, 128, 240, 46, 128, 244, 46, 128,
+ 248, 46, 128, 252, 46, 128, 128, 47, 128, 132, 47, 128, 136, 47, 128,
+ 140, 47, 128, 144, 47, 128, 148, 47, 128, 152, 47, 128, 156, 47, 128,
+ 160, 47, 128, 164, 47, 128, 168, 47, 128, 172, 47, 128, 176, 47, 128,
+ 180, 47, 128, 184, 47, 128, 188, 47, 128, 192, 47, 128, 196, 47, 128,
+ 200, 47, 128, 204, 47, 128, 208, 47, 128, 212, 47, 128, 216, 47, 128,
+ 220, 47, 128, 224, 47, 128, 228, 47, 128, 232, 47, 128, 236, 47, 128,
+ 240, 47, 128, 244, 47, 128, 248, 47, 128, 252, 47, 128, 128, 128, 48,
+ 128, 132, 48, 128, 136, 48, 128, 140, 48, 128, 144, 48, 128, 148, 48,
+ 128, 152, 48, 128, 156, 48, 128, 160, 48, 128, 164, 48, 128, 168, 48,
+ 128, 172, 48, 128, 176, 48, 128, 180, 48, 128, 184, 48, 128, 188, 48,
+ 128, 192, 48, 128, 196, 48, 128, 200, 48, 128, 204, 48, 128, 208, 48,
+ 128, 212, 48, 128, 216, 48, 128, 220, 48, 128, 224, 48, 128, 228, 48,
+ 128, 232, 48, 128, 236, 48, 128, 240, 48, 128, 244, 48, 128, 248, 48,
+ 128, 252, 48, 128, 128, 49, 128, 132, 49, 128, 136, 49, 128, 140, 49,
+ 128, 144, 49, 128, 148, 49, 128, 152, 49, 128, 156, 49, 128, 160, 49,
+ 128, 164, 49, 128, 168, 49, 128, 172, 49, 128, 176, 49, 128, 180, 49,
+ 128, 184, 49, 128, 188, 49, 128, 192, 49, 128, 196, 49, 128, 200, 49,
+ 128, 204, 49, 128, 208, 49, 128, 212, 49, 128, 216, 49, 128, 220, 49,
+ 128, 224, 49, 128, 228, 49, 128, 232, 49, 128, 236, 49, 128, 240, 49,
+ 128, 244, 49, 128, 248, 49, 128, 252, 49, 128, 128, 50, 128, 132, 50,
+ 128, 136, 50, 128, 140, 50, 128, 144, 50, 128, 148, 50, 128, 152, 50,
+ 128, 156, 50, 128, 160, 50, 128, 164, 50, 128, 168, 50, 128, 172, 50,
+ 128, 176, 50, 128, 180, 50, 128, 184, 50, 128, 188, 50, 128, 192, 50,
+ 128, 196, 50, 128, 200, 50, 128, 204, 50, 128, 208, 50, 128, 212, 50,
+ 128, 216, 50, 128, 220, 50, 128, 224, 50, 128, 228, 50, 128, 232, 50,
+ 128, 236, 50, 128, 240, 50, 128, 244, 50, 128, 248, 50, 128, 252, 50,
+ 128, 128, 51, 128, 132, 51, 128, 136, 51, 128, 140, 51, 128, 144, 51,
+ 128, 148, 51, 128, 152, 51, 128, 156, 51, 128, 160, 51, 128, 164, 51,
+ 128, 168, 51, 128, 172, 51, 128, 176, 51, 128, 180, 51, 128, 184, 51,
+ 128, 188, 51, 128, 192, 51, 128, 196, 51, 128, 200, 51, 128, 204, 51,
+ 128, 208, 51, 128, 212, 51, 128, 216, 51, 128, 220, 51, 128, 224, 51,
+ 128, 228, 51, 128, 232, 51, 128, 236, 51, 128, 240, 51, 128, 244, 51,
+ 128, 248, 51, 128, 252, 51, 128, 128, 128, 52, 128, 132, 52, 128, 136,
+ 52, 128, 140, 52, 128, 144, 52, 128, 148, 52, 128, 152, 52, 128, 156,
+ 52, 128, 160, 52, 128, 164, 52, 128, 168, 52, 128, 172, 52, 128, 176,
+ 52, 128, 180, 52, 128, 184, 52, 128, 188, 52, 128, 192, 52, 128, 196,
+ 52, 128, 200, 52, 128, 204, 52, 128, 208, 52, 128, 212, 52, 128, 216,
+ 52, 128, 220, 52, 128, 224, 52, 128, 228, 52, 128, 232, 52, 128, 236,
+ 52, 128, 240, 52, 128, 244, 52, 128, 248, 52, 128, 252, 52, 128, 128,
+ 53, 128, 132, 53, 128, 136, 53, 128, 140, 53, 128, 144, 53, 128, 148,
+ 53, 128, 152, 53, 128, 156, 53, 128, 160, 53, 128, 164, 53, 128, 168,
+ 53, 128, 172, 53, 128, 176, 53, 128, 180, 53, 128, 184, 53, 128, 188,
+ 53, 128, 192, 53, 128, 196, 53, 128, 200, 53, 128, 204, 53, 128, 208,
+ 53, 128, 212, 53, 128, 216, 53, 128, 220, 53, 128, 224, 53, 128, 228,
+ 53, 128, 232, 53, 128, 236, 53, 128, 240, 53, 128, 244, 53, 128, 248,
+ 53, 128, 252, 53, 128, 128, 54, 128, 132, 54, 128, 136, 54, 128, 140,
+ 54, 128, 144, 54, 128, 148, 54, 128, 152, 54, 128, 156, 54, 128, 160,
+ 54, 128, 164, 54, 128, 168, 54, 128, 172, 54, 128, 176, 54, 128, 180,
+ 54, 128, 184, 54, 128, 188, 54, 128, 192, 54, 128, 196, 54, 128, 200,
+ 54, 128, 204, 54, 128, 208, 54, 128, 212, 54, 128, 216, 54, 128, 220,
+ 54, 128, 224, 54, 128, 228, 54, 128, 232, 54, 128, 236, 54, 128, 240,
+ 54, 128, 244, 54, 128, 248, 54, 128, 252, 54, 128, 128, 55, 128, 132,
+ 55, 128, 136, 55, 128, 140, 55, 128, 144, 55, 128, 148, 55, 128, 152,
+ 55, 128, 156, 55, 128, 160, 55, 128, 164, 55, 128, 168, 55, 128, 172,
+ 55, 128, 176, 55, 128, 180, 55, 128, 184, 55, 128, 188, 55, 128, 192,
+ 55, 128, 196, 55, 128, 200, 55, 128, 204, 55, 128, 208, 55, 128, 212,
+ 55, 128, 216, 55, 128, 220, 55, 128, 224, 55, 128, 228, 55, 128, 232,
+ 55, 128, 236, 55, 128, 240, 55, 128, 244, 55, 128, 248, 55, 128, 252,
+ 55, 128, 128, 128, 56, 128, 132, 56, 128, 136, 56, 128, 140, 56, 128,
+ 144, 56, 128, 148, 56, 128, 152, 56, 128, 156, 56, 128, 160, 56, 128,
+ 164, 56, 128, 168, 56, 128, 172, 56, 128, 176, 56, 128, 180, 56, 128,
+ 184, 56, 128, 188, 56, 128, 192, 56, 128, 196, 56, 128, 200, 56, 128,
+ 204, 56, 128, 208, 56, 128, 212, 56, 128, 216, 56, 128, 220, 56, 128,
+ 224, 56, 128, 228, 56, 128, 232, 56, 128, 236, 56, 128, 240, 56, 128,
+ 244, 56, 128, 248, 56, 128, 252, 56, 128, 128, 57, 128, 132, 57, 128,
+ 136, 57, 128, 140, 57, 128, 144, 57, 128, 148, 57, 128, 152, 57, 128,
+ 156, 57, 128, 160, 57, 128, 164, 57, 128, 168, 57, 128, 172, 57, 128,
+ 176, 57, 128, 180, 57, 128, 184, 57, 128, 188, 57, 128, 192, 57, 128,
+ 196, 57, 128, 200, 57, 128, 204, 57, 128, 208, 57, 128, 212, 57, 128,
+ 216, 57, 128, 220, 57, 128, 224, 57, 128, 228, 57, 128, 232, 57, 128,
+ 236, 57, 128, 240, 57, 128, 244, 57, 128, 248, 57, 128, 252, 57, 128,
+ 128, 58, 128, 132, 58, 128, 136, 58, 128, 140, 58, 128, 144, 58, 128,
+ 148, 58, 128, 152, 58, 128, 156, 58, 128, 160, 58, 128, 164, 58, 128,
+ 168, 58, 128, 172, 58, 128, 176, 58, 128, 180, 58, 128, 184, 58, 128,
+ 188, 58, 128, 192, 58, 128, 196, 58, 128, 200, 58, 128, 204, 58, 128,
+ 208, 58, 128, 212, 58, 128, 216, 58, 128, 220, 58, 128, 224, 58, 128,
+ 228, 58, 128, 232, 58, 128, 236, 58, 128, 240, 58, 128, 244, 58, 128,
+ 248, 58, 128, 252, 58, 128, 128, 59, 128, 132, 59, 128, 136, 59, 128,
+ 140, 59, 128, 144, 59, 128, 148, 59, 128, 152, 59, 128, 156, 59, 128,
+ 160, 59, 128, 164, 59, 128, 168, 59, 128, 172, 59, 128, 176, 59, 128,
+ 180, 59, 128, 184, 59, 128, 188, 59, 128, 192, 59, 128, 196, 59, 128,
+ 200, 59, 128, 204, 59, 128, 208, 59, 128, 212, 59, 128, 216, 59, 128,
+ 220, 59, 128, 224, 59, 128, 228, 59, 128, 232, 59, 128, 236, 59, 128,
+ 240, 59, 128, 244, 59, 128, 248, 59, 128, 252, 59, 128, 128, 128, 60,
+ 128, 132, 60, 128, 136, 60, 128, 140, 60, 128, 144, 60, 128, 148, 60,
+ 128, 152, 60, 128, 156, 60, 128, 160, 60, 128, 164, 60, 128, 168, 60,
+ 128, 172, 60, 128, 176, 60, 128, 180, 60, 128, 184, 60, 128, 188, 60,
+ 128, 192, 60, 128, 196, 60, 128, 200, 60, 128, 204, 60, 128, 208, 60,
+ 128, 212, 60, 128, 216, 60, 128, 220, 60, 128, 224, 60, 128, 228, 60,
+ 128, 232, 60, 128, 236, 60, 128, 240, 60, 128, 244, 60, 128, 248, 60,
+ 128, 252, 60, 128, 128, 61, 128, 132, 61, 128, 136, 61, 128, 140, 61,
+ 128, 144, 61, 128, 148, 61, 128, 152, 61, 128, 156, 61, 128, 160, 61,
+ 128, 164, 61, 128, 168, 61, 128, 172, 61, 128, 176, 61, 128, 180, 61,
+ 128, 184, 61, 128, 188, 61, 128, 192, 61, 128, 196, 61, 128, 200, 61,
+ 128, 204, 61, 128, 208, 61, 128, 212, 61, 128, 216, 61, 128, 220, 61,
+ 128, 224, 61, 128, 228, 61, 128, 232, 61, 128, 236, 61, 128, 240, 61,
+ 128, 244, 61, 128, 248, 61, 128, 252, 61, 128, 128, 62, 128, 132, 62,
+ 128, 136, 62, 128, 140, 62, 128, 144, 62, 128, 148, 62, 128, 152, 62,
+ 128, 156, 62, 128, 160, 62, 128, 164, 62, 128, 168, 62, 128, 172, 62,
+ 128, 176, 62, 128, 180, 62, 128, 184, 62, 128, 188, 62, 128, 192, 62,
+ 128, 196, 62, 128, 200, 62, 128, 204, 62, 128, 208, 62, 128, 212, 62,
+ 128, 216, 62, 128, 220, 62, 128, 224, 62, 128, 228, 62, 128, 232, 62,
+ 128, 236, 62, 128, 240, 62, 128, 244, 62, 128, 248, 62, 128, 252, 62,
+ 128, 128, 63, 128, 132, 63, 128, 136, 63, 128, 140, 63, 128, 144, 63,
+ 128, 148, 63, 128, 152, 63, 128, 156, 63, 128, 160, 63, 128, 164, 63,
+ 128, 168, 63, 128, 172, 63, 128, 176, 63, 128, 180, 63, 128, 184, 63,
+ 128, 188, 63, 128, 192, 63, 128, 196, 63, 128, 200, 63, 128, 204, 63,
+ 128, 208, 63, 128, 212, 63, 128, 216, 63, 128, 220, 63, 128, 224, 63,
+ 128, 228, 63, 128, 232, 63, 128, 236, 63, 128, 240, 63, 128, 244, 63,
+ 128, 248, 63, 128, 252, 63};
+ SeekableInputStream* const stream =
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer));
+ std::unique_ptr<RleDecoder> rle =
+ createRleDecoder(std::unique_ptr<SeekableInputStream>(stream), true,
+ RleVersion_1, *dbcommon::getDefaultPool());
+ std::vector<int64_t> data(1);
+ for (size_t i = 0; i < 2048; i += 10) {  // sample every 10th value
+ rle->next(data.data(), 1, nullptr);
+ if (i < 1024) {
+ EXPECT_EQ(i, data[0]) << "Wrong output at " << i;
+ } else {
+ EXPECT_EQ(256 * i, data[0]) << "Wrong output at " << i;
+ }
+ if (i < 2038) {  // the last sample (i == 2040) has fewer than 9 values left
+ rle->skip(9);  // jump over the 9 values between two samples
+ }
+ rle->skip(0);  // zero-length skip; the next read must still yield value i + 10
+ }
+}
+
+TEST(RLEv1, seekTest) {
+ // Create the RLE stream from Java's
+ // TestRunLengthIntegerEncoding.testUncompressedSeek
+ // for (size_t i = 0; i < 1024; ++i)
+ // out.write(i / 4);
+ // for (size_t i = 1024; i < 2048; ++i)
+ // out.write(2 * i);
+ // for (size_t i = 0; i < 2048; ++i)
+ // out.write(junk[i]);
+ // This causes the first half to be delta encoded and the second half to
+ // be literal encoded.
+ const unsigned char buffer[] = {
+ 1, 0, 0, 1, 0, 2, 1, 0, 4, 1, 0, 6, 1, 0, 8,
+ 1, 0, 10, 1, 0, 12, 1, 0, 14, 1, 0, 16, 1, 0, 18,
+ 1, 0, 20, 1, 0, 22, 1, 0, 24, 1, 0, 26, 1, 0, 28,
+ 1, 0, 30, 1, 0, 32, 1, 0, 34, 1, 0, 36, 1, 0, 38,
+ 1, 0, 40, 1, 0, 42, 1, 0, 44, 1, 0, 46, 1, 0, 48,
+ 1, 0, 50, 1, 0, 52, 1, 0, 54, 1, 0, 56, 1, 0, 58,
+ 1, 0, 60, 1, 0, 62, 1, 0, 64, 1, 0, 66, 1, 0, 68,
+ 1, 0, 70, 1, 0, 72, 1, 0, 74, 1, 0, 76, 1, 0, 78,
+ 1, 0, 80, 1, 0, 82, 1, 0, 84, 1, 0, 86, 1, 0, 88,
+ 1, 0, 90, 1, 0, 92, 1, 0, 94, 1, 0, 96, 1, 0, 98,
+ 1, 0, 100, 1, 0, 102, 1, 0, 104, 1, 0, 106, 1, 0, 108,
+ 1, 0, 110, 1, 0, 112, 1, 0, 114, 1, 0, 116, 1, 0, 118,
+ 1, 0, 120, 1, 0, 122, 1, 0, 124, 1, 0, 126, 1, 0, 128,
+ 1, 1, 0, 130, 1, 1, 0, 132, 1, 1, 0, 134, 1, 1, 0,
+ 136, 1, 1, 0, 138, 1, 1, 0, 140, 1, 1, 0, 142, 1, 1,
+ 0, 144, 1, 1, 0, 146, 1, 1, 0, 148, 1, 1, 0, 150, 1,
+ 1, 0, 152, 1, 1, 0, 154, 1, 1, 0, 156, 1, 1, 0, 158,
+ 1, 1, 0, 160, 1, 1, 0, 162, 1, 1, 0, 164, 1, 1, 0,
+ 166, 1, 1, 0, 168, 1, 1, 0, 170, 1, 1, 0, 172, 1, 1,
+ 0, 174, 1, 1, 0, 176, 1, 1, 0, 178, 1, 1, 0, 180, 1,
+ 1, 0, 182, 1, 1, 0, 184, 1, 1, 0, 186, 1, 1, 0, 188,
+ 1, 1, 0, 190, 1, 1, 0, 192, 1, 1, 0, 194, 1, 1, 0,
+ 196, 1, 1, 0, 198, 1, 1, 0, 200, 1, 1, 0, 202, 1, 1,
+ 0, 204, 1, 1, 0, 206, 1, 1, 0, 208, 1, 1, 0, 210, 1,
+ 1, 0, 212, 1, 1, 0, 214, 1, 1, 0, 216, 1, 1, 0, 218,
+ 1, 1, 0, 220, 1, 1, 0, 222, 1, 1, 0, 224, 1, 1, 0,
+ 226, 1, 1, 0, 228, 1, 1, 0, 230, 1, 1, 0, 232, 1, 1,
+ 0, 234, 1, 1, 0, 236, 1, 1, 0, 238, 1, 1, 0, 240, 1,
+ 1, 0, 242, 1, 1, 0, 244, 1, 1, 0, 246, 1, 1, 0, 248,
+ 1, 1, 0, 250, 1, 1, 0, 252, 1, 1, 0, 254, 1, 1, 0,
+ 128, 2, 1, 0, 130, 2, 1, 0, 132, 2, 1, 0, 134, 2, 1,
+ 0, 136, 2, 1, 0, 138, 2, 1, 0, 140, 2, 1, 0, 142, 2,
+ 1, 0, 144, 2, 1, 0, 146, 2, 1, 0, 148, 2, 1, 0, 150,
+ 2, 1, 0, 152, 2, 1, 0, 154, 2, 1, 0, 156, 2, 1, 0,
+ 158, 2, 1, 0, 160, 2, 1, 0, 162, 2, 1, 0, 164, 2, 1,
+ 0, 166, 2, 1, 0, 168, 2, 1, 0, 170, 2, 1, 0, 172, 2,
+ 1, 0, 174, 2, 1, 0, 176, 2, 1, 0, 178, 2, 1, 0, 180,
+ 2, 1, 0, 182, 2, 1, 0, 184, 2, 1, 0, 186, 2, 1, 0,
+ 188, 2, 1, 0, 190, 2, 1, 0, 192, 2, 1, 0, 194, 2, 1,
+ 0, 196, 2, 1, 0, 198, 2, 1, 0, 200, 2, 1, 0, 202, 2,
+ 1, 0, 204, 2, 1, 0, 206, 2, 1, 0, 208, 2, 1, 0, 210,
+ 2, 1, 0, 212, 2, 1, 0, 214, 2, 1, 0, 216, 2, 1, 0,
+ 218, 2, 1, 0, 220, 2, 1, 0, 222, 2, 1, 0, 224, 2, 1,
+ 0, 226, 2, 1, 0, 228, 2, 1, 0, 230, 2, 1, 0, 232, 2,
+ 1, 0, 234, 2, 1, 0, 236, 2, 1, 0, 238, 2, 1, 0, 240,
+ 2, 1, 0, 242, 2, 1, 0, 244, 2, 1, 0, 246, 2, 1, 0,
+ 248, 2, 1, 0, 250, 2, 1, 0, 252, 2, 1, 0, 254, 2, 1,
+ 0, 128, 3, 1, 0, 130, 3, 1, 0, 132, 3, 1, 0, 134, 3,
+ 1, 0, 136, 3, 1, 0, 138, 3, 1, 0, 140, 3, 1, 0, 142,
+ 3, 1, 0, 144, 3, 1, 0, 146, 3, 1, 0, 148, 3, 1, 0,
+ 150, 3, 1, 0, 152, 3, 1, 0, 154, 3, 1, 0, 156, 3, 1,
+ 0, 158, 3, 1, 0, 160, 3, 1, 0, 162, 3, 1, 0, 164, 3,
+ 1, 0, 166, 3, 1, 0, 168, 3, 1, 0, 170, 3, 1, 0, 172,
+ 3, 1, 0, 174, 3, 1, 0, 176, 3, 1, 0, 178, 3, 1, 0,
+ 180, 3, 1, 0, 182, 3, 1, 0, 184, 3, 1, 0, 186, 3, 1,
+ 0, 188, 3, 1, 0, 190, 3, 1, 0, 192, 3, 1, 0, 194, 3,
+ 1, 0, 196, 3, 1, 0, 198, 3, 1, 0, 200, 3, 1, 0, 202,
+ 3, 1, 0, 204, 3, 1, 0, 206, 3, 1, 0, 208, 3, 1, 0,
+ 210, 3, 1, 0, 212, 3, 1, 0, 214, 3, 1, 0, 216, 3, 1,
+ 0, 218, 3, 1, 0, 220, 3, 1, 0, 222, 3, 1, 0, 224, 3,
+ 1, 0, 226, 3, 1, 0, 228, 3, 1, 0, 230, 3, 1, 0, 232,
+ 3, 1, 0, 234, 3, 1, 0, 236, 3, 1, 0, 238, 3, 1, 0,
+ 240, 3, 1, 0, 242, 3, 1, 0, 244, 3, 1, 0, 246, 3, 1,
+ 0, 248, 3, 1, 0, 250, 3, 1, 0, 252, 3, 1, 0, 254, 3,
+ 127, 2, 128, 32, 127, 2, 136, 36, 127, 2, 144, 40, 127, 2, 152,
+ 44, 127, 2, 160, 48, 127, 2, 168, 52, 127, 2, 176, 56, 111, 2,
+ 184, 60, 128, 147, 150, 232, 240, 8, 168, 134, 179, 187, 12, 246, 145,
+ 173, 142, 11, 241, 162, 190, 162, 9, 239, 218, 128, 243, 5, 202, 175,
+ 131, 196, 12, 151, 253, 204, 160, 4, 229, 167, 247, 255, 12, 255, 177,
+ 140, 184, 7, 188, 145, 181, 229, 1, 178, 190, 158, 163, 8, 147, 179,
+ 151, 132, 8, 150, 133, 222, 129, 11, 193, 218, 187, 242, 14, 181, 177,
+ 154, 155, 9, 150, 145, 194, 135, 8, 186, 222, 142, 242, 10, 140, 195,
+ 254, 237, 11, 141, 189, 143, 198, 14, 229, 146, 237, 203, 8, 251, 162,
+ 179, 211, 3, 222, 237, 175, 145, 13, 221, 178, 163, 162, 3, 211, 192,
+ 165, 189, 14, 230, 228, 168, 250, 4, 141, 140, 247, 178, 7, 143, 164,
+ 170, 152, 2, 131, 166, 136, 26, 171, 143, 232, 134, 12, 158, 239, 246,
+ 204, 11, 133, 128, 213, 223, 14, 255, 213, 190, 250, 15, 143, 162, 252,
+ 157, 4, 204, 181, 135, 245, 7, 206, 241, 254, 136, 4, 184, 182, 211,
+ 190, 15, 172, 156, 202, 135, 10, 249, 180, 139, 131, 4, 202, 128, 204,
+ 221, 9, 131, 247, 166, 249, 8, 141, 236, 241, 185, 3, 128, 229, 150,
+ 186, 2, 237, 189, 141, 218, 9, 193, 240, 241, 156, 3, 210, 142, 198,
+ 202, 10, 227, 241, 194, 234, 7, 145, 180, 228, 254, 6, 171, 249, 185,
+ 188, 11, 215, 135, 224, 219, 4, 133, 132, 178, 165, 7, 205, 180, 133,
+ 209, 11, 198, 253, 246, 145, 12, 190, 194, 153, 146, 8, 139, 220, 235,
+ 249, 1, 170, 203, 205, 159, 6, 136, 130, 154, 166, 14, 250, 189, 153,
+ 191, 7, 178, 163, 191, 158, 12, 251, 138, 135, 245, 10, 175, 249, 219,
+ 164, 14, 136, 185, 220, 188, 7, 170, 135, 221, 146, 7, 209, 224, 204,
+ 171, 11, 216, 144, 236, 172, 1, 133, 205, 202, 170, 6, 215, 250, 133,
+ 181, 3, 181, 133, 142, 158, 5, 166, 192, 134, 238, 13, 246, 243, 233,
+ 218, 12, 163, 202, 238, 241, 14, 241, 214, 224, 215, 2, 212, 192, 237,
+ 243, 10, 163, 165, 163, 206, 6, 159, 161, 227, 152, 14, 209, 234, 225,
+ 249, 13, 167, 206, 188, 161, 3, 143, 209, 188, 214, 11, 184, 224, 210,
+ 200, 10, 185, 171, 199, 183, 3, 177, 229, 245, 86, 255, 183, 178, 142,
+ 9, 232, 209, 135, 151, 8, 191, 153, 174, 175, 7, 190, 245, 224, 174,
+ 9, 243, 165, 145, 169, 1, 145, 161, 221, 249, 13, 195, 221, 244, 240,
+ 5, 157, 156, 217, 237, 15, 143, 201, 155, 207, 5, 169, 136, 192, 238,
+ 12, 135, 223, 244, 200, 2, 137, 228, 167, 187, 1, 134, 212, 158, 155,
+ 15, 186, 224, 212, 214, 7, 193, 141, 216, 241, 2, 246, 159, 138, 117,
+ 216, 230, 215, 29, 204, 178, 147, 255, 8, 195, 140, 136, 164, 11, 234,
+ 204, 155, 222, 10, 193, 156, 138, 187, 8, 161, 161, 184, 212, 1, 128,
+ 141, 162, 133, 13, 180, 211, 132, 210, 9, 239, 203, 201, 177, 5, 236,
+ 191, 140, 207, 13, 173, 205, 192, 186, 7, 179, 214, 222, 136, 8, 189,
+ 142, 204, 152, 5, 221, 176, 135, 241, 1, 223, 146, 195, 166, 11, 146,
+ 133, 226, 137, 6, 150, 243, 247, 1, 153, 246, 184, 42, 234, 194, 229,
+ 98, 237, 144, 253, 133, 11, 196, 131, 158, 244, 6, 218, 149, 253, 221,
+ 7, 219, 180, 234, 156, 10, 179, 255, 197, 218, 13, 150, 137, 240, 204,
+ 9, 240, 185, 181, 203, 2, 160, 194, 146, 246, 5, 131, 168, 191, 138,
+ 4, 158, 245, 240, 150, 15, 157, 202, 136, 14, 135, 154, 226, 240, 5,
+ 153, 168, 212, 222, 8, 128, 218, 198, 244, 133, 13, 183, 245, 153, 118,
+ 139, 141, 238, 141, 1, 235, 193, 197, 5, 169, 141, 210, 62, 231, 186,
+ 238, 219, 6, 141, 243, 204, 242, 12, 172, 165, 150, 187, 13, 163, 254,
+ 250, 230, 12, 203, 166, 166, 223, 3, 177, 155, 168, 182, 4, 213, 130,
+ 148, 221, 3, 150, 178, 146, 235, 6, 149, 226, 237, 225, 2, 177, 149,
+ 218, 10, 205, 241, 161, 21, 186, 239, 197, 189, 15, 132, 249, 249, 171,
+ 5, 130, 223, 220, 167, 5, 171, 235, 129, 84, 207, 145, 246, 231, 2,
+ 183, 176, 230, 148, 11, 180, 142, 254, 128, 1, 171, 251, 177, 177, 1,
+ 188, 190, 157, 222, 11, 140, 195, 192, 141, 10, 200, 139, 160, 247, 9,
+ 139, 247, 194, 144, 1, 160, 160, 234, 208, 11, 174, 210, 150, 196, 15,
+ 209, 201, 176, 208, 14, 199, 183, 218, 132, 8, 175, 143, 188, 168, 7,
+ 172, 234, 158, 248, 11, 192, 223, 160, 152, 7, 178, 134, 130, 235, 3,
+ 243, 134, 181, 181, 4, 225, 135, 251, 236, 7, 203, 166, 149, 169, 10,
+ 181, 213, 156, 193, 12, 239, 138, 235, 252, 2, 183, 243, 201, 133, 10,
+ 137, 186, 227, 237, 13, 255, 188, 221, 148, 14, 188, 156, 198, 143, 15,
+ 223, 224, 252, 208, 9, 160, 241, 190, 221, 13, 195, 241, 163, 241, 9,
+ 199, 253, 138, 163, 12, 173, 251, 143, 133, 12, 167, 246, 153, 247, 14,
+ 237, 223, 140, 174, 14, 219, 229, 138, 242, 2, 200, 163, 210, 86, 197,
+ 251, 199, 241, 9, 243, 211, 209, 132, 3, 178, 176, 152, 224, 13, 195,
+ 131, 248, 159, 5, 194, 255, 160, 171, 14, 145, 243, 143, 173, 3, 222,
+ 168, 246, 134, 2, 178, 145, 204, 240, 1, 176, 240, 236, 165, 14, 254,
+ 145, 162, 165, 8, 243, 173, 131, 238, 3, 247, 192, 235, 163, 4, 244,
+ 239, 180, 203, 15, 214, 167, 152, 233, 13, 176, 158, 206, 235, 9, 252,
+ 150, 228, 160, 13, 148, 243, 234, 239, 2, 225, 152, 250, 167, 5, 252,
+ 143, 229, 254, 4, 184, 202, 161, 157, 14, 233, 190, 185, 195, 9, 159,
+ 223, 240, 216, 11, 132, 172, 243, 200, 6, 212, 182, 191, 194, 13, 230,
+ 245, 240, 130, 12, 189, 146, 233, 239, 2, 155, 190, 214, 183, 15, 159,
+ 222, 148, 155, 13, 195, 158, 248, 112, 224, 219, 145, 234, 12, 145, 169,
+ 172, 135, 10, 234, 184, 245, 220, 4, 138, 150, 232, 212, 5, 132, 195,
+ 135, 214, 5, 181, 247, 216, 205, 12, 239, 160, 183, 178, 9, 161, 143,
+ 210, 206, 11, 248, 209, 207, 94, 166, 178, 165, 97, 133, 162, 246, 212,
+ 9, 206, 240, 235, 156, 1, 200, 228, 176, 252, 12, 163, 215, 219, 141,
+ 1, 236, 133, 216, 202, 9, 220, 170, 222, 242, 10, 239, 203, 197, 220,
+ 11, 148, 218, 209, 161, 7, 185, 175, 210, 171, 15, 153, 213, 208, 214,
+ 15, 188, 239, 128, 244, 13, 141, 220, 136, 166, 12, 150, 148, 250, 175,
+ 13, 130, 145, 226, 216, 1, 216, 204, 215, 193, 9, 191, 211, 181, 229,
+ 14, 233, 168, 165, 9, 240, 188, 146, 132, 12, 173, 220, 201, 244, 4,
+ 140, 147, 190, 199, 15, 190, 213, 175, 213, 1, 254, 212, 239, 171, 10,
+ 200, 161, 168, 144, 10, 161, 188, 230, 163, 6, 192, 198, 213, 167, 3,
+ 240, 251, 180, 243, 5, 202, 165, 247, 147, 7, 173, 191, 133, 228, 3,
+ 229, 139, 154, 210, 7, 147, 254, 164, 236, 13, 162, 214, 180, 128, 8,
+ 202, 176, 252, 143, 13, 154, 179, 169, 149, 3, 169, 156, 168, 229, 1,
+ 164, 128, 214, 138, 15, 128, 239, 253, 160, 181, 2, 232, 203, 196, 235,
+ 11, 181, 153, 131, 240, 12, 145, 178, 179, 206, 12, 134, 244, 215, 141,
+ 10, 138, 228, 171, 244, 7, 246, 160, 221, 177, 14, 176, 231, 208, 135,
+ 9, 194, 210, 159, 234, 2, 238, 250, 139, 146, 10, 249, 191, 224, 241,
+ 10, 250, 140, 140, 147, 5, 190, 185, 216, 220, 15, 248, 131, 153, 236,
+ 9, 140, 219, 183, 252, 14, 254, 184, 223, 216, 14, 253, 211, 235, 254,
+ 14, 252, 180, 147, 152, 9, 147, 221, 188, 174, 1, 222, 219, 180, 185,
+ 12, 185, 175, 244, 136, 9, 214, 147, 217, 182, 4, 191, 193, 233, 157,
+ 2, 238, 191, 156, 211, 14, 229, 221, 129, 224, 2, 230, 212, 248, 128,
+ 3, 186, 165, 136, 84, 129, 216, 148, 139, 15, 150, 231, 196, 184, 8,
+ 160, 156, 253, 171, 2, 156, 198, 161, 183, 11, 164, 181, 155, 137, 8,
+ 133, 196, 192, 213, 6, 140, 174, 143, 152, 12, 142, 202, 143, 192, 9,
+ 128, 167, 234, 152, 13, 214, 131, 156, 246, 14, 167, 223, 250, 135, 4,
+ 233, 185, 236, 128, 1, 138, 131, 251, 181, 9, 184, 141, 213, 136, 15,
+ 171, 224, 222, 192, 12, 244, 168, 162, 144, 1, 212, 183, 184, 200, 9,
+ 177, 193, 168, 174, 14, 249, 175, 129, 197, 1, 142, 181, 130, 162, 10,
+ 214, 197, 196, 214, 4, 148, 146, 228, 202, 13, 213, 154, 241, 127, 165,
+ 166, 144, 164, 4, 205, 251, 139, 128, 13, 244, 188, 143, 236, 12, 190,
+ 247, 138, 217, 8, 185, 201, 217, 187, 4, 130, 142, 167, 137, 4, 139,
+ 185, 215, 95, 136, 170, 224, 218, 9, 154, 158, 177, 200, 15, 227, 154,
+ 189, 136, 15, 224, 233, 220, 179, 3, 227, 203, 160, 188, 7, 236, 228,
+ 239, 162, 15, 214, 227, 159, 242, 4, 151, 252, 232, 42, 151, 166, 168,
+ 245, 3, 135, 180, 250, 243, 15, 167, 254, 137, 160, 13, 214, 240, 225,
+ 152, 8, 190, 229, 204, 136, 13, 150, 219, 186, 10, 163, 249, 225, 249,
+ 6, 215, 233, 254, 162, 9, 171, 204, 237, 189, 5, 229, 137, 174, 157,
+ 6, 135, 205, 140, 164, 10, 189, 136, 130, 244, 1, 210, 222, 223, 247,
+ 1, 189, 128, 142, 203, 12, 232, 241, 180, 195, 12, 237, 228, 243, 183,
+ 7, 218, 155, 204, 158, 14, 235, 167, 134, 183, 6, 171, 218, 141, 128,
+ 3, 184, 152, 251, 187, 10, 138, 217, 169, 182, 2, 210, 140, 240, 138,
+ 7, 150, 156, 232, 128, 9, 209, 231, 181, 174, 14, 243, 210, 173, 34,
+ 220, 254, 188, 199, 14, 245, 195, 226, 124, 141, 228, 248, 228, 15, 158,
+ 166, 194, 150, 6, 152, 220, 238, 252, 13, 179, 132, 217, 220, 15, 213,
+ 168, 186, 245, 4, 241, 243, 200, 226, 10, 216, 178, 141, 137, 13, 134,
+ 176, 169, 179, 6, 212, 242, 197, 75, 175, 222, 238, 237, 10, 185, 143,
+ 171, 166, 6, 180, 198, 129, 170, 5, 159, 129, 176, 134, 11, 130, 248,
+ 213, 183, 12, 204, 162, 169, 238, 8, 139, 139, 145, 227, 15, 232, 239,
+ 206, 163, 3, 145, 157, 143, 183, 10, 250, 190, 179, 189, 3, 185, 138,
+ 211, 215, 3, 179, 147, 158, 165, 13, 231, 226, 199, 245, 11, 147, 179,
+ 178, 190, 1, 208, 217, 154, 195, 14, 226, 194, 229, 142, 8, 198, 175,
+ 184, 231, 4, 199, 198, 191, 24, 184, 134, 226, 231, 10, 152, 208, 222,
+ 254, 1, 134, 167, 234, 69, 175, 214, 177, 218, 3, 218, 234, 128, 162,
+ 3, 160, 177, 187, 166, 3, 201, 210, 191, 159, 13, 240, 152, 160, 250,
+ 6, 235, 130, 214, 240, 11, 128, 237, 251, 245, 225, 3, 245, 237, 174,
+ 230, 9, 252, 148, 229, 201, 7, 152, 148, 165, 153, 7, 223, 238, 242,
+ 16, 156, 212, 237, 228, 7, 139, 153, 178, 37, 219, 217, 217, 172, 15,
+ 178, 168, 128, 199, 9, 236, 189, 144, 226, 12, 214, 248, 134, 230, 13,
+ 163, 252, 247, 55, 239, 252, 149, 196, 3, 230, 159, 214, 139, 6, 132,
+ 200, 241, 154, 2, 129, 231, 153, 173, 12, 235, 131, 255, 157, 2, 246,
+ 190, 145, 55, 205, 201, 240, 141, 9, 188, 202, 199, 189, 6, 196, 235,
+ 245, 205, 11, 249, 253, 241, 223, 6, 187, 250, 137, 241, 9, 133, 135,
+ 168, 146, 8, 132, 248, 219, 156, 8, 132, 241, 185, 4, 198, 209, 147,
+ 129, 11, 229, 192, 218, 178, 4, 199, 210, 138, 166, 13, 244, 148, 172,
+ 141, 2, 194, 215, 171, 220, 1, 192, 248, 230, 128, 2, 238, 167, 209,
+ 222, 11, 240, 200, 227, 150, 11, 182, 217, 170, 158, 14, 223, 223, 254,
+ 201, 10, 140, 164, 245, 175, 2, 178, 140, 153, 102, 139, 145, 181, 242,
+ 8, 188, 154, 214, 154, 15, 149, 187, 204, 192, 2, 223, 153, 219, 51,
+ 245, 236, 130, 133, 5, 197, 138, 169, 80, 243, 162, 164, 167, 1, 206,
+ 232, 180, 137, 12, 180, 191, 164, 226, 8, 162, 180, 231, 222, 13, 184,
+ 143, 156, 74, 134, 230, 248, 219, 10, 203, 156, 149, 205, 1, 219, 205,
+ 173, 167, 10, 174, 146, 180, 141, 7, 214, 231, 229, 231, 10, 181, 246,
+ 174, 180, 15, 236, 175, 222, 241, 7, 191, 150, 253, 209, 8, 233, 139,
+ 167, 149, 13, 142, 249, 150, 223, 10, 220, 151, 135, 222, 5, 138, 228,
+ 133, 131, 4, 232, 183, 160, 245, 3, 157, 219, 209, 200, 5, 159, 242,
+ 142, 148, 13, 241, 207, 248, 177, 11, 179, 226, 169, 150, 13, 169, 201,
+ 212, 218, 8, 172, 214, 220, 31, 155, 173, 251, 231, 12, 221, 150, 137,
+ 174, 15, 146, 137, 251, 255, 14, 245, 216, 203, 138, 1, 163, 170, 194,
+ 133, 12, 205, 157, 188, 131, 12, 184, 220, 161, 97, 162, 240, 190, 243,
+ 2, 213, 134, 147, 251, 3, 178, 160, 193, 188, 14, 214, 153, 226, 140,
+ 12, 191, 208, 235, 174, 13, 138, 188, 204, 236, 11, 214, 135, 129, 235,
+ 10, 198, 242, 226, 128, 11, 154, 219, 163, 144, 7, 236, 134, 217, 197,
+ 2, 181, 248, 144, 157, 8, 150, 174, 195, 224, 12, 156, 247, 234, 192,
+ 7, 156, 206, 174, 246, 2, 181, 214, 138, 155, 1, 246, 242, 141, 152,
+ 9, 207, 157, 139, 243, 1, 153, 135, 158, 249, 6, 162, 129, 144, 170,
+ 13, 227, 162, 245, 246, 1, 130, 237, 192, 208, 13, 187, 165, 153, 215,
+ 8, 178, 141, 203, 163, 15, 172, 179, 180, 172, 10, 206, 200, 237, 194,
+ 12, 129, 235, 165, 143, 7, 129, 230, 217, 244, 8, 223, 249, 152, 233,
+ 2, 160, 224, 204, 187, 10, 167, 211, 138, 247, 7, 207, 204, 131, 200,
+ 1, 207, 240, 161, 219, 9, 219, 213, 129, 183, 11, 186, 163, 243, 198,
+ 13, 217, 197, 175, 218, 8, 195, 228, 209, 137, 1, 149, 253, 193, 190,
+ 8, 216, 231, 225, 190, 15, 244, 168, 191, 152, 6, 180, 210, 162, 198,
+ 9, 172, 159, 195, 158, 9, 173, 151, 226, 34, 143, 231, 162, 212, 6,
+ 250, 171, 192, 187, 11, 229, 212, 155, 156, 9, 234, 159, 165, 254, 8,
+ 180, 154, 227, 197, 3, 175, 158, 214, 235, 8, 164, 157, 160, 130, 4,
+ 158, 223, 243, 254, 10, 178, 236, 213, 212, 12, 194, 173, 185, 159, 6,
+ 184, 214, 195, 172, 5, 128, 161, 203, 183, 194, 10, 207, 218, 209, 222,
+ 12, 136, 166, 226, 224, 3, 148, 153, 145, 214, 4, 164, 178, 253, 243,
+ 4, 173, 162, 237, 129, 4, 236, 134, 193, 169, 14, 140, 234, 164, 190,
+ 7, 211, 148, 252, 223, 8, 213, 149, 180, 170, 12, 194, 182, 191, 205,
+ 15, 206, 233, 190, 211, 2, 241, 136, 223, 152, 12, 184, 185, 231, 176,
+ 10, 201, 166, 182, 211, 4, 209, 201, 205, 235, 1, 141, 184, 205, 173,
+ 15, 244, 222, 218, 113, 175, 190, 179, 140, 4, 234, 232, 231, 183, 8,
+ 174, 167, 140, 130, 9, 169, 157, 136, 196, 14, 187, 244, 242, 135, 7,
+ 248, 183, 178, 253, 10, 135, 216, 152, 153, 15, 226, 223, 172, 161, 11,
+ 236, 183, 231, 216, 3, 183, 169, 209, 137, 13, 130, 219, 233, 167, 4,
+ 168, 132, 197, 161, 7, 164, 146, 152, 207, 4, 239, 229, 147, 130, 2,
+ 172, 156, 244, 148, 6, 171, 253, 185, 213, 4, 184, 181, 241, 207, 1,
+ 144, 250, 219, 222, 1, 213, 189, 209, 177, 10, 207, 252, 251, 239, 9,
+ 181, 132, 203, 147, 6, 159, 135, 181, 18, 215, 252, 202, 234, 7, 207,
+ 215, 210, 222, 12, 195, 211, 185, 171, 14, 178, 132, 165, 140, 9, 139,
+ 160, 171, 250, 1, 248, 176, 203, 170, 14, 148, 184, 131, 141, 4, 158,
+ 226, 204, 197, 3, 215, 157, 148, 219, 15, 228, 206, 156, 132, 3, 234,
+ 206, 202, 231, 8, 232, 177, 135, 215, 10, 173, 253, 176, 172, 5, 144,
+ 188, 170, 229, 14, 200, 165, 144, 50, 198, 153, 206, 184, 3, 150, 128,
+ 128, 141, 14, 155, 221, 221, 199, 12, 229, 199, 160, 156, 3, 176, 172,
+ 200, 97, 222, 255, 134, 158, 9, 233, 155, 199, 193, 14, 146, 216, 186,
+ 250, 13, 156, 152, 194, 212, 8, 254, 190, 240, 232, 2, 178, 210, 194,
+ 160, 3, 142, 216, 141, 184, 10, 173, 210, 214, 187, 2, 161, 211, 201,
+ 143, 5, 213, 149, 210, 222, 15, 134, 165, 184, 171, 9, 211, 175, 153,
+ 241, 9, 227, 201, 184, 213, 1, 173, 225, 213, 176, 13, 143, 228, 200,
+ 151, 12, 224, 224, 224, 186, 8, 188, 153, 234, 254, 7, 137, 188, 238,
+ 186, 8, 166, 236, 135, 180, 13, 202, 174, 133, 194, 13, 179, 243, 158,
+ 193, 13, 210, 173, 128, 149, 2, 208, 216, 158, 168, 13, 205, 251, 152,
+ 230, 3, 245, 245, 254, 163, 9, 211, 243, 234, 164, 9, 173, 221, 221,
+ 215, 4, 146, 220, 209, 198, 1, 235, 237, 170, 130, 7, 181, 227, 149,
+ 141, 2, 170, 245, 149, 217, 5, 153, 179, 215, 195, 14, 249, 206, 140,
+ 148, 1, 247, 200, 219, 152, 15, 165, 228, 197, 152, 11, 234, 192, 242,
+ 244, 6, 217, 229, 173, 147, 3, 216, 209, 206, 189, 7, 165, 171, 221,
+ 214, 2, 151, 250, 211, 138, 2, 144, 169, 182, 176, 13, 179, 254, 191,
+ 225, 3, 244, 147, 218, 212, 3, 129, 187, 183, 253, 10, 218, 149, 188,
+ 168, 10, 223, 241, 149, 129, 8, 209, 128, 150, 126, 153, 139, 195, 131,
+ 6, 201, 208, 246, 221, 1, 194, 165, 175, 173, 5, 197, 133, 207, 196,
+ 2, 192, 211, 129, 210, 7, 211, 147, 163, 220, 9, 173, 191, 188, 152,
+ 1, 169, 242, 205, 20, 167, 133, 213, 211, 2, 213, 226, 129, 166, 12,
+ 186, 202, 155, 203, 5, 180, 251, 220, 174, 12, 145, 228, 247, 146, 12,
+ 196, 151, 247, 184, 10, 217, 233, 238, 147, 6, 149, 174, 181, 128, 13,
+ 128, 246, 173, 207, 15, 200, 162, 139, 103, 237, 199, 220, 252, 7, 208,
+ 201, 133, 231, 3, 140, 148, 223, 137, 5, 128, 242, 251, 140, 228, 11,
+ 214, 205, 158, 228, 2, 147, 190, 212, 138, 4, 228, 228, 253, 154, 9,
+ 146, 191, 248, 187, 8, 168, 200, 246, 160, 4, 224, 168, 147, 211, 11,
+ 153, 197, 133, 229, 5, 176, 131, 167, 203, 6, 213, 183, 189, 178, 10,
+ 185, 222, 229, 183, 5, 171, 185, 208, 162, 15, 203, 130, 137, 201, 6,
+ 236, 152, 138, 176, 1, 221, 200, 169, 183, 11, 237, 230, 219, 108, 152,
+ 247, 239, 145, 14, 242, 220, 245, 148, 6, 183, 147, 218, 144, 11, 236,
+ 190, 230, 197, 1, 253, 147, 205, 165, 10, 181, 130, 138, 249, 10, 193,
+ 135, 148, 142, 10, 232, 132, 254, 163, 4, 244, 153, 241, 197, 13, 251,
+ 150, 230, 242, 10, 211, 255, 182, 243, 3, 247, 137, 150, 236, 5, 137,
+ 168, 208, 161, 10, 192, 178, 137, 210, 13, 192, 158, 177, 203, 7, 237,
+ 221, 208, 153, 4, 180, 129, 195, 139, 4, 195, 220, 254, 129, 8, 235,
+ 249, 252, 142, 2, 171, 195, 208, 162, 12, 205, 185, 192, 166, 9, 208,
+ 205, 169, 160, 10, 156, 148, 150, 185, 2, 246, 165, 207, 129, 12, 145,
+ 207, 129, 130, 15, 253, 209, 184, 133, 11, 247, 226, 200, 185, 9, 193,
+ 147, 150, 128, 8, 251, 208, 155, 45, 251, 142, 248, 144, 15, 174, 199,
+ 157, 236, 12, 206, 215, 156, 131, 14, 224, 242, 193, 145, 9, 194, 231,
+ 136, 243, 7, 135, 188, 221, 220, 10, 252, 138, 172, 180, 15, 222, 245,
+ 235, 161, 2, 147, 195, 191, 195, 7, 191, 205, 163, 247, 3, 237, 172,
+ 239, 187, 6, 137, 141, 231, 233, 10, 246, 253, 140, 184, 5, 191, 252,
+ 199, 190, 13, 235, 212, 206, 220, 8, 163, 219, 233, 232, 13, 166, 129,
+ 242, 168, 12, 131, 217, 184, 209, 7, 138, 139, 223, 216, 8, 186, 152,
+ 149, 207, 6, 229, 191, 144, 149, 8, 223, 167, 204, 251, 1, 181, 240,
+ 166, 200, 9, 194, 230, 150, 122, 210, 176, 221, 179, 5, 137, 169, 225,
+ 196, 2, 190, 138, 243, 173, 10, 155, 224, 148, 154, 15, 180, 176, 218,
+ 153, 2, 194, 220, 179, 239, 3, 209, 243, 151, 171, 1, 135, 192, 192,
+ 129, 3, 154, 145, 158, 166, 8, 174, 159, 201, 207, 1, 134, 247, 247,
+ 152, 5, 169, 139, 159, 171, 3, 173, 170, 159, 244, 15, 201, 205, 215,
+ 223, 9, 227, 214, 226, 134, 14, 237, 245, 216, 153, 1, 207, 208, 244,
+ 63, 136, 146, 237, 215, 2, 131, 173, 129, 187, 4, 150, 204, 222, 185,
+ 6, 243, 177, 246, 252, 5, 246, 173, 234, 215, 14, 207, 252, 211, 199,
+ 3, 177, 211, 230, 228, 5, 208, 143, 209, 191, 13, 173, 192, 232, 246,
+ 12, 132, 255, 207, 139, 14, 171, 129, 141, 173, 7, 255, 222, 227, 255,
+ 12, 155, 193, 184, 244, 14, 171, 144, 214, 163, 1, 241, 232, 221, 228,
+ 15, 188, 160, 210, 226, 13, 189, 190, 189, 5, 204, 252, 250, 234, 10,
+ 228, 161, 153, 190, 9, 210, 208, 187, 214, 7, 198, 154, 214, 242, 9,
+ 197, 163, 254, 27, 220, 251, 130, 172, 2, 193, 147, 157, 255, 14, 242,
+ 131, 138, 180, 14, 200, 239, 175, 239, 5, 181, 157, 238, 152, 1, 203,
+ 211, 156, 220, 10, 210, 166, 223, 241, 2, 214, 243, 250, 244, 10, 238,
+ 200, 226, 216, 9, 168, 140, 235, 228, 14, 149, 176, 161, 188, 9, 180,
+ 224, 247, 138, 11, 168, 159, 157, 226, 7, 216, 226, 212, 131, 5, 158,
+ 162, 174, 190, 2, 147, 131, 155, 194, 4, 227, 156, 248, 169, 14, 210,
+ 216, 130, 142, 14, 233, 234, 248, 230, 13, 146, 190, 216, 248, 9, 128,
+ 173, 190, 149, 182, 11, 254, 210, 132, 152, 8, 211, 239, 231, 248, 9,
+ 132, 255, 247, 168, 7, 149, 224, 145, 136, 14, 162, 220, 148, 134, 6,
+ 204, 244, 192, 159, 8, 178, 160, 245, 237, 15, 193, 167, 249, 251, 5,
+ 238, 159, 153, 199, 9, 228, 225, 136, 225, 9, 147, 221, 134, 220, 7,
+ 249, 129, 250, 131, 5, 255, 249, 227, 129, 15, 183, 246, 177, 190, 10,
+ 217, 182, 196, 128, 6, 136, 242, 159, 173, 1, 244, 128, 137, 210, 10,
+ 154, 223, 230, 173, 7, 193, 171, 203, 220, 9, 193, 222, 146, 129, 2,
+ 159, 229, 247, 153, 1, 205, 139, 189, 204, 13, 181, 152, 211, 186, 3,
+ 252, 181, 234, 182, 4, 230, 212, 233, 169, 13, 134, 211, 157, 165, 1,
+ 218, 165, 218, 239, 4, 148, 140, 245, 130, 11, 197, 152, 165, 199, 2,
+ 235, 219, 158, 232, 9, 187, 231, 171, 149, 12, 134, 191, 248, 157, 3,
+ 219, 140, 128, 208, 1, 181, 140, 225, 226, 15, 234, 239, 208, 170, 10,
+ 166, 152, 192, 138, 15, 237, 204, 242, 197, 12, 230, 224, 210, 68, 128,
+ 170, 249, 251, 10, 193, 202, 171, 142, 7, 235, 192, 224, 175, 14, 147,
+ 243, 214, 94, 165, 202, 243, 157, 6, 192, 178, 204, 211, 8, 242, 240,
+ 207, 231, 4, 251, 234, 238, 218, 1, 207, 227, 224, 149, 4, 155, 215,
+ 210, 203, 2, 164, 248, 235, 166, 6, 226, 234, 165, 222, 13, 228, 197,
+ 249, 231, 14, 169, 172, 201, 163, 14, 149, 206, 208, 159, 15, 178, 216,
+ 205, 227, 15, 210, 228, 223, 220, 5, 161, 214, 153, 136, 11, 181, 178,
+ 246, 212, 7, 128, 131, 238, 218, 13, 138, 156, 141, 139, 15, 134, 187,
+ 137, 234, 4, 152, 215, 181, 142, 6, 160, 185, 166, 193, 13, 213, 145,
+ 204, 240, 13, 190, 164, 216, 231, 13, 251, 208, 176, 231, 4, 243, 160,
+ 187, 150, 5, 235, 251, 246, 205, 3, 142, 232, 229, 222, 5, 227, 251,
+ 238, 161, 12, 224, 198, 250, 176, 3, 187, 162, 200, 223, 5, 199, 133,
+ 234, 181, 3, 167, 160, 247, 232, 4, 174, 198, 216, 180, 15, 144, 251,
+ 131, 187, 10, 161, 171, 169, 190, 9, 223, 175, 171, 171, 4, 141, 165,
+ 211, 128, 5, 139, 239, 131, 173, 3, 211, 163, 253, 45, 212, 199, 216,
+ 226, 11, 137, 216, 228, 198, 3, 216, 209, 199, 233, 3, 249, 144, 225,
+ 146, 1, 216, 184, 225, 218, 9, 197, 219, 219, 247, 12, 214, 227, 243,
+ 240, 14, 221, 155, 244, 141, 4, 239, 249, 179, 130, 4, 161, 187, 191,
+ 135, 3, 245, 241, 237, 241, 12, 194, 211, 209, 238, 5, 252, 210, 135,
+ 149, 1, 134, 241, 220, 170, 12, 175, 208, 242, 229, 9, 181, 144, 172,
+ 202, 7, 170, 195, 174, 180, 5, 198, 153, 178, 158, 6, 146, 142, 204,
+ 119, 137, 185, 250, 204, 10, 208, 190, 240, 166, 1, 138, 183, 212, 226,
+ 3, 241, 240, 245, 140, 15, 250, 184, 161, 117, 198, 194, 173, 133, 15,
+ 135, 247, 179, 180, 11, 158, 233, 195, 162, 2, 209, 143, 142, 203, 13,
+ 156, 215, 224, 192, 5, 228, 223, 167, 163, 6, 253, 160, 223, 182, 5,
+ 178, 178, 223, 147, 5, 150, 180, 221, 189, 10, 168, 197, 173, 169, 6,
+ 166, 146, 252, 254, 15, 154, 211, 198, 238, 6, 182, 166, 227, 223, 3,
+ 152, 209, 173, 192, 3, 147, 255, 130, 153, 9, 152, 159, 128, 195, 7,
+ 204, 199, 174, 227, 8, 149, 133, 142, 33, 236, 185, 160, 136, 14, 154,
+ 137, 143, 236, 7, 246, 149, 237, 166, 3, 150, 184, 224, 232, 3, 204,
+ 220, 171, 245, 15, 128, 131, 146, 236, 219, 10, 168, 253, 226, 198, 3,
+ 196, 185, 159, 245, 14, 246, 239, 172, 207, 7, 172, 188, 238, 233, 13,
+ 193, 158, 247, 192, 10, 178, 146, 230, 233, 8, 143, 221, 252, 145, 5,
+ 169, 173, 160, 149, 7, 141, 199, 235, 35, 225, 224, 227, 213, 7, 233,
+ 249, 164, 132, 11, 255, 158, 248, 254, 2, 248, 200, 154, 176, 3, 168,
+ 248, 134, 165, 8, 145, 177, 231, 188, 10, 189, 223, 182, 129, 7, 246,
+ 146, 219, 62, 185, 190, 133, 217, 3, 228, 177, 227, 170, 1, 230, 175,
+ 223, 120, 150, 130, 206, 166, 5, 223, 216, 157, 168, 1, 225, 151, 175,
+ 248, 5, 140, 228, 227, 235, 7, 243, 148, 219, 250, 3, 250, 215, 234,
+ 130, 1, 191, 146, 221, 133, 8, 220, 223, 135, 100, 233, 148, 197, 224,
+ 11, 164, 203, 178, 134, 9, 170, 133, 159, 133, 8, 162, 189, 239, 68,
+ 144, 186, 204, 211, 6, 167, 218, 219, 144, 2, 208, 155, 181, 237, 2,
+ 253, 223, 151, 180, 15, 137, 132, 173, 135, 7, 172, 137, 239, 146, 13,
+ 250, 140, 255, 211, 11, 231, 134, 228, 145, 3, 149, 220, 253, 168, 10,
+ 236, 163, 149, 221, 10, 247, 151, 236, 190, 6, 166, 210, 238, 52, 192,
+ 248, 168, 229, 9, 237, 182, 227, 199, 12, 189, 199, 195, 216, 12, 178,
+ 236, 220, 158, 2, 247, 182, 235, 221, 14, 219, 148, 216, 159, 15, 158,
+ 234, 200, 167, 2, 184, 132, 251, 232, 2, 138, 227, 158, 204, 14, 225,
+ 192, 227, 165, 8, 130, 214, 149, 173, 13, 210, 140, 161, 181, 9, 222,
+ 217, 168, 158, 10, 220, 222, 238, 137, 10, 237, 248, 184, 57, 167, 213,
+ 169, 132, 5, 236, 173, 141, 25, 131, 201, 181, 180, 4, 133, 182, 179,
+ 134, 14, 243, 180, 195, 169, 11, 145, 153, 139, 242, 14, 210, 148, 136,
+ 230, 2, 174, 147, 246, 185, 7, 185, 230, 252, 230, 10, 247, 210, 139,
+ 242, 13, 187, 227, 199, 158, 14, 186, 209, 178, 166, 8, 148, 174, 212,
+ 154, 6, 193, 139, 246, 160, 4, 180, 129, 135, 190, 7, 253, 202, 252,
+ 194, 1, 145, 192, 198, 192, 2, 136, 201, 194, 165, 5, 238, 198, 216,
+ 222, 8, 148, 132, 194, 231, 2, 179, 212, 226, 152, 13, 216, 203, 190,
+ 81, 241, 158, 205, 205, 3, 153, 250, 248, 251, 11, 157, 223, 163, 229,
+ 11, 160, 240, 198, 156, 13, 155, 254, 151, 138, 14, 219, 233, 172, 254,
+ 4, 186, 194, 189, 227, 4, 169, 243, 181, 201, 14, 161, 158, 146, 201,
+ 3, 135, 139, 242, 206, 4, 222, 141, 186, 201, 11, 247, 182, 166, 198,
+ 12, 141, 168, 155, 172, 4, 206, 218, 254, 175, 4, 140, 213, 159, 204,
+ 7, 214, 128, 160, 215, 9, 253, 242, 237, 147, 8, 162, 233, 151, 181,
+ 5, 183, 223, 151, 21, 132, 164, 206, 242, 1, 179, 227, 155, 165, 11,
+ 189, 251, 195, 212, 3, 154, 195, 137, 190, 6, 129, 212, 227, 177, 4,
+ 185, 141, 235, 183, 7, 233, 220, 229, 174, 4, 215, 138, 248, 25, 161,
+ 210, 193, 241, 14, 239, 201, 231, 152, 12, 240, 169, 204, 169, 14, 228,
+ 195, 196, 225, 6, 250, 159, 144, 234, 1, 167, 238, 191, 142, 11, 202,
+ 222, 151, 207, 9, 205, 219, 185, 142, 3, 230, 224, 187, 235, 5, 194,
+ 167, 210, 173, 7, 235, 250, 253, 178, 12, 239, 128, 215, 198, 13, 130,
+ 141, 191, 238, 3, 173, 252, 172, 217, 14, 129, 203, 164, 16, 191, 131,
+ 153, 141, 8, 133, 200, 131, 240, 15, 173, 165, 172, 11, 182, 247, 244,
+ 165, 9, 128, 238, 232, 219, 37, 214, 148, 220, 206, 10, 199, 154, 167,
+ 130, 1, 188, 191, 233, 235, 9, 167, 131, 215, 154, 5, 133, 224, 241,
+ 202, 1, 237, 213, 192, 223, 4, 160, 202, 178, 132, 10, 248, 217, 142,
+ 133, 12, 199, 164, 231, 189, 5, 240, 129, 134, 189, 6, 173, 135, 204,
+ 176, 15, 164, 142, 214, 137, 8, 208, 169, 163, 251, 15, 196, 171, 247,
+ 187, 14, 230, 177, 251, 130, 13, 200, 234, 146, 173, 4, 252, 218, 210,
+ 212, 10, 206, 187, 236, 129, 5, 165, 161, 220, 171, 11, 135, 129, 179,
+ 205, 2, 240, 251, 134, 254, 3, 136, 185, 186, 220, 10, 230, 142, 156,
+ 211, 1, 215, 243, 241, 179, 12, 141, 140, 140, 166, 5, 136, 183, 213,
+ 220, 14, 182, 213, 134, 202, 10, 177, 197, 170, 230, 6, 210, 133, 203,
+ 128, 14, 145, 196, 176, 139, 5, 191, 143, 140, 133, 11, 247, 155, 221,
+ 233, 10, 131, 192, 238, 143, 3, 194, 196, 146, 129, 9, 245, 183, 142,
+ 133, 6, 200, 197, 143, 185, 2, 133, 144, 194, 144, 4, 149, 202, 240,
+ 36, 230, 214, 182, 211, 5, 254, 227, 217, 246, 2, 128, 164, 220, 255,
+ 5, 132, 138, 149, 153, 6, 200, 139, 167, 97, 203, 137, 179, 195, 2,
+ 141, 176, 199, 134, 9, 165, 244, 225, 254, 3, 136, 180, 252, 193, 3,
+ 200, 165, 159, 207, 12, 147, 222, 142, 148, 5, 191, 146, 228, 191, 9,
+ 213, 255, 236, 152, 13, 132, 240, 164, 174, 2, 204, 152, 214, 3, 251,
+ 240, 222, 248, 10, 219, 208, 211, 189, 15, 175, 252, 221, 88, 182, 234,
+ 154, 107, 208, 190, 199, 159, 2, 209, 139, 150, 182, 13, 212, 219, 146,
+ 154, 15, 221, 178, 221, 188, 11, 148, 200, 197, 17, 129, 218, 170, 253,
+ 11, 164, 244, 228, 252, 2, 220, 175, 146, 195, 15, 141, 223, 154, 232,
+ 9, 227, 186, 130, 220, 8, 153, 157, 145, 139, 12, 233, 140, 173, 183,
+ 12, 223, 255, 155, 139, 13, 162, 238, 129, 242, 11, 252, 162, 211, 191,
+ 2, 228, 182, 210, 101, 171, 202, 191, 167, 11, 247, 189, 170, 255, 3,
+ 217, 150, 238, 215, 10, 173, 188, 234, 177, 5, 166, 139, 147, 132, 12,
+ 230, 216, 153, 200, 3, 182, 202, 167, 210, 12, 222, 169, 137, 180, 7,
+ 253, 249, 181, 197, 2, 198, 205, 156, 192, 12, 168, 135, 243, 185, 2,
+ 138, 158, 139, 159, 11, 138, 210, 248, 255, 14, 157, 141, 161, 207, 9,
+ 218, 206, 244, 191, 4, 222, 169, 188, 238, 5, 133, 211, 152, 218, 14,
+ 248, 191, 242, 250, 13, 217, 188, 239, 231, 14, 137, 198, 135, 144, 1,
+ 231, 227, 214, 168, 7, 128, 136, 152, 103, 150, 151, 161, 171, 12, 251,
+ 222, 212, 229, 4, 154, 193, 182, 62, 251, 246, 205, 142, 3, 132, 140,
+ 242, 166, 14, 165, 231, 192, 250, 6, 136, 154, 230, 163, 1, 230, 228,
+ 246, 182, 3, 187, 215, 217, 177, 8, 137, 171, 251, 15, 211, 128, 230,
+ 244, 15, 160, 146, 188, 255, 4, 204, 242, 150, 194, 1, 128, 184, 177,
+ 139, 14, 139, 209, 245, 134, 11, 241, 167, 181, 139, 5, 159, 129, 160,
+ 74, 159, 200, 133, 222, 5, 157, 204, 165, 199, 10, 193, 159, 169, 151,
+ 11, 205, 219, 226, 134, 9, 197, 252, 179, 128, 14, 230, 250, 244, 215,
+ 5, 207, 138, 239, 212, 14, 237, 216, 191, 199, 15, 250, 250, 198, 148,
+ 9, 212, 228, 174, 146, 15, 221, 137, 207, 196, 3, 146, 165, 245, 220,
+ 13, 157, 249, 149, 228, 5, 185, 219, 188, 185, 8, 212, 150, 240, 218,
+ 15, 128, 211, 229, 202, 129, 14, 132, 225, 178, 226, 1, 251, 195, 132,
+ 66, 210, 245, 154, 234, 5, 145, 183, 146, 177, 9, 218, 223, 128, 170,
+ 13, 238, 227, 168, 197, 11, 189, 225, 206, 179, 6, 221, 169, 239, 193,
+ 4, 194, 207, 170, 203, 7, 163, 206, 232, 197, 1, 160, 130, 131, 160,
+ 4, 139, 146, 149, 173, 10, 140, 240, 243, 180, 4, 231, 180, 202, 245,
+ 9, 146, 250, 195, 157, 1, 233, 199, 188, 210, 15, 253, 222, 137, 142,
+ 10, 174, 245, 231, 20, 219, 156, 185, 201, 5, 139, 137, 230, 135, 1,
+ 236, 207, 146, 138, 4, 149, 174, 164, 221, 4, 158, 227, 224, 210, 7,
+ 206, 150, 186, 244, 9, 156, 183, 159, 142, 13, 176, 152, 163, 193, 8,
+ 190, 229, 232, 155, 7, 234, 132, 236, 132, 9, 242, 254, 204, 134, 14,
+ 143, 226, 253, 180, 2, 138, 226, 214, 218, 2, 199, 228, 210, 186, 12,
+ 147, 179, 230, 254, 5, 249, 135, 247, 147, 10, 148, 253, 186, 214, 12,
+ 250, 240, 173, 159, 14, 162, 215, 177, 42, 162, 142, 248, 135, 3, 196,
+ 143, 150, 150, 10, 236, 221, 178, 147, 7, 165, 248, 197, 136, 7, 199,
+ 152, 158, 228, 13, 229, 215, 242, 194, 7, 145, 249, 246, 181, 13, 134,
+ 191, 196, 245, 3, 161, 251, 235, 200, 14, 255, 232, 248, 228, 10, 170,
+ 188, 227, 177, 14, 212, 202, 144, 143, 13, 199, 230, 234, 155, 10, 247,
+ 239, 142, 167, 6, 197, 129, 192, 235, 2, 207, 229, 194, 237, 12, 228,
+ 239, 211, 136, 3, 199, 135, 194, 244, 4, 167, 137, 158, 132, 15, 208,
+ 199, 176, 183, 2, 161, 181, 218, 155, 11, 218, 235, 160, 207, 5, 250,
+ 181, 244, 252, 9, 197, 130, 193, 168, 1, 153, 235, 181, 253, 2, 203,
+ 245, 229, 255, 11, 134, 136, 148, 249, 8, 179, 174, 133, 187, 8, 145,
+ 212, 156, 196, 7, 163, 222, 227, 236, 11, 242, 171, 200, 143, 12, 185,
+ 225, 231, 211, 15, 135, 230, 213, 153, 6, 254, 187, 227, 167, 2, 147,
+ 191, 160, 185, 12, 177, 145, 137, 133, 1, 241, 244, 217, 231, 3, 225,
+ 213, 246, 253, 11, 138, 185, 169, 229, 8, 129, 248, 228, 155, 4, 150,
+ 208, 194, 129, 13, 149, 233, 140, 159, 7, 149, 223, 199, 33, 153, 214,
+ 176, 117, 175, 193, 163, 144, 9, 135, 207, 150, 12, 216, 138, 151, 55,
+ 233, 245, 225, 219, 8, 215, 194, 201, 214, 6, 235, 254, 134, 70, 251,
+ 142, 174, 209, 12, 215, 218, 132, 174, 4, 209, 177, 189, 144, 3, 247,
+ 136, 205, 212, 8, 152, 220, 178, 208, 5, 183, 146, 202, 149, 6, 248,
+ 229, 196, 211, 12, 226, 191, 237, 227, 7, 234, 157, 195, 196, 4, 203,
+ 147, 213, 156, 1, 245, 161, 241, 97, 186, 245, 223, 246, 8, 170, 241,
+ 234, 188, 8, 171, 155, 201, 168, 8, 193, 168, 145, 142, 10, 254, 183,
+ 192, 202, 14, 137, 175, 147, 223, 9, 176, 133, 131, 166, 12, 211, 168,
+ 155, 225, 4, 197, 193, 255, 204, 8, 154, 208, 144, 165, 1, 134, 190,
+ 143, 217, 11, 148, 242, 203, 237, 11, 161, 142, 172, 215, 3, 166, 203,
+ 240, 162, 6, 200, 195, 186, 162, 7, 198, 211, 223, 252, 15, 132, 160,
+ 226, 204, 15, 158, 187, 167, 222, 6, 174, 214, 139, 220, 9, 130, 243,
+ 221, 206, 6, 190, 217, 211, 145, 4, 160, 255, 142, 201, 5, 201, 166,
+ 217, 174, 7, 240, 197, 130, 214, 7, 216, 133, 220, 184, 3, 241, 148,
+ 192, 185, 6, 213, 181, 240, 210, 2, 137, 194, 206, 172, 5, 221, 189,
+ 134, 241, 10, 128, 180, 234, 178, 219, 13, 203, 213, 182, 247, 10, 172,
+ 229, 222, 178, 15, 188, 154, 206, 196, 12, 240, 136, 172, 156, 11, 165,
+ 151, 164, 200, 7, 189, 152, 225, 146, 7, 214, 167, 205, 147, 4, 216,
+ 175, 130, 230, 10, 243, 162, 145, 154, 7, 155, 169, 190, 182, 6, 255,
+ 212, 152, 251, 6, 147, 152, 160, 237, 2, 170, 228, 233, 210, 13, 166,
+ 255, 247, 207, 14, 238, 175, 242, 171, 9, 174, 241, 193, 193, 4, 245,
+ 210, 147, 167, 14, 151, 233, 199, 154, 1, 193, 184, 194, 249, 9, 216,
+ 255, 201, 246, 10, 138, 198, 240, 208, 5, 187, 230, 137, 145, 1, 200,
+ 237, 144, 115, 131, 149, 167, 201, 15, 249, 130, 240, 202, 5, 141, 220,
+ 198, 233, 3, 216, 165, 204, 210, 12, 176, 166, 249, 207, 4, 244, 158,
+ 162, 140, 8, 174, 153, 181, 253, 14, 249, 157, 148, 130, 10, 178, 203,
+ 201, 162, 4, 161, 215, 176, 137, 3, 164, 232, 198, 200, 15, 141, 189,
+ 153, 206, 6, 148, 138, 219, 252, 12, 147, 134, 206, 210, 9, 214, 186,
+ 141, 183, 10, 235, 192, 204, 245, 10, 155, 177, 148, 174, 7, 246, 150,
+ 200, 167, 15, 134, 228, 212, 210, 7, 128, 198, 173, 133, 10, 173, 148,
+ 155, 170, 4, 131, 242, 205, 148, 14, 154, 220, 156, 236, 11, 213, 150,
+ 219, 145, 4, 171, 231, 199, 224, 12, 190, 139, 161, 155, 3, 136, 151,
+ 199, 129, 9, 182, 161, 156, 237, 1, 218, 151, 248, 132, 13, 201, 207,
+ 164, 115, 190, 137, 205, 255, 11, 191, 198, 251, 165, 10, 234, 205, 249,
+ 181, 3, 172, 185, 218, 244, 14, 134, 171, 214, 151, 9, 152, 245, 182,
+ 215, 10, 204, 161, 209, 196, 14, 180, 134, 204, 240, 4, 242, 196, 170,
+ 185, 13, 156, 255, 134, 178, 14, 203, 145, 211, 216, 3, 190, 148, 160,
+ 180, 14, 189, 162, 214, 209, 10, 238, 176, 239, 248, 15, 151, 163, 176,
+ 168, 5, 152, 247, 207, 238, 14, 181, 238, 168, 251, 8, 181, 189, 202,
+ 33, 232, 239, 229, 226, 5, 133, 156, 212, 180, 4, 224, 169, 249, 216,
+ 4, 198, 245, 205, 147, 8, 231, 232, 149, 230, 8, 243, 161, 191, 162,
+ 3, 194, 189, 237, 227, 15, 223, 185, 161, 232, 4, 153, 233, 249, 155,
+ 8, 240, 147, 199, 249, 5, 135, 205, 250, 160, 11, 252, 183, 238, 210,
+ 10, 244, 146, 156, 160, 5, 196, 252, 142, 22, 191, 148, 222, 231, 11,
+ 182, 201, 163, 219, 13, 199, 238, 233, 179, 4, 180, 199, 255, 249, 4,
+ 254, 237, 180, 213, 2, 211, 221, 157, 151, 7, 178, 192, 158, 241, 13,
+ 133, 212, 252, 51, 146, 221, 241, 177, 7, 137, 246, 204, 171, 5, 138,
+ 209, 144, 231, 2, 153, 213, 230, 179, 4, 171, 244, 213, 172, 3, 191,
+ 201, 249, 129, 3, 184, 184, 186, 243, 11, 145, 176, 183, 103, 145, 131,
+ 206, 147, 14, 136, 134, 191, 173, 9, 180, 164, 241, 245, 5, 172, 133,
+ 212, 167, 8, 198, 162, 158, 244, 4, 232, 175, 222, 231, 12, 146, 246,
+ 134, 196, 8, 147, 248, 177, 230, 8, 145, 216, 180, 139, 12, 224, 254,
+ 191, 222, 1, 182, 145, 213, 232, 10, 178, 139, 143, 237, 9, 253, 230,
+ 172, 181, 13, 225, 218, 252, 132, 6, 141, 175, 159, 197, 14, 185, 222,
+ 237, 246, 3, 154, 184, 245, 228, 11, 193, 198, 235, 204, 10, 182, 239,
+ 253, 136, 15, 205, 143, 161, 211, 7, 164, 207, 235, 220, 4, 158, 235,
+ 183, 187, 9, 203, 201, 147, 139, 3, 169, 181, 153, 201, 11, 222, 206,
+ 192, 251, 12, 221, 253, 242, 152, 11, 128, 249, 190, 248, 152, 5, 151,
+ 199, 221, 227, 14, 209, 246, 133, 200, 7, 246, 181, 176, 131, 9, 165,
+ 219, 139, 171, 4, 254, 130, 187, 208, 8, 144, 221, 189, 192, 10, 163,
+ 146, 139, 166, 12, 231, 177, 223, 205, 9, 229, 179, 214, 227, 2, 132,
+ 153, 150, 154, 5, 242, 250, 159, 171, 9, 144, 228, 238, 120, 168, 206,
+ 130, 107, 145, 144, 235, 248, 1, 254, 218, 166, 129, 4, 237, 129, 235,
+ 7, 150, 199, 251, 175, 9, 252, 199, 200, 168, 9, 172, 147, 153, 151,
+ 5, 168, 129, 129, 188, 13, 200, 166, 192, 192, 8, 154, 184, 218, 232,
+ 2, 155, 202, 193, 156, 12, 182, 241, 250, 153, 13, 180, 141, 206, 141,
+ 6, 206, 129, 157, 153, 12, 132, 158, 212, 247, 14, 160, 135, 203, 238,
+ 12, 216, 173, 204, 156, 9, 166, 214, 242, 138, 14, 178, 248, 246, 135,
+ 1, 244, 219, 210, 155, 3, 208, 155, 189, 180, 3, 156, 189, 171, 174,
+ 13, 162, 161, 233, 108, 231, 134, 177, 255, 11, 216, 159, 226, 244, 2,
+ 130, 227, 211, 185, 6, 169, 146, 187, 143, 2, 153, 225, 150, 187, 9,
+ 210, 153, 211, 181, 13, 147, 216, 152, 173, 3, 246, 236, 142, 33, 222,
+ 140, 194, 241, 10, 171, 251, 248, 210, 11, 239, 197, 137, 242, 8, 132,
+ 179, 189, 209, 6, 181, 221, 179, 161, 2, 168, 172, 241, 163, 2, 203,
+ 241, 250, 226, 12, 184, 188, 237, 210, 8, 228, 163, 153, 234, 5, 222,
+ 162, 216, 142, 13, 235, 251, 186, 239, 1, 199, 133, 166, 158, 9, 134,
+ 241, 161, 174, 3, 183, 248, 214, 158, 7, 181, 248, 184, 143, 11, 234,
+ 152, 151, 169, 9, 198, 134, 159, 251, 7, 144, 176, 211, 121, 199, 255,
+ 166, 132, 5, 201, 243, 215, 189, 14, 213, 240, 205, 223, 10, 205, 191,
+ 234, 185, 9, 240, 221, 255, 234, 5, 210, 250, 179, 148, 7, 185, 162,
+ 155, 243, 3, 140, 197, 165, 222, 12, 150, 143, 215, 241, 13, 138, 138,
+ 246, 30, 236, 151, 243, 235, 12, 232, 222, 197, 223, 5, 177, 198, 228,
+ 194, 4, 130, 172, 242, 221, 11, 208, 235, 221, 161, 1, 254, 141, 148,
+ 144, 14, 168, 251, 185, 179, 9, 247, 144, 244, 178, 12, 209, 235, 151,
+ 183, 9, 131, 208, 184, 182, 13, 135, 245, 255, 250, 2, 173, 149, 179,
+ 144, 12, 135, 248, 137, 220, 10, 233, 194, 242, 248, 6, 212, 132, 219,
+ 149, 12, 204, 211, 128, 213, 2, 137, 232, 221, 213, 9, 253, 167, 158,
+ 148, 3, 230, 179, 147, 176, 13, 224, 201, 179, 191, 9, 135, 168, 142,
+ 253, 13, 241, 194, 141, 216, 6, 153, 214, 245, 216, 11, 195, 145, 195,
+ 142, 15, 242, 152, 180, 191, 11, 229, 219, 238, 220, 11, 248, 241, 183,
+ 229, 15, 231, 171, 213, 81, 205, 182, 151, 253, 6, 170, 162, 168, 177,
+ 10, 231, 205, 251, 209, 3, 253, 168, 199, 198, 12, 252, 138, 233, 210,
+ 8, 234, 156, 212, 168, 11, 250, 136, 144, 228, 7, 168, 238, 236, 143,
+ 1, 180, 204, 171, 173, 13, 180, 221, 130, 239, 4, 253, 135, 233, 166,
+ 9, 233, 248, 248, 182, 1, 239, 198, 243, 139, 14, 160, 215, 214, 199,
+ 9, 229, 211, 167, 193, 9, 247, 135, 221, 142, 9, 205, 178, 155, 150,
+ 3, 254, 172, 151, 215, 14, 170, 242, 195, 176, 5, 207, 226, 194, 155,
+ 11, 216, 223, 149, 43, 240, 135, 144, 187, 13, 139, 215, 216, 182, 15,
+ 135, 209, 192, 226, 6, 251, 144, 191, 169, 8, 226, 207, 136, 188, 2,
+ 255, 128, 205, 245, 4, 214, 222, 198, 178, 4, 128, 160, 134, 134, 201,
+ 1, 227, 171, 159, 179, 3, 247, 175, 155, 247, 11, 130, 208, 142, 189,
+ 1, 209, 251, 137, 239, 14, 196, 246, 217, 190, 2, 216, 236, 193, 250,
+ 4, 171, 135, 202, 174, 13, 157, 230, 183, 194, 15, 151, 155, 192, 234,
+ 5, 192, 160, 198, 226, 7, 246, 249, 139, 215, 1, 163, 181, 142, 210,
+ 4, 138, 246, 219, 179, 4, 143, 187, 253, 153, 10, 190, 131, 161, 171,
+ 8, 193, 185, 156, 210, 9, 221, 200, 245, 253, 2, 234, 176, 164, 194,
+ 4, 234, 206, 138, 90, 226, 227, 130, 184, 6, 213, 198, 190, 208, 10,
+ 234, 213, 248, 154, 8, 242, 195, 155, 149, 9, 248, 145, 209, 218, 3,
+ 162, 176, 130, 131, 9, 187, 166, 140, 162, 11, 189, 169, 188, 197, 7,
+ 197, 240, 176, 226, 3, 158, 243, 236, 114, 214, 252, 228, 253, 15, 237,
+ 210, 163, 153, 11, 253, 202, 188, 196, 3, 198, 237, 141, 147, 3, 145,
+ 225, 201, 203, 8, 162, 160, 216, 149, 7, 136, 166, 225, 139, 2, 230,
+ 144, 134, 245, 8, 213, 208, 144, 236, 5, 140, 163, 160, 219, 8, 244,
+ 181, 176, 132, 13, 141, 168, 184, 252, 11, 238, 218, 178, 216, 9, 254,
+ 164, 216, 173, 8, 233, 173, 221, 183, 10, 241, 148, 151, 179, 5, 192,
+ 198, 255, 228, 7, 237, 131, 167, 203, 14, 136, 194, 238, 162, 4, 233,
+ 138, 144, 191, 13, 184, 167, 194, 138, 5, 243, 150, 162, 137, 1, 216,
+ 138, 143, 176, 2, 168, 185, 251, 188, 1, 226, 135, 148, 99, 221, 188,
+ 199, 199, 12, 172, 200, 179, 146, 2, 215, 133, 179, 248, 1, 166, 136,
+ 133, 29, 179, 161, 179, 46, 149, 179, 140, 236, 9, 206, 194, 254, 173,
+ 9, 199, 144, 152, 197, 13, 130, 186, 169, 197, 14, 234, 210, 198, 186,
+ 11, 166, 157, 141, 196, 8, 135, 171, 179, 243, 10, 136, 135, 199, 243,
+ 1, 239, 233, 248, 191, 6, 162, 208, 228, 225, 7, 218, 186, 182, 162,
+ 9, 197, 229, 188, 155, 7, 252, 160, 180, 95, 152, 167, 185, 156, 15,
+ 169, 157, 242, 208, 6, 206, 213, 229, 223, 4, 205, 225, 209, 237, 9,
+ 208, 223, 194, 178, 6, 185, 206, 145, 140, 11, 162, 153, 203, 219, 11,
+ 191, 254, 170, 128, 15, 138, 198, 198, 231, 12, 171, 145, 216, 178, 11,
+ 250, 208, 197, 186, 14, 230, 209, 184, 146, 15, 136, 148, 233, 177, 2,
+ 224, 176, 162, 142, 12, 171, 216, 173, 150, 5, 190, 236, 246, 207, 8,
+ 146, 159, 218, 24, 149, 184, 172, 144, 12, 215, 138, 134, 222, 8, 230,
+ 138, 234, 150, 10, 132, 233, 180, 129, 7, 246, 243, 136, 179, 15, 180,
+ 245, 254, 253, 8, 217, 162, 135, 169, 14, 175, 223, 178, 21, 248, 184,
+ 135, 155, 3, 194, 214, 241, 156, 10, 150, 140, 157, 20, 245, 219, 214,
+ 189, 9, 195, 224, 165, 187, 6, 143, 205, 236, 165, 14, 177, 147, 215,
+ 253, 3, 149, 236, 255, 166, 10, 183, 205, 220, 209, 12, 135, 254, 156,
+ 236, 12, 253, 196, 175, 55, 159, 249, 156, 132, 7, 206, 138, 221, 129,
+ 1, 131, 237, 190, 202, 4, 203, 213, 202, 160, 13, 142, 239, 143, 188,
+ 13, 140, 181, 178, 132, 5, 196, 160, 202, 171, 13, 165, 231, 144, 248,
+ 3, 218, 192, 242, 222, 6, 182, 201, 241, 138, 7, 146, 141, 216, 156,
+ 9, 253, 199, 128, 8, 143, 152, 133, 227, 10, 161, 133, 237, 138, 5,
+ 155, 167, 242, 192, 11, 131, 131, 221, 252, 10, 173, 222, 208, 175, 4,
+ 222, 246, 234, 182, 15, 186, 223, 226, 234, 4, 128, 134, 162, 161, 166,
+ 8, 173, 187, 185, 226, 15, 193, 158, 170, 192, 15, 157, 143, 170, 233,
+ 7, 236, 143, 129, 250, 7, 170, 139, 148, 165, 15, 227, 193, 248, 149,
+ 1, 193, 175, 193, 161, 3, 201, 133, 138, 248, 1, 248, 238, 181, 148,
+ 4, 148, 149, 163, 224, 5, 140, 176, 170, 226, 1, 210, 131, 226, 211,
+ 11, 177, 220, 204, 252, 6, 172, 166, 215, 221, 7, 207, 206, 190, 142,
+ 9, 180, 178, 244, 139, 13, 205, 186, 224, 193, 5, 203, 134, 137, 186,
+ 9, 131, 254, 156, 161, 2, 251, 240, 204, 196, 9, 174, 198, 211, 220,
+ 6, 203, 229, 158, 140, 7, 141, 224, 133, 196, 2, 185, 203, 211, 149,
+ 14, 212, 173, 245, 172, 10, 171, 172, 253, 175, 15, 146, 215, 253, 240,
+ 10, 129, 217, 236, 156, 12, 196, 183, 197, 250, 9, 189, 203, 169, 148,
+ 11, 221, 247, 223, 173, 14, 218, 190, 182, 170, 9, 188, 230, 139, 223,
+ 2, 152, 181, 134, 241, 11, 137, 184, 151, 151, 11, 224, 248, 137, 176,
+ 12, 233, 234, 254, 228, 13, 202, 199, 164, 253, 15, 205, 152, 196, 208,
+ 8, 245, 131, 154, 210, 13, 173, 230, 205, 208, 6, 138, 165, 240, 198,
+ 14, 231, 137, 175, 129, 13, 246, 163, 168, 158, 1, 213, 246, 226, 226,
+ 7, 211, 214, 201, 234, 15, 173, 179, 224, 157, 10, 146, 223, 141, 141,
+ 14, 249, 209, 212, 241, 7, 152, 138, 146, 237, 1, 178, 175, 134, 132,
+ 10, 201, 203, 154, 188, 14, 148, 188, 172, 196, 4, 170, 188, 178, 162,
+ 9, 255, 227, 197, 169, 15, 163, 196, 216, 154, 13, 202, 217, 150, 190,
+ 1, 156, 213, 189, 194, 11, 192, 206, 130, 164, 3, 197, 163, 251, 148,
+ 5, 145, 250, 232, 230, 5, 141, 198, 246, 236, 6, 254, 143, 215, 185,
+ 10, 139, 223, 210, 246, 12, 158, 243, 217, 139, 12, 218, 234, 151, 205,
+ 5, 239, 161, 218, 141, 11, 189, 145, 169, 172, 10, 218, 224, 248, 247,
+ 6, 229, 195, 222, 160, 15, 220, 251, 218, 156, 8, 235, 140, 138, 160,
+ 5, 138, 133, 155, 195, 5, 248, 204, 241, 223, 7, 174, 250, 182, 238,
+ 12, 190, 151, 165, 58, 132, 155, 154, 219, 10, 160, 136, 241, 163, 8,
+ 145, 179, 243, 103, 156, 198, 227, 136, 12, 154, 158, 219, 228, 6, 138,
+ 134, 248, 146, 4, 139, 141, 198, 253, 5, 193, 167, 232, 162, 7, 227,
+ 182, 238, 134, 13, 153, 232, 167, 238, 13, 179, 178, 133, 162, 1, 144,
+ 247, 180, 57, 228, 228, 193, 222, 6, 249, 173, 245, 228, 1, 128, 173,
+ 168, 233, 7, 128, 193, 229, 199, 11, 222, 247, 155, 204, 8, 193, 156,
+ 248, 222, 5, 161, 217, 242, 184, 6, 221, 249, 182, 235, 11, 136, 232,
+ 167, 217, 2, 175, 247, 247, 133, 6, 190, 197, 160, 205, 12, 228, 232,
+ 159, 194, 14, 138, 157, 191, 164, 12, 165, 147, 184, 78, 175, 222, 170,
+ 114, 215, 206, 218, 235, 9, 172, 178, 244, 228, 9, 207, 184, 158, 134,
+ 15, 220, 242, 151, 210, 5, 135, 214, 152, 169, 10, 147, 180, 187, 157,
+ 15, 169, 248, 147, 144, 8, 148, 170, 166, 139, 7, 221, 178, 241, 141,
+ 12, 129, 229, 166, 231, 10, 167, 164, 187, 169, 3, 210, 166, 162, 164,
+ 9, 145, 229, 163, 141, 3, 169, 219, 228, 254, 13, 195, 154, 185, 239,
+ 2, 232, 144, 143, 245, 1, 182, 224, 131, 205, 4, 203, 167, 128, 129,
+ 2, 157, 147, 212, 142, 2, 157, 143, 162, 249, 10, 223, 183, 231, 195,
+ 7, 162, 136, 180, 151, 14, 244, 221, 234, 162, 7, 175, 243, 194, 204,
+ 12, 128, 201, 131, 148, 223, 15, 245, 173, 201, 153, 15, 179, 154, 219,
+ 169, 10, 146, 160, 147, 195, 12, 235, 208, 250, 180, 3, 180, 137, 229,
+ 254, 5, 167, 134, 245, 142, 10, 231, 130, 163, 232, 15, 235, 167, 185,
+ 137, 12, 179, 205, 207, 135, 8, 130, 159, 158, 216, 6, 203, 218, 229,
+ 194, 2, 244, 246, 217, 133, 3, 242, 158, 230, 208, 2, 221, 195, 182,
+ 229, 4, 214, 236, 135, 214, 10, 137, 214, 209, 246, 7, 189, 198, 158,
+ 200, 9, 214, 201, 139, 246, 11, 222, 180, 147, 211, 14, 146, 235, 149,
+ 248, 11, 173, 234, 137, 145, 14, 135, 210, 187, 187, 3, 155, 174, 179,
+ 178, 14, 158, 193, 253, 239, 12, 185, 178, 240, 176, 7, 129, 232, 208,
+ 205, 1, 244, 167, 133, 23, 236, 181, 197, 142, 8, 177, 189, 200, 176,
+ 12, 151, 218, 206, 138, 3, 153, 238, 153, 179, 7, 252, 224, 230, 173,
+ 13, 201, 177, 222, 166, 13, 158, 168, 219, 249, 3, 227, 223, 144, 175,
+ 8, 162, 217, 224, 176, 11, 153, 139, 169, 180, 7, 230, 211, 215, 224,
+ 14, 181, 189, 210, 219, 5, 170, 217, 206, 227, 3, 246, 166, 191, 147,
+ 2, 193, 209, 134, 215, 8, 187, 151, 160, 226, 10, 253, 149, 235, 174,
+ 15, 168, 167, 137, 177, 14, 175, 245, 198, 225, 10, 189, 211, 166, 129,
+ 4, 155, 158, 199, 212, 15, 246, 158, 210, 172, 12, 208, 222, 167, 183,
+ 4, 168, 151, 160, 228, 4, 254, 136, 200, 147, 3, 162, 243, 242, 217,
+ 11, 236, 201, 184, 252, 15, 177, 243, 230, 233, 11, 148, 179, 150, 255,
+ 6, 174, 237, 246, 167, 3, 209, 233, 193, 130, 12, 200, 240, 131, 186,
+ 2, 178, 157, 168, 198, 6, 143, 190, 249, 232, 1, 230, 135, 144, 204,
+ 1, 174, 150, 157, 136, 15, 183, 158, 183, 162, 6, 231, 245, 145, 241,
+ 14, 233, 184, 247, 249, 9, 234, 135, 237, 186, 2, 159, 177, 236, 132,
+ 6, 180, 190, 237, 183, 3, 171, 196, 155, 188, 2, 242, 181, 155, 167,
+ 8, 199, 137, 219, 202, 13, 190, 192, 159, 241, 10, 180, 160, 136, 200,
+ 5, 225, 221, 234, 202, 13, 139, 193, 211, 222, 12, 229, 247, 247, 254,
+ 7, 178, 167, 200, 189, 9, 236, 145, 224, 209, 12, 189, 181, 181, 142,
+ 6, 211, 255, 195, 137, 10, 231, 134, 244, 223, 2, 192, 216, 170, 244,
+ 3, 128, 132, 231, 150, 12, 210, 164, 150, 104, 232, 225, 252, 210, 4,
+ 199, 178, 132, 135, 12, 147, 224, 226, 167, 14, 135, 143, 239, 248, 3,
+ 190, 250, 237, 208, 10, 147, 135, 174, 171, 6, 189, 157, 158, 198, 10,
+ 152, 177, 153, 236, 6, 191, 223, 190, 248, 14, 186, 138, 156, 164, 2,
+ 149, 132, 208, 195, 5, 194, 194, 155, 218, 7, 253, 246, 182, 184, 11,
+ 140, 169, 156, 177, 11, 227, 184, 167, 231, 4, 172, 213, 253, 232, 8,
+ 240, 193, 129, 233, 10, 223, 212, 219, 155, 2, 142, 193, 243, 221, 2,
+ 176, 192, 237, 244, 4, 178, 239, 243, 212, 8, 210, 200, 130, 32, 231,
+ 254, 253, 141, 6, 218, 244, 156, 198, 11, 152, 151, 178, 192, 13, 222,
+ 212, 142, 204, 7, 218, 137, 247, 160, 8, 143, 221, 142, 144, 14, 129,
+ 184, 237, 128, 10, 203, 237, 139, 172, 3, 149, 196, 216, 182, 11, 246,
+ 160, 169, 231, 15, 186, 238, 171, 161, 1, 176, 176, 172, 169, 7, 137,
+ 191, 244, 226, 13, 184, 161, 159, 26, 128, 167, 143, 134, 3, 242, 254,
+ 138, 225, 13, 238, 184, 199, 235, 10, 214, 203, 144, 197, 11, 198, 220,
+ 212, 219, 7, 229, 145, 174, 165, 5, 128, 209, 130, 231, 167, 13, 198,
+ 196, 179, 211, 7, 182, 132, 201, 165, 10, 235, 153, 140, 252, 3, 217,
+ 181, 237, 175, 7, 161, 199, 151, 225, 11, 142, 199, 232, 132, 4, 153,
+ 249, 128, 186, 1, 131, 229, 136, 172, 12, 159, 136, 140, 195, 12, 243,
+ 153, 183, 233, 5, 145, 158, 229, 145, 2, 193, 243, 236, 167, 14, 133,
+ 208, 141, 205, 14, 210, 174, 175, 132, 2, 172, 189, 148, 149, 10, 255,
+ 234, 231, 149, 9, 214, 219, 150, 166, 5, 173, 182, 178, 187, 3, 148,
+ 239, 172, 131, 4, 245, 159, 177, 254, 15, 154, 239, 200, 245, 15, 182,
+ 184, 242, 193, 13, 183, 158, 219, 253, 3, 129, 224, 170, 184, 1, 252,
+ 215, 217, 217, 2, 194, 206, 254, 158, 5, 232, 208, 253, 206, 11, 132,
+ 164, 179, 184, 14, 186, 220, 196, 189, 10, 185, 135, 128, 185, 2, 163,
+ 174, 141, 221, 3, 151, 252, 208, 215, 12, 176, 237, 217, 182, 6, 129,
+ 224, 235, 156, 7, 146, 177, 225, 230, 3, 208, 175, 157, 130, 8, 218,
+ 247, 170, 228, 14, 250, 251, 140, 220, 14, 212, 188, 171, 195, 1, 212,
+ 181, 207, 205, 2, 163, 184, 196, 185, 5, 142, 159, 199, 176, 1, 130,
+ 132, 140, 203, 9, 199, 243, 181, 168, 15, 247, 242, 204, 164, 11, 174,
+ 146, 198, 218, 3, 230, 163, 238, 136, 12, 130, 178, 158, 249, 10, 149,
+ 240, 204, 191, 9, 228, 204, 214, 170, 4, 242, 168, 132, 63, 207, 143,
+ 253, 240, 10, 210, 166, 246, 186, 2, 168, 216, 222, 162, 5, 241, 226,
+ 208, 203, 10, 216, 150, 185, 141, 12, 173, 130, 130, 204, 5, 167, 140,
+ 140, 158, 7, 224, 204, 187, 243, 4, 212, 247, 211, 212, 5, 171, 193,
+ 169, 172, 8, 148, 129, 238, 88, 239, 170, 248, 251, 3, 230, 252, 155,
+ 197, 3, 148, 170, 220, 211, 8, 172, 146, 230, 175, 6, 235, 243, 165,
+ 186, 4, 230, 158, 157, 187, 3, 207, 206, 145, 175, 14, 136, 225, 235,
+ 190, 15, 135, 175, 253, 169, 1, 155, 237, 177, 179, 5, 180, 161, 155,
+ 160, 15, 138, 133, 179, 167, 12, 221, 170, 176, 202, 6, 171, 131, 178,
+ 247, 8, 146, 239, 153, 251, 11, 128, 183, 154, 182, 3, 154, 240, 245,
+ 209, 6, 207, 137, 159, 171, 9, 192, 180, 191, 246, 11, 253, 143, 225,
+ 81, 222, 137, 185, 155, 8, 174, 188, 154, 243, 4, 170, 238, 166, 178,
+ 10, 226, 182, 176, 168, 7, 148, 175, 247, 131, 14, 173, 177, 239, 236,
+ 12, 177, 244, 159, 183, 7, 241, 150, 173, 185, 13, 232, 224, 169, 137,
+ 5, 207, 168, 131, 230, 1, 203, 162, 193, 202, 3, 131, 173, 240, 55,
+ 183, 215, 128, 137, 3, 133, 218, 167, 144, 14, 129, 136, 189, 243, 6,
+ 239, 157, 134, 248, 9, 172, 137, 234, 168, 2, 193, 240, 176, 235, 11,
+ 242, 148, 152, 217, 6, 253, 184, 164, 173, 11, 228, 215, 198, 128, 8,
+ 129, 252, 183, 6, 137, 250, 183, 210, 7, 208, 210, 163, 181, 6, 167,
+ 244, 197, 183, 15, 157, 215, 173, 242, 8, 211, 182, 254, 181, 12, 235,
+ 158, 194, 212, 9, 218, 154, 242, 147, 11, 220, 199, 237, 141, 12, 155,
+ 177, 201, 103, 254, 161, 191, 215, 8, 230, 235, 168, 49, 208, 227, 236,
+ 235, 12, 201, 130, 172, 253, 5, 180, 140, 169, 224, 15, 234, 243, 153,
+ 151, 12, 193, 190, 224, 143, 9, 129, 245, 133, 204, 8, 182, 209, 250,
+ 178, 8, 148, 139, 144, 193, 11, 230, 182, 245, 164, 7, 149, 204, 161,
+ 226, 14, 175, 229, 148, 166, 13, 148, 140, 189, 216, 3};
+ SeekableInputStream* const stream =
+ new SeekableArrayInputStream(buffer, ARRAY_SIZE(buffer));
+ const int32_t junk[] = {
+ -1192035722, 1672896916, 1491444859, -1244121273, -791680696,
+ 1681943525, -571055948, -1744759283, -998345856, 240559198,
+ 1110691737, -1078127818, 1478213963, -1999074977, -1236487259,
+ 1081623627, 1461835677, 1591726278, -1952575303, -1153279155,
+ -490105022, 1763048303, -438594735, -1943318570, 665131315,
+ -992928519, -293947656, -27330946, -1617757142, 1557060559,
+ -1979359235, -2141705600, -568297608, 1062268262, 546298983,
+ 2078961052, 1350125334, -540110141, 1306099749, -1200938434,
+ -463354631, 329439552, -1302441847, -432946209, 1420346281,
+ -1051221106, -938249481, -1539784278, -633078252, -978731267,
+ -1561373991, 1629413219, 1092825247, -261977862, 838447829,
+ 1919107204, 1005793149, 1642588377, -1464918718, -1917550168,
+ 1003195972, 959160789, -1522112553, 181240876, -849957699,
+ -458276524, -702660955, 1860227091, 1705852155, -1998443154,
+ -360453561, 1463660586, -887384402, -1905027152, -1872509609,
+ -437752724, -1567069256, 1418352668, -460909277, -91142489,
+ -1223052800, 1097921652, -989185632, 1256987999, -177351034,
+ -1872472137, -789485410, -2128291599, -754152008, -1726480917,
+ -344889284, -196409605, 2041828611, 1030395933, -387646305,
+ 122767355, 31127980, 1207069862, -1514210082, 1440969525,
+ -1135691553, -222758993, 1750352704, 1293980890, -723071736,
+ 1827770358, -1000870743, -1082906010, -696877983, -252767279,
+ -1516790960, 815546697, 2030795, -44506509, 103592117,
+ -1482662967, 927187170, 1038067053, -1372409134, -1839775706,
+ 1288569419, 347516536, 794972304, -547875330, 2037259599,
+ -14750351, -789333636, -1172998669, 1751028141, -123944284,
+ -148751174, -5812342, -65684309, -901631668, -1730780359,
+ 1806879062, -1718574994, -502581670, -593823449, -500334763,
+ 917654667, -371046539, -11224409, -22297703, 2077801437,
+ 717176386, 712742849, -88095446, -377406568, -1498205212,
+ 135250842, -186007254, 1575202718, 1356337350, 1333002980,
+ -151543238, 1561151504, 2084754583, -1963332201, -1078676964,
+ -981959640, 1602476694, 964958176, 514867609, -592880058,
+ -1053778417, -1385343398, -1679005019, -399336120, -1348025564,
+ -1859939973, -1900785472, 2029569822, -1292867632, 1842863184,
+ -1326742626, -1647402852, -1615986391, -2004041108, -1927387127,
+ -388061550, 90851556, -1327038179, -407516410, 1845693465,
+ -704577762, 1924407265, -449969353, 275696175, 252281945,
+ 1918737432, 1112818815, -518024058, -574451772, 2092342266,
+ 1855130091, 1320798104, 1779205566, 385703114, -712984113,
+ 669819902, 1909731996, -1278685109, -1569593296, 881748738,
+ 1814556074, 1613634931, -385688735, -2071646094, -1773311888,
+ -118425506, 1721906928, -1349880393, 634302005, 760022405,
+ 761327810, -1692081627, -1260841016, -1558856657, 99218556,
+ 102018195, -1297008771, 164461607, 1741035812, -148600274,
+ 1286275446, 1462487726, -1573434104, 974796426, -2059029469,
+ -2104104269, 1866472414, -1650530055, 1795114251, 227296321,
+ 1276834604, -1985393888, -9742901, 1614958392, -659109655,
+ 2088223942, 223737183, 1388180799, 1359284324, -842845969,
+ 444248480, 792108792, 960424293, -507555799, -1025721075,
+ -1858379658, 1074173329, 1761578021, 425012429, -240453397,
+ 2024456210, -324280184, 1589154548, -1728079451, -1692822665,
+ 1356528899, 1061517573, 1931192379, 1215961560, 379843745,
+ 1361149623, -1461456893, 691110717, 2110459487, 1321410812,
+ 2009528006, 1972104767, -2012050687, 1233284414, -182949706,
+ 1670813423, -1217301469, 594224363, -299708512, 1966313463,
+ -369112947, 403641651, 88148317, -2024969729, 1133025739,
+ 314550032, 1534341518, 1083403602, -894963971, 1635904390,
+ 1275196039, 1770867136, 2003009771, -545216468, -135106165,
+ 1264541893, 2022351708, -1678497814, 151276090, 1283919338,
+ -1927614553, -206580733, 1377848647, 627609963, 1823245450,
+ -134096555, -574753171, -1744928487, 1723985722, 1167154655,
+ -599470685, 546628481, -100331078, 1303120516, 2089166733,
+ -2022155954, 456890992, -1002705650, 2049833270, 656668907,
+ -44900108, -525666700, -2134854916, -1778466708, 1099709483,
+ 1753848159, 10966731, -932986450, -1244650092, -735949590,
+ -836092531, -1380029252, -255869471, 259782569, -1689370655,
+ 1681300596, -998144311, 1911129837, -863029750, -402765462,
+ 1405052444, 325400133, 950928169, 1208813323, -1927723497,
+ -36025530, 1953996718, -130830587, -2119112967, 828918159,
+ 1875760908, -2110464282, -660032043, -1445534969, 1754377388,
+ 859122691, 79215786, -1457379224, -845505501, 715141530,
+ -1483079760, 1668988417, 1189423270, -2117214918, 439999476,
+ -1399973705, 467038141, -494559901, -1783874778, -1599666356,
+ -199642314, 1949521512, 1089253553, 645336035, -25686436,
+ 1450983836, 267113484, 73222595, -497431960, 438311597,
+ 442985552, -1777857701, 933496376, -1594540214, -505331447,
+ -1315298171, 1016898878, 966042892, -17718192, 1045280014,
+ -39208518, -2060138094, 1282411033, 1713508214, 1851842091,
+ -58654482, -474136376, 817547251, 296628738, -1658010049,
+ -299884790, 57814971, -1222513255, 869855902, 1558100706,
+ -905854845, -1326530206, -1092944323, 1103855106, 4668482,
+ 1477604451, -590041139, -1784763556, 282428730, 231044577,
+ 269278752, 1575627255, 1500279352, 1910855259, -1419761648,
+ 318679302, 107160345, -1193714758, 2041235102, -336170699,
+ -54224496, -676354875, -84222627, -175409338, 1620482599,
+ 1176801242, 1844243729, 77824988, 1438587267, -215131942,
+ -1383445358, 953582743, 1451014635, -2068176283, 1058786294,
+ -1159701920, -1767170805, 1441979975, 769713646, 540064005,
+ 525602292, -747255503, -1765923984, -1528763385, -1768241306,
+ -1168806485, 33265046, -1719626574, -2061575599, 2013225545,
+ -145323579, -1616398994, -1614251879, 101988124, 389536785,
+ -531784107, 1942497305, 1624000107, -1793946656, 1590267653,
+ 1454383595, 1477205155, 956593869, 341516726, -1104289307,
+ 1711827851, 1007508942, 392549262, -162616731, 1233239227,
+ -254895976, -932430285, 1789001809, -258910386, 1829247809,
+ -1165175134, 2050581337, 1388743894, 1680716327, -955562689,
+ -1196112257, -378740336, 1404672016, -1064391892, -209744680,
+ -1303657512, -1534080366, 1819175133, -1168503149, -144324898,
+ -1139294027, 2079078892, 830990906, 1281643674, 1239967702,
+ -36455895, -893671880, 1538788093, -1237546291, 1206167541,
+ 475817626, -1186645912, 539232082, 1475246031, 1699396377,
+ 838282081, 717780380, -1411838673, -1709848232, 504121732,
+ 627189322, 658484370, -538814615, 1922572726, 1004837510,
+ -1174373674, -1655080299, 2094525857, 355981927, -1636557369,
+ 1393356380, -624347557, -247050857, -2061086215, 119232442,
+ -549875608, 1132263989, 1210157527, -1950418773, -947805470,
+ 1473662460, -2039682564, 1511364593, 495775222, -1754933852,
+ 578631361, 974692628, 619906194, -270694776, 827229974,
+ -626474838, 217984348, 233537160, -1394225003, -1325367080,
+ -825843995, -19309008, -1051287340, -1709856232, -1924609250,
+ 1220845849, -262498310, 1923705916, 550530570, 475633807,
+ -2108852076, 407081906, 1182356405, 1433463924, -717627223,
+ 1985302280, 52562276, 462014051, 1892679691, -1685829454,
+ -432280051, 102304536, 1239474159, -1947789045, 1873237513,
+ 1162364430, 378408895, 436753561, 1401009671, -331011223,
+ -687420625, -2112505195, 1253509443, -1326656490, -223810162,
+ -1795864663, -1635326216, 1135351856, 1072514654, -1135464197,
+ 1799420691, 1814080421, -1813241050, 290458473, 1787024936,
+ -509812455, -1245699451, -1246584042, -628864855, 208287497,
+ -941972342, -282245339, 764591445, -1950018765, -155292605,
+ -2039181884, -1502132499, 927879221, -422951277, 1004131436,
+ -359377619, -279608972, 1795607112, -504889242, 491472122,
+ -1473703617, 1384613229, -1074969712, -132300841, -809001677,
+ -232707109, 718661985, -340386147, 1025520864, -1304716522,
+ -159879127, -21609621, -356163924, -1650473131, 749957789,
+ 1659608794, -1630468361, 1401873890, -826137197, -1745267595,
+ 2096479616, 108095652, -1070305783, 510702184, 681305350,
+ 1581358841, 373543787, -548048778, 1236252978, 1136594889,
+ 571396628, 1563585072, -777040205, 884269272, -1395109355,
+ -729593757, -2049576534, -881926310, 184632886, -1534407215,
+ -113998263, 1897790924, 827242297, -1493910748, 207409078,
+ -1381606655, -1469137051, -1357021665, 574603572, 1818109562,
+ -1462551998, -523689962, -784515708, -1377438213, 1830890656,
+ 1018570656, -563746679, 548954202, -1075828514, -284139126,
+ -1646923990, -1248333415, 1376072552, 328385806, 1612310907,
+ -2015376329, -1482101887, -1268324540, -1073923297, -47412286,
+ -2031027134, 1724101079, 1882428903, 1226325168, 1060182497,
+ -1439411972, 2068153022, 303922543, -1010299082, -527725408,
+ -868084535, -1453122373, 729915259, -1810431776, -1170855222,
+ -1854748370, 1653489747, -1024923202, 1166795461, 888317469,
+ -1095897075, -263817712, -1283775515, 128113057, 725330985,
+ -340535877, 1390305951, -2040698894, 295390234, 519468833,
+ -179502313, -404230148, 1113834573, 217655255, 697236931,
+ -447996629, -2135157399, -1308291941, -1886148018, -161160567,
+ -67015720, 360555652, -598747970, 865850123, -802081914,
+ 1971145595, -477789992, -776787161, 1811555304, -1735200791,
+ 1891237826, -986816598, -1744598976, -2001145934, -171623446,
+ -2118892089, 1848264734, -5746591, 1454333734, 1273178226,
+ 1030190121, 1328203427, -29346019, 314597102, -2012456161,
+ 1933656313, 787872740, -160286555, -1438880998, 387705257,
+ 1464818923, 1301041719, 1984783124, -1271147531, 1487861786,
+ 1042524116, 674928812, 333826191, -606298314, -1923024690,
+ 1893750313, -1852775093, 1334513545, -1533194135, 1098945727,
+ -1334639594, 982450114, -1887582219, 811767569, 1106779430,
+ 2128521241, -801057249, 1282615287, 1309743218, -1036048202,
+ -675233917, -2015133312, -1407597980, -805866925, 181664900,
+ 1428234298, 987551693, -1305045729, -269637537, -161413456,
+ -1825022695, -464152091, 594365822, 1788687667, 173257923,
+ 654002541, 1479451402, -343189027, -1317263094, -1632991710,
+ 434048963, -218104622, -2116821787, 1386879989, 2024277523,
+ -1683903287, 71981107, 1472146048, -954561185, -1929121846,
+ -99278026, -836661907, 1161399456, 645528633, -229497534,
+ -559683816, -347755982, 846036498, 1843706545, 1988047218,
+ -1916349205, -2046432139, 2117711385, 768342313, -1484993937,
+ -1028574363, 1840103616, 2024908549, 648097475, 820426188,
+ 1813302864, -1862894699, 1853557023, -645272638, -694642746,
+ -484368118, 770488839, -1646124786, 453988784, -771295390,
+ -459096420, -646899732, 2068517271, 1404075720, -1273309905,
+ -582314992, -671770951, -449870790, -48212202, 1579880938,
+ -476878341, 513340524, -153887805, 1303129644, -1736144611,
+ 1997437163, -551454447, -539393656, -410513105, -1730002043,
+ 787100897, 156300478, 1655413827, -1314804760, -1017480219,
+ 725995733, 837174883, 125404041, -1422872133, 174985128,
+ 506105285, -2026814521, 122957373, 2018881699, -1531346372,
+ 304642639, -1823589353, 738989518, 842332146, -728492095,
+ 691793049, 1406905611, 848671060, 2146403475, 921228493,
+ 503081371, 470135884, -1234198474, 1009780684, 1177932262,
+ -34718027, 1887702646, 1052893773, 443393403, 512495115,
+ 2136307494, -1438483586, 476864340, 2001989218, 1022729211,
+ 1855835926, -1410262945, 1184679065, -689936200, -961809237,
+ -37581255, -1029470257, -1480892021, -401541056, 453202492,
+ 1112595988, -1405938761, -941021151, 65758395, -496021405,
+ 179072114, 126610419, 711573643, -176404016, -797304305,
+ 1052539142, -531326266, 137188861, -1079747744, 104921070,
+ -1577624885, 1214665426, 1079238997, 72216401, 892964488,
+ -285963924, 383166184, -2067986431, -947233029, 1764614742,
+ 1564468029, -421298612, -1385150219, 1439869174, -871204348,
+ 55432339, 1314201120, -1685876151, -1703440863, 300653337,
+ -1977445820, -2046493998, 309926543, 378495260, 1958992069,
+ -1113354289, 1792193921, 1263805225, 1373967983, 1352521646,
+ -60235319, -675624276, 26323830, -591835714, -1885760899,
+ -1519938874, -1998677577, 375457065, 1000260823, -1450154397,
+ -1864463548, -1911093470, 1114002525, 833260426, -571392737,
+ 1004593242, -204444351, -336121865, 710431300, 1173033399,
+ 376979722, -1770804506, 85447404, -484026297, -1606360717,
+ -1582593999, 1774771216, -1889730446, -669358702, 641183901,
+ -1956035797, -479348625, -619594436, 1553417071, -1684327868,
+ -583232007, 587191975, 1019475270, 1299447851, -1094565055,
+ 726858321, -22214620, 254396674, -1515419866, -491290335,
+ 870396109, -589067521, -998073181, -585938741, -27198124,
+ -1998075025, -1636627064, 1922665080, 907579634, 245499901,
+ -1491598228, 1290991525, -417806055, 783775795, 987384289,
+ -1664073398, -1818943544, 518513473, -1972739863, -17076929,
+ -1087578336, -2130735619, -11897175, 1247714779, 39549495,
+ 1424721195, -136636068, 1321021406, -699064532, -212744195,
+ -637015415, 1346785936, 1615976060, -735897892, 869318776,
+ -2064220631, 1083884434, 2142530152, 1941891810, 1747938419,
+ 584211108, 1430935230, 673025767, -1522239571, -349593668,
+ 534830840, 1439125060, 221479859, -1665023212, -711033607,
+ 1976217028, 1419826523, -912609625, 1879662953, -683020553,
+ -1481737184, -1453041404, -419287042, 1209159969, -810667515,
+ 328331620, -554189827, -38670987, 758568371, 392902911,
+ 805013760, 831693442, 102032100, -339108454, -1214835719,
+ -535575827, 471829764, 1693706596, -692180874, -1274840224,
+ -1770889195, 316972034, 3851814, -1468783678, -2077914158,
+ -93044504, 112417435, 301526952, -1801634537, 2040682218,
+ -1540074671, 18395658, -1607816833, 399285522, 2083671022,
+ -1317230535, -1170230962, -1622288205, -1668653877, -1756594160,
+ 1595947921, 335177918, 106581426, -1517810326, -536170364,
+ -1434305965, -723341079, 1614963411, 478361139, 1696920219,
+ 994126447, -341229183, 1677955939, 329146836, 1508992901,
+ 2013205637, -1291068239, 603886509, 786926191, -1973621955,
+ 1873694716, -1987964717, -151056773, -982178036, 108200448,
+ 1655973323, -643471294, 65458253, -417971646, 1919828738,
+ -933763539, 171755140, 460249395, -1125856734, -16739013,
+ -2135736362, 670532752, 203611302, 1890987520, -1483650118,
+ -683059705, -77856848, -769700368, -1416934159, -1500850145,
+ -1215059687, -1879473955, 763272883, -1968038568, -2088236599,
+ 1229512381, 2032523562, -474604143, 1842260297, -776126031,
+ -1134008029, 2108556714, -1880709482, 237393986, -69243134,
+ 782458217, -1259490761, 1788876781, 1549080823, -859428959,
+ -605940335, 1018516449, -207426450, 570450064, -1389536390,
+ 592346118, -1331252532, 165183113, -2099745269, -1356937151,
+ 21822807, -748103470, -142393926, 547509238, -634686347,
+ 1026300111, 1330070951, 1759768014, 1142187544, 968694111,
+ 1213038901, 1885970361, -323991688, 363518085, -1672108324,
+ -804048074, -1363075581, 1701273418, 1911929917, 44447185,
+ 410977169, 1365427170, 959862646, -948485651, -1849935396,
+ -1009669619, -1801379401, 525897667, -1955430097, -1448024640,
+ 1931243285, 1760694954, -1371363748, -846322684, -381157475,
+ -1725454696, 411728882, -659046884, -2017706580, 326504936,
+ -1505447249, 754195181, 1338936701, -176693411, -399948493,
+ -1610399078, 1200783875, -1135651738, -1011062025, -1590458258,
+ 1626934009, -2101147741, -832223620, 310144767, -1670647754,
+ -139535449, -511393081, -1608439153, 1179987525, -566009345,
+ 1746424843, -972134987, -35190731, -123082125, -1225027672,
+ -12768196, 57860780, -1169964405, -896086188, -73457590,
+ -1695925182, -585143980, -419933289, -1162453564, 755390220,
+ -827933852, 1698208124, 1044230129, 608724853, -164275430,
+ -102639739, 1198259549, 1137531989, -1116284630, -1356999201,
+ 1957170687, -1307732933, 1650483544, -638806570, -1154478179,
+ 173151245, 1569845123, 1591311498, -494240657, 841880275,
+ 975655140, 2144072931, 2093762562, 904195791, 1304524183,
+ 887864513, 555382367, 747757520, -988490149, 1029722488,
+ 462127468, -865600825, -355339627, -717869189, -1460719471,
+ 1840667290, -1467405670, 2066471254, 1682556574, 1506116152,
+ -1015317971, -959194655, 557427179, 1449151468, -966928570,
+ -862440014, -934483264, -382993930, 1831680277, 1962868691,
+ 1253985271, 605568087, -1920103611, -162069068, -1335381537,
+ 1466515436, 755896709, -152123806, 120724324, -2090132802,
+ -749600957, -513332999, 1697220972, 620702104, 1086605242,
+ 2010555991, -1344440189, 573125337, -412489169, 2089343506,
+ -887304007, 1741382282, -1294582154, 1399959211, -1465487414,
+ -987925582, 2054751675, 1026201859, 1347793280, -581133591,
+ -1900657794, 1589876493, -555443627, -1711864278, 431235807,
+ 1209591236, 248744027, 1750009325, -120886245, 1610195551,
+ -1381986720, 459223925, 2001423958, 1232784067, 1433853260,
+ 1951017062, 654934426, 1804947769, 1931534286, -495608934,
+ 1933837599, -1427818655, 2140007479, -713427148, 1995046348,
+ -1203051419, -35213147, 774683636, -592086787, 630139504,
+ 1094303075, -1180875316, -438823034, 2117971809, -646196848,
+ -1103051341, 798549240, -1510953796, 1429065214, 704873658,
+ 23191330, -1585169696, 1840542299, -591215524, 664793562,
+ 357997439, -963884906, 1863569433, -54498563, 991835977,
+ -716807557, 376575045, -591189325, -449494294, -404697696,
+ 1597459996, -108456969, -1899610313, 1255661956, 794700058,
+ 1115324758, 658753699, 1719389172, 1145101705, -1181105674,
+ -1622578697, 233308080, 1451926619, 1322377945, -1800772031,
+ -810522289, -1951656903, -527284125, 1582214669, -1422750113,
+ 2022685659, -1026827239, 634221522, 1270282959, -414347878,
+ -1553149269, 1740116911, -1502502767, -697241533, -1983623628,
+ -1015070121, 1211501947, -582055635, 1158111423, 1409791816,
+ -1650549906, -1289481332, -372952307, 698533442, 1253310137,
+ 126736648, 112219028, -260924425, 538236607, -8216695,
+ 1258254795, 1250497022, 695411926, 1807753300, 1141377444,
+ 378228237, -1640510094, 1772051547, 819577690, 1637064807,
+ 2004518786, 1726570960, 1237945196, 1890473363, 142532121,
+ 431642362, 457680616, 1793421134, 114108497, -1609966004,
+ 390875116, 865761473, -284648597, -1270011981, 1801086569,
+ -450041354, 34724667, 1461207855, -1563369174, -1193357688,
+ 890744002, -303462235, 306064148, -1714379878, 1160621852,
+ 782444786, 1760233647, -251092726, -1239728484, 451165251,
+ -971693596, -1492590107, 1251141173, 1068753315, 127560712,
+ -675602404, -1943731429, -1442429995, -1268600807, 783284088,
+ 960921257, -523462813, 1709486406, 1864033227, 32424581,
+ 1723753974, 771274676, -606900633, 1574849281, 169589480,
+ 1895990143, 1261911764, -1663992892, -1265826537, -1801917442,
+ -397409604, -1627809111, -1438727684, -932073653, 1633378602,
+ 357569766, -1297857029, -423873023, 1795321075, 1274442352,
+ -1876019716, -897691833, -1569633677, -2028495970, 1542882873,
+ -1573770995, 2119629948, -85633780, -936570279, 1393887381,
+ -488600436, -1684597311, 1160585918, 1519028021, 1044513341,
+ 150838164, 1792373530, 653285210, -1248666111, -191831605,
+ -1891529144, 1283118544, -1276441843, -1223401980, -425946279,
+ 1970465599, 721976469, -1505253544, 45266924, 1806828024,
+ -2070615494, -908596292, -1117250622, 331420657, -660185152,
+ 589879211, 210813328, -456387314, -1601399804, 198300673,
+ -1995521769, 334183842, 665336620, -1793671638, -2082929039,
+ -782763724, 1042860064, 225541755, -622972242, 591101317,
+ -1369419464, 1119101151, -1294175841, -400470575, 606374965,
+ 94458805, 864049393, -1426575787, 1101993333, 1230205177,
+ 497689724, 1211124753, -1512147358, -1012370015, -505814051,
+ 120429775, 2145165099, -1502901431, -474452671, 422689635,
+ -1152989257, 962267153, 280766852, 1196475443, -784471083,
+ 1169426630, 1749421434, -1606879751, 1300649655, 1121651007,
+ -1400613749, -724755769, 1045426592, -1958011127, 573427844,
+ -1811022517, 682117596, -143934906, 318890668, 198143572,
+ 103973361, -1685647151, 287732246, -260465004, 30450195,
+ -48654426, -1321307339, 1256181927, -1817379876, 1951739521,
+ 1537791157, 1145153363, -1463184068, 255386052, -872356472,
+ 1042060305, 1244057261, -968333667, 100042814, 2043095500,
+ -890128213, 637318503, -1322924135, 858281960, -1489122205,
+ 1572431441, -2013618080, 1719193989, -1529545814, 1940436029,
+ 2032604275, 320677124, 1625574448, -694531606, 1157552927,
+ 25905097, -1627753995, -1172357804, 1366114995, 941005378,
+ 2066816251, 1205853530, -1922099373, -22435800, 431025724,
+ 1372468641, 21209867, -1272633083, -867481634, -1918735176,
+ -534439129, -1383070475, -1696306012, -1724096388, -58061119,
+ -943955536, 136028839, -614980418, -1778996582, 1807875015,
+ 675695942, 1790527522, -528619987, 904810541, 950940251,
+ 1238041417, -8393215, -1446028808, -682467665, -1544440270,
+ -1472962754, -586815383, 2070764975, 648828893, 1113860227,
+ -2116497111, -2080720801, -1049969615, 1067459574, 2052227797,
+ -157225074, -437791713, -260129125, 558283708, 772040010,
+ 237325318, 1564229865, -935958297, 1037756822, -1223152552,
+ 1757318298, -740036263, -1268851110, -303275906, -1279892542,
+ 902459799, -952359270, -339785735, -1901753053, 1389276010,
+ -2063575830, 1460647369, -1640863297, 1336454626, -1497707231,
+ -1927020015, 1252446125, 368146846, 1594936652, -1500704261,
+ 1661025840, -1850727093, 2144637413, -1158186535, -1831026939,
+ -889829783, 1953368389, -1746264692, 166004987, -1043094955,
+ -2125018538, -1373375703, 1892792265, -1058706557, 248660620,
+ 1346423769, -1942180581, 608538378, 1244024597, -2056829184,
+ -1772818706, 199415397, 1546106190, 440423328, -693070051,
+ -778903177, -919523719, 1402659839, -1735022534, 1622883535,
+ 752024237, -1490765944, -1388651615, 931076141, -2047594739,
+ 1103847150, -704725814, 741564741, 1040069436, 1726406295,
+ 61122015, 1437812418, 1111368208, -108948681, 1619816846,
+ 910911373, 556728709, -802734918, -976030177, -1752026546,
+ -1860499981, -169913498, 60202440, 904411442, -240036733,
+ 1049955136, 1551675456, 1153662447, -770639649, -864966225,
+ -1589042799, 362084868, -811531736, 1691619679, 1948514866,
+ 1648879429, -82248915, -119887768, -1320899500, 1313770646,
+ -2019806760, 757267630, -1385371012, -2044161290, -1090682389,
+ 951372426, -1625173167, -1450498369, -446130452, 1245989289,
+ -416577865, -1877776085, -385296034, 257025076, 617642011,
+ -269486566, -283804879, -1469334479, -1010626032, 1903591953,
+ 976050042, -1690852568, -2113044709, -2040081275, -1385916058,
+ 1681025033, -458183734, 804037210, -1357816212, -2122604724,
+ -1620519414, -1081733978, 897828801, -338474662, 408632762,
+ 353159097, -643223791, 1432419115, -1063925125, -1283707295,
+ 1600221803, 1966239023, 1602403017, -1896954519, -465007748,
+ -1931897742, 1728032847, -990776477, -215620097, 24160762,
+ 1088990582, -1661538137, -413783692, -993213325, 1792858174,
+ -1785449573, 530278927, -1123162098, 1527518801, -994386637,
+ 1980429555, -767184731, 507106901, 288876987, -1165022305,
+ -1445201374, -2062378367, 1930504660, -1444470104, -538236127,
+ -2101929870, 1657423803, 594868136, 641992148, 423166527,
+ 1570659537, 2143752822, -1587338457, 938659018, 444521303,
+ -1613249129, 329284644, 879036249, -244264840, 214041075,
+ 2021893527, -841410460, -1997684084, -1335815733, 330146293,
+ -810388560, 461221786, -331575574, 1114860921, -1823171172,
+ 1460924447, 746653722, -1823299441, -1709862982, -1072627187,
+ 1272515033, 1696334966, -820424031, -1352171498, -369000884,
+ 524637728, 1634525440, 109234473, 623876212, -1617988772,
+ -1920751626, -529392580, 1426964127, -850772426, -1415825247,
+ 918760524, -2005391328, 306414237, -741998859, 1034121377,
+ -1535565247, 1528007238, -645197362, 1183823190, 1452290168,
+ -297497904, 366899271, 659402776, 1162771417, 33575465,
+ -819969972, 1550032173, 1812350412, 1019335983, 1108271725,
+ -1895946056, -1343073793, -448887654, -1533743371, 2121607227,
+ 169180061, 982879256, -1848545221, 27519068, 409070016,
+ 1846632377, 1454960183, 1548882667, 1035638563, -710263923,
+ -1786568873, 1026978083, 1381572891, -532776566, -989703533,
+ -1578299857, 541921735, -195042893, -1656822082, -1680966160,
+ -781641338, -287090569, -1920834785, -1959900163, 273017769,
+ 1364365142, -1230830272, 711120619, -464932247, 540384202,
+ -2145789947, 2136546253, 1813925403, -534472604, -193288193,
+ 362493438, 703583137, 1559213108, 1938188546, 1406703389,
+ -328204765, -500280210, -1702502156, 862665560, -969766913,
+ 510405705, 1076079592, 1984257517, 1975623421, 204828458,
+ 349826410, -731418130, 185132999, 1286701313, -2055650532,
+ -1514773692, 497599639, 1619904755, 1469303937, -1274649611,
+ 581620530, 66095673, -1460642792, 330221993, 707515924,
+ -1421482169, 1624712620, -750796951, -971080468, 657945392,
+ 759856618, -1120219222, 93175882, -532613816, 475234099,
+ 1161529994, 855426198, -597998838, 464758707, -1928475560,
+ 2079160388, -178236356, -724974414, 2047043674, 1651925317,
+ -883296943, -1198932182, 1605581769, 459492800, 891206669,
+ -1253302888, 1600646432, -85730303, 1102520943, 657674007,
+ 1394924437, 981863857, 1883171786, -1724771415, -997457177,
+ -1804969401, 680867892, -241199656, -480782502, -58592066,
+ -412095964, -1896150659, -926392833, -1333839736, 311247446,
+ -1588993057, 898827577, -1523879487, 1074320882, -6749953,
+ -1025965701, 861172904, -2071510292, -1193653711, -1667222954,
+ -1296582582, 1497253549, 1625141742, -108604494, 1165486207,
+ 51714803, 1723701480, -802521253, 2114265882, 1634942197,
+ -1224478625, -1153482049, 1127175259, 1544684234, 978234803,
+ -1982083851, -1784846680, 495428362};
+ const uint32_t fileLoc[] = {
+ 0, 0, 0, 0, 0, 3, 3, 3, 3, 6,
+ 6, 6, 6, 9, 9, 9, 9, 12, 12, 12,
+ 12, 15, 15, 15, 15, 18, 18, 18, 18, 21,
+ 21, 21, 21, 24, 24, 24, 24, 27, 27, 27,
+ 27, 30, 30, 30, 30, 33, 33, 33, 33, 36,
+ 36, 36, 36, 39, 39, 39, 39, 42, 42, 42,
+ 42, 45, 45, 45, 45, 48, 48, 48, 48, 51,
+ 51, 51, 51, 54, 54, 54, 54, 57, 57, 57,
+ 57, 60, 60, 60, 60, 63, 63, 63, 63, 66,
+ 66, 66, 66, 69, 69, 69, 69, 72, 72, 72,
+ 72, 75, 75, 75, 75, 78, 78, 78, 78, 81,
+ 81, 81, 81, 84, 84, 84, 84, 87, 87, 87,
+ 87, 90, 90, 90, 90, 93, 93, 93, 93, 96,
+ 96, 96, 96, 99, 99, 99, 99, 102, 102, 102,
+ 102, 105, 105, 105, 105, 108, 108, 108, 108, 111,
+ 111, 111, 111, 114, 114, 114, 114, 117, 117, 117,
+ 117, 120, 120, 120, 120, 123, 123, 123, 123, 126,
+ 126, 126, 126, 129, 129, 129, 129, 132, 132, 132,
+ 132, 135, 135, 135, 135, 138, 138, 138, 138, 141,
+ 141, 141, 141, 144, 144, 144, 144, 147, 147, 147,
+ 147, 150, 150, 150, 150, 153, 153, 153, 153, 156,
+ 156, 156, 156, 159, 159, 159, 159, 162, 162, 162,
+ 162, 165, 165, 165, 165, 168, 168, 168, 168, 171,
+ 171, 171, 171, 174, 174, 174, 174, 177, 177, 177,
+ 177, 180, 180, 180, 180, 183, 183, 183, 183, 186,
+ 186, 186, 186, 189, 189, 189, 189, 192, 192, 192,
+ 192, 196, 196, 196, 196, 200, 200, 200, 200, 204,
+ 204, 204, 204, 208, 208, 208, 208, 212, 212, 212,
+ 212, 216, 216, 216, 216, 220, 220, 220, 220, 224,
+ 224, 224, 224, 228, 228, 228, 228, 232, 232, 232,
+ 232, 236, 236, 236, 236, 240, 240, 240, 240, 244,
+ 244, 244, 244, 248, 248, 248, 248, 252, 252, 252,
+ 252, 256, 256, 256, 256, 260, 260, 260, 260, 264,
+ 264, 264, 264, 268, 268, 268, 268, 272, 272, 272,
+ 272, 276, 276, 276, 276, 280, 280, 280, 280, 284,
+ 284, 284, 284, 288, 288, 288, 288, 292, 292, 292,
+ 292, 296, 296, 296, 296, 300, 300, 300, 300, 304,
+ 304, 304, 304, 308, 308, 308, 308, 312, 312, 312,
+ 312, 316, 316, 316, 316, 320, 320, 320, 320, 324,
+ 324, 324, 324, 328, 328, 328, 328, 332, 332, 332,
+ 332, 336, 336, 336, 336, 340, 340, 340, 340, 344,
+ 344, 344, 344, 348, 348, 348, 348, 352, 352, 352,
+ 352, 356, 356, 356, 356, 360, 360, 360, 360, 364,
+ 364, 364, 364, 368, 368, 368, 368, 372, 372, 372,
+ 372, 376, 376, 376, 376, 380, 380, 380, 380, 384,
+ 384, 384, 384, 388, 388, 388, 388, 392, 392, 392,
+ 392, 396, 396, 396, 396, 400, 400, 400, 400, 404,
+ 404, 404, 404, 408, 408, 408, 408, 412, 412, 412,
+ 412, 416, 416, 416, 416, 420, 420, 420, 420, 424,
+ 424, 424, 424, 428, 428, 428, 428, 432, 432, 432,
+ 432, 436, 436, 436, 436, 440, 440, 440, 440, 444,
+ 444, 444, 444, 448, 448, 448, 448, 452, 452, 452,
+ 452, 456, 456, 456, 456, 460, 460, 460, 460, 464,
+ 464, 464, 464, 468, 468, 468, 468, 472, 472, 472,
+ 472, 476, 476, 476, 476, 480, 480, 480, 480, 484,
+ 484, 484, 484, 488, 488, 488, 488, 492, 492, 492,
+ 492, 496, 496, 496, 496, 500, 500, 500, 500, 504,
+ 504, 504, 504, 508, 508, 508, 508, 512, 512, 512,
+ 512, 516, 516, 516, 516, 520, 520, 520, 520, 524,
+ 524, 524, 524, 528, 528, 528, 528, 532, 532, 532,
+ 532, 536, 536, 536, 536, 540, 540, 540, 540, 544,
+ 544, 544, 544, 548, 548, 548, 548, 552, 552, 552,
+ 552, 556, 556, 556, 556, 560, 560, 560, 560, 564,
+ 564, 564, 564, 568, 568, 568, 568, 572, 572, 572,
+ 572, 576, 576, 576, 576, 580, 580, 580, 580, 584,
+ 584, 584, 584, 588, 588, 588, 588, 592, 592, 592,
+ 592, 596, 596, 596, 596, 600, 600, 600, 600, 604,
+ 604, 604, 604, 608, 608, 608, 608, 612, 612, 612,
+ 612, 616, 616, 616, 616, 620, 620, 620, 620, 624,
+ 624, 624, 624, 628, 628, 628, 628, 632, 632, 632,
+ 632, 636, 636, 636, 636, 640, 640, 640, 640, 644,
+ 644, 644, 644, 648, 648, 648, 648, 652, 652, 652,
+ 652, 656, 656, 656, 656, 660, 660, 660, 660, 664,
+ 664, 664, 664, 668, 668, 668, 668, 672, 672, 672,
+ 672, 676, 676, 676, 676, 680, 680, 680, 680, 684,
+ 684, 684, 684, 688, 688, 688, 688, 692, 692, 692,
+ 692, 696, 696, 696, 696, 700, 700, 700, 700, 704,
+ 704, 704, 704, 708, 708, 708, 708, 712, 712, 712,
+ 712, 716, 716, 716, 716, 720, 720, 720, 720, 724,
+ 724, 724, 724, 728, 728, 728, 728, 732, 732, 732,
+ 732, 736, 736, 736, 736, 740, 740, 740, 740, 744,
+ 744, 744, 744, 748, 748, 748, 748, 752, 752, 752,
+ 752, 756, 756, 756, 756, 760, 760, 760, 760, 764,
+ 764, 764, 764, 768, 768, 768, 768, 772, 772, 772,
+ 772, 776, 776, 776, 776, 780, 780, 780, 780, 784,
+ 784, 784, 784, 788, 788, 788, 788, 792, 792, 792,
+ 792, 796, 796, 796, 796, 800, 800, 800, 800, 804,
+ 804, 804, 804, 808, 808, 808, 808, 812, 812, 812,
+ 812, 816, 816, 816, 816, 820, 820, 820, 820, 824,
+ 824, 824, 824, 828, 828, 828, 828, 832, 832, 832,
+ 832, 836, 836, 836, 836, 840, 840, 840, 840, 844,
+ 844, 844, 844, 848, 848, 848, 848, 852, 852, 852,
+ 852, 856, 856, 856, 856, 860, 860, 860, 860, 864,
+ 864, 864, 864, 868, 868, 868, 868, 872, 872, 872,
+ 872, 876, 876, 876, 876, 880, 880, 880, 880, 884,
+ 884, 884, 884, 888, 888, 888, 888, 892, 892, 892,
+ 892, 896, 896, 896, 896, 900, 900, 900, 900, 904,
+ 904, 904, 904, 908, 908, 908, 908, 912, 912, 912,
+ 912, 916, 916, 916, 916, 920, 920, 920, 920, 924,
+ 924, 924, 924, 928, 928, 928, 928, 932, 932, 932,
+ 932, 936, 936, 936, 936, 940, 940, 940, 940, 944,
+ 944, 944, 944, 948, 948, 948, 948, 952, 952, 952,
+ 952, 956, 956, 956, 956, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 960, 960, 960, 960, 960, 960,
+ 960, 960, 960, 960, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 964, 964, 964, 964, 964, 964,
+ 964, 964, 964, 964, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 968, 968, 968, 968, 968, 968,
+ 968, 968, 968, 968, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 972, 972, 972, 972, 972, 972,
+ 972, 972, 972, 972, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 976, 976, 976, 976, 976, 976,
+ 976, 976, 976, 976, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 980, 980, 980, 980, 980, 980,
+ 980, 980, 980, 980, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 984, 984, 984, 984, 984, 984,
+ 984, 984, 984, 984, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 988,
+ 988, 988, 988, 988, 988, 988, 988, 988, 988, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 992, 992, 992, 992,
+ 992, 992, 992, 992, 992, 992, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625, 1625,
+ 1625, 1625, 1625, 1625, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255, 2255,
+ 2255, 2255, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886, 2886,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515,
+ 3515, 3515, 3515, 3515, 3515, 3515, 3515, 3515, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149, 4149,
+ 4149, 4149, 4149, 4149, 4149, 4149, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784, 4784,
+ 4784, 4784, 4784, 4784, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419, 5419,
+ 5419, 5419, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047, 6047,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676,
+ 6676, 6676, 6676, 6676, 6676, 6676, 6676, 6676, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308, 7308,
+ 7308, 7308, 7308, 7308, 7308, 7308, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943, 7943,
+ 7943, 7943, 7943, 7943, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575, 8575,
+ 8575, 8575, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205, 9205,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841,
+ 9841, 9841, 9841, 9841, 9841, 9841, 9841, 9841, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478, 10478,
+ 10478, 10478, 10478, 10478, 10478, 10478};
+ const uint32_t rleLoc[] = {
+ 0, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2,
+ 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1,
+ 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4,
+ 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3, 4, 1, 2, 3,
+ 4, 1, 2, 3, 4, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126, 127, 128, 129, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
+ 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
+ 126, 127, 128, 129, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126, 127, 128, 129, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
+ 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
+ 126, 127, 128, 129, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
+ 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+ 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13,
+ 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28,
+ 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43,
+ 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58,
+ 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73,
+ 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88,
+ 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103,
+ 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118,
+ 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5,
+ 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20,
+ 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35,
+ 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
+ 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65,
+ 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80,
+ 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95,
+ 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110,
+ 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125,
+ 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12,
+ 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27,
+ 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42,
+ 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57,
+ 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72,
+ 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87,
+ 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102,
+ 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117,
+ 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4,
+ 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
+ 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34,
+ 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
+ 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64,
+ 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79,
+ 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94,
+ 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109,
+ 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124,
+ 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11,
+ 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
+ 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41,
+ 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56,
+ 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71,
+ 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86,
+ 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101,
+ 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116,
+ 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1, 2, 3,
+ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
+ 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
+ 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48,
+ 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63,
+ 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78,
+ 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93,
+ 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108,
+ 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123,
+ 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10,
+ 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25,
+ 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
+ 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55,
+ 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70,
+ 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85,
+ 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100,
+ 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115,
+ 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1, 2,
+ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17,
+ 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32,
+ 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47,
+ 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62,
+ 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77,
+ 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92,
+ 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107,
+ 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122,
+ 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
+ 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24,
+ 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
+ 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54,
+ 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69,
+ 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
+ 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99,
+ 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114,
+ 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0, 1,
+ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16,
+ 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31,
+ 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46,
+ 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+ 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76,
+ 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91,
+ 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106,
+ 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121,
+ 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7, 8,
+ 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23,
+ 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38,
+ 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53,
+ 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68,
+ 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83,
+ 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98,
+ 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113,
+ 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 0,
+ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,
+ 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30,
+ 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45,
+ 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60,
+ 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75,
+ 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90,
+ 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105,
+ 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120,
+ 121, 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6, 7,
+ 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22,
+ 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37,
+ 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52,
+ 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
+ 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82,
+ 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97,
+ 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112,
+ 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127,
+ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
+ 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
+ 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+ 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59,
+ 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74,
+ 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89,
+ 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104,
+ 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119,
+ 120, 121, 122, 123, 124, 125, 126, 127, 0, 1, 2, 3, 4, 5, 6,
+ 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+ 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36,
+ 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51,
+ 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66,
+ 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81,
+ 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96,
+ 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111,
+ 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126,
+ 127};
+ std::list<uint64_t> positions[4096];
+ for (size_t i = 0; i < 4096; ++i) {
+ positions[i].push_back(fileLoc[i]);
+ positions[i].push_back(rleLoc[i]);
+ }
+ std::unique_ptr<RleDecoder> rle =
+ createRleDecoder(std::unique_ptr<SeekableInputStream>(stream), true,
+ RleVersion_1, *dbcommon::getDefaultPool());
+ std::vector<int64_t> data(2048);
+ rle->next(data.data(), data.size(), nullptr);
+ for (size_t i = 0; i < data.size(); ++i) {
+ if (i < 1024) {
+ EXPECT_EQ(i / 4, data[i]) << "Wrong output at " << i;
+ } else {
+ EXPECT_EQ(2 * i, data[i]) << "Wrong output at " << i;
+ }
+ }
+ rle->next(data.data(), data.size(), nullptr);
+ for (size_t i = 0; i < data.size(); ++i) {
+ EXPECT_EQ(junk[i], data[i]) << "Wrong output at " << i;
+ }
+ size_t i = 4096;
+ do {
+ --i;
+ PositionProvider location(positions[i]);
+ rle->seek(location);
+ rle->next(data.data(), 1, nullptr);
+ if (i < 1024) {
+ EXPECT_EQ(i / 4, data[0]) << "Wrong output at " << i;
+ } else if (i < 2048) {
+ EXPECT_EQ(2 * i, data[0]) << "Wrong output at " << i;
+ } else {
+ EXPECT_EQ(junk[i - 2048], data[0]) << "Wrong output at " << i;
+ }
+ } while (i != 0);
+} // NOLINT
+
+TEST(RLEv1, testLeadingNulls) {
+  // The five leading 0x00 flags are the null slots; the 0x01 slots get 1..5.
+  const char presentFlags[] = {0x00, 0x00, 0x00, 0x00, 0x00,
+                               0x01, 0x01, 0x01, 0x01, 0x01};
+  const unsigned char encoded[] = {0xfb, 0x01, 0x02, 0x03, 0x04, 0x05};
+  std::unique_ptr<RleDecoder> decoder = createRleDecoder(
+      std::unique_ptr<SeekableInputStream>(
+          new SeekableArrayInputStream(encoded, ARRAY_SIZE(encoded))),
+      false, RleVersion_1, *dbcommon::getDefaultPool());
+  std::vector<int64_t> decoded(10);
+  decoder->next(decoded.data(), 10, presentFlags);
+  for (size_t slot = 5; slot < 10; ++slot) {
+    EXPECT_EQ(slot - 4, decoded[slot]) << "Output wrong at " << slot;
+  }
+}
+
+} // namespace orc
diff --git a/depends/storage/test/unit/format/test-orc-vector.cc b/depends/storage/test/unit/format/test-orc-vector.cc
new file mode 100644
index 0000000..7c89a7c
--- /dev/null
+++ b/depends/storage/test/unit/format/test-orc-vector.cc
@@ -0,0 +1,194 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include <errno.h>
+#include <stdlib.h>
+
+#include "dbcommon/log/exception.h"
+#include "dbcommon/log/logger.h"
+#include "gtest/gtest.h"
+#include "storage/format/orc/vector.h"
+#include "storage/testutil/file-utils.h"
+
+namespace orc {
+
+TEST(TestOrcVector, IntegerVectors) {
+  // toString() must echo the size given to the constructor and to resize().
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  LongVectorBatch longBatch(100, memPool);
+  EXPECT_EQ(longBatch.toString(), "Integer vector <0 of 100>");
+  longBatch.resize(1000);
+  EXPECT_EQ(longBatch.toString(), "Integer vector <0 of 1000>");
+  // Expected metadata: width 8, LONG kind, not variable length.
+  EXPECT_EQ(longBatch.getWidth(), 8);
+  EXPECT_EQ(longBatch.getType(), ORCTypeKind::LONG);
+  EXPECT_EQ(longBatch.hasVariableLength(), false);
+}
+
+TEST(TestOrcVector, BytesVectorBatch) {
+  // toString() must echo the size given to the constructor and to resize().
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  StringVectorBatch stringBatch(100, memPool);
+  EXPECT_EQ(stringBatch.toString(), "String vector <0 of 100>");
+  stringBatch.resize(1000);
+  EXPECT_EQ(stringBatch.toString(), "String vector <0 of 1000>");
+  // Expected metadata: width 0, STRING kind, getData() aliasing `data`.
+  EXPECT_EQ(stringBatch.getWidth(), 0);
+  EXPECT_EQ(stringBatch.getType(), ORCTypeKind::STRING);
+  EXPECT_EQ(stringBatch.getData(), (char *)stringBatch.data.data());
+  EXPECT_EQ(stringBatch.hasVariableLength(), true);
+}
+
+TEST(TestOrcVector, StructVectorBatch) {
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+
+  // Struct batch with one long child; the raw pointer is handed to `fields`.
+  StructVectorBatch batch(100, memPool);
+  batch.fields.push_back(new LongVectorBatch(100, memPool));
+
+  // Width, data and buildVector accessors are expected to throw here.
+  EXPECT_THROW(batch.getWidth(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(batch.getType(), ORCTypeKind::STRUCT);
+  EXPECT_THROW(batch.getData(), dbcommon::TransactionAbortException);
+  EXPECT_THROW(batch.buildVector(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(batch.hasVariableLength(), false);
+
+  // The expected text shows the child resized together with the struct.
+  EXPECT_EQ(batch.toString(),
+            "Struct vector <0 of 100; Integer vector <0 of 100>; >");
+  batch.resize(1000);
+  EXPECT_EQ(batch.toString(),
+            "Struct vector <0 of 1000; Integer vector <0 of 1000>; >");
+}
+
+TEST(TestOrcVector, ListVectorBatch) {
+  // Width, data and buildVector accessors are expected to throw for a list.
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  ListVectorBatch listBatch(100, memPool);
+  EXPECT_THROW(listBatch.getWidth(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(listBatch.getType(), ORCTypeKind::LIST);
+  EXPECT_THROW(listBatch.getData(), dbcommon::TransactionAbortException);
+  EXPECT_THROW(listBatch.buildVector(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(listBatch.hasVariableLength(), true);
+
+  // toString() must echo the size given to the constructor and to resize().
+  EXPECT_EQ(listBatch.toString(), "List vector < with 0 of 100>");
+  listBatch.resize(1000);
+  EXPECT_EQ(listBatch.toString(), "List vector < with 0 of 1000>");
+}
+
+TEST(TestOrcVector, MapVectorBatch) {
+  // Width, data and buildVector accessors are expected to throw for a map.
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  MapVectorBatch mapBatch(100, memPool);
+  EXPECT_THROW(mapBatch.getWidth(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(mapBatch.getType(), ORCTypeKind::MAP);
+  EXPECT_THROW(mapBatch.getData(), dbcommon::TransactionAbortException);
+  EXPECT_THROW(mapBatch.buildVector(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(mapBatch.hasVariableLength(), true);
+
+  // toString() must echo the size given to the constructor and to resize().
+  EXPECT_EQ(mapBatch.toString(), "Map vector <, with 0 of 100>");
+  mapBatch.resize(1000);
+  EXPECT_EQ(mapBatch.toString(), "Map vector <, with 0 of 1000>");
+}
+
+TEST(TestOrcVector, UnionVectorBatch) {
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  // Union batch with one long child; the raw pointer goes into `children`.
+  UnionVectorBatch unionBatch(100, memPool);
+  unionBatch.children.push_back(new LongVectorBatch(100, memPool));
+
+  EXPECT_THROW(unionBatch.getWidth(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(unionBatch.getType(), ORCTypeKind::UNION);
+  EXPECT_THROW(unionBatch.getData(), dbcommon::TransactionAbortException);
+  EXPECT_THROW(unionBatch.buildVector(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(unionBatch.hasVariableLength(), false);
+
+  // The expected text keeps the child at 100 after the union grows to 1000.
+  EXPECT_EQ(unionBatch.toString(),
+            "Union vector <Integer vector <0 of 100>; with 0 of 100>");
+  unionBatch.resize(1000);
+  EXPECT_EQ(unionBatch.toString(),
+            "Union vector <Integer vector <0 of 100>; with 0 of 1000>");
+}
+
+TEST(TestOrcVector, Decimal64VectorBatch) {
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  Decimal64VectorBatch dec64(100, memPool);
+
+  // getData()/getAuxiliaryData() must alias `values` and `highbitValues`.
+  EXPECT_EQ(dec64.getWidth(), 24);
+  EXPECT_EQ(dec64.getType(), ORCTypeKind::DECIMAL);
+  EXPECT_EQ(dec64.getData(), (char *)(dec64.values.data()));
+  EXPECT_EQ(dec64.getAuxiliaryData(), (char *)(dec64.highbitValues.data()));
+  // NOTE: the buildVector() EXPECT_THROW check was commented out originally.
+  EXPECT_EQ(dec64.hasVariableLength(), false);
+
+  EXPECT_EQ(dec64.toString(), "Decimal64 vector with 0 of 100>");
+  dec64.resize(1000);
+  EXPECT_EQ(dec64.toString(), "Decimal64 vector with 0 of 1000>");
+}
+
+TEST(TestOrcVector, Decimal128VectorBatch) {
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  Decimal128VectorBatch dec128(100, memPool);
+
+  // getData()/getAuxiliaryData() must alias `lowbitValues`/`highbitValues`.
+  EXPECT_EQ(dec128.getWidth(), 24);
+  EXPECT_EQ(dec128.getType(), ORCTypeKind::DECIMAL);
+  EXPECT_EQ(dec128.getData(), (char *)(dec128.lowbitValues.data()));
+  EXPECT_EQ(dec128.getAuxiliaryData(), (char *)(dec128.highbitValues.data()));
+  // NOTE: the buildVector() EXPECT_THROW check was commented out originally.
+  EXPECT_EQ(dec128.hasVariableLength(), false);
+
+  EXPECT_EQ(dec128.toString(), "Decimal128 vector with 0 of 100>");
+  dec128.resize(1000);
+  EXPECT_EQ(dec128.toString(), "Decimal128 vector with 0 of 1000>");
+}
+
+TEST(TestOrcVector, DateVectors) {
+  // A date batch is expected to print with the "Integer vector" label.
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  DateVectorBatch dateBatch(100, memPool);
+  EXPECT_EQ(dateBatch.toString(), "Integer vector <0 of 100>");
+  dateBatch.resize(1000);
+  EXPECT_EQ(dateBatch.toString(), "Integer vector <0 of 1000>");
+  // Expected metadata: width 4, DATE kind, not variable length.
+  EXPECT_EQ(dateBatch.getWidth(), 4);
+  EXPECT_EQ(dateBatch.getType(), ORCTypeKind::DATE);
+  EXPECT_EQ(dateBatch.hasVariableLength(), false);
+}
+
+TEST(TestOrcVector, DISABLED_TimestampVectorBatch) {
+  // Carries gtest's DISABLED_ prefix, so it is skipped in a default run.
+  dbcommon::MemoryPool &memPool = *dbcommon::getDefaultPool();
+  TimestampVectorBatch ts(100, memPool);
+  EXPECT_THROW(ts.getWidth(), dbcommon::TransactionAbortException);
+  EXPECT_EQ(ts.getType(), ORCTypeKind::TIMESTAMP);
+  EXPECT_THROW(ts.getData(), dbcommon::TransactionAbortException);
+  EXPECT_THROW(ts.buildVector(), dbcommon::TransactionAbortException);
+  EXPECT_THROW(ts.hasVariableLength(), dbcommon::TransactionAbortException);
+
+  // toString() must echo the size given to the constructor and to resize().
+  EXPECT_EQ(ts.toString(), "Timestamp vector <0 of 100>");
+  ts.resize(1000);
+  EXPECT_EQ(ts.toString(), "Timestamp vector <0 of 1000>");
+}
+} // namespace orc
diff --git a/depends/storage/test/unit/format/test-string-dictionary.cc b/depends/storage/test/unit/format/test-string-dictionary.cc
new file mode 100644
index 0000000..e51d8e2
--- /dev/null
+++ b/depends/storage/test/unit/format/test-string-dictionary.cc
@@ -0,0 +1,74 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "gtest/gtest.h"
+#include "storage/format/orc/string-dictionary.h"
+
+namespace orc {
+
+// Fixture for the StringDictionary tests.
+class TestStringDictionary : public ::testing::Test {
+ public:
+  TestStringDictionary() {}
+  virtual ~TestStringDictionary() {}
+
+  // Builds a std::string from the `len` bytes that start at `val`.
+  std::string convertToString(const char *val, uint64_t len) {
+    return std::string(val, len);
+  }
+};
+
+// Exercises StringDictionary::add(), dump(), size() and clear().
+TEST_F(TestStringDictionary, FullTest) {
+  StringDictionary dict;
+  std::string s1 = "a2";
+  std::string s2 = "a1";
+  std::string s3 = "b1";
+  std::string s4 = "b2";
+  // add: each string is passed with its own length; repeats reuse their id
+  EXPECT_EQ(dict.add(s1.data(), s1.size()), 0);
+  EXPECT_EQ(dict.add(s2.data(), s2.size()), 1);
+  EXPECT_EQ(dict.add(s3.data(), s3.size()), 2);
+  EXPECT_EQ(dict.add(s4.data(), s4.size()), 3);
+  EXPECT_EQ(dict.add(s1.data(), s1.size()), 0);
+  EXPECT_EQ(dict.add(s2.data(), s2.size()), 1);
+  EXPECT_EQ(dict.add(s3.data(), s3.size()), 2);
+  EXPECT_EQ(dict.add(s4.data(), s4.size()), 3);
+
+  // dump: values are expected back in sorted order, not insertion order
+  std::vector<const char *> vals;
+  std::vector<uint64_t> lens;
+  std::vector<uint32_t> dumpOrder;
+  dict.dump(&vals, &lens, &dumpOrder);
+  EXPECT_STREQ(convertToString(vals[0], lens[0]).c_str(), "a1");
+  EXPECT_STREQ(convertToString(vals[1], lens[1]).c_str(), "a2");
+  EXPECT_STREQ(convertToString(vals[2], lens[2]).c_str(), "b1");
+  EXPECT_STREQ(convertToString(vals[3], lens[3]).c_str(), "b2");
+  EXPECT_EQ(dumpOrder[0], 1);
+  EXPECT_EQ(dumpOrder[1], 0);
+  EXPECT_EQ(dumpOrder[2], 2);
+  EXPECT_EQ(dumpOrder[3], 3);
+
+  // size: four distinct entries until clear() empties the dictionary
+  EXPECT_EQ(dict.size(), 4);
+  dict.clear();
+  EXPECT_EQ(dict.size(), 0);
+}
+
+} // namespace orc
diff --git a/depends/storage/test/unit/unit-test-main.cc b/depends/storage/test/unit/unit-test-main.cc
new file mode 100644
index 0000000..970802f
--- /dev/null
+++ b/depends/storage/test/unit/unit-test-main.cc
@@ -0,0 +1,32 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ */
+
+#include "gtest/gtest.h"
+
+#include "dbcommon/log/logger.h"
+
+// Runs all registered tests, from TEST_ROOT_DIRECTORY when it is defined.
+int main(int argc, char** argv) {
+  ::testing::InitGoogleTest(&argc, argv);
+#ifdef TEST_ROOT_DIRECTORY
+  // Recorded outside any test, so RUN_ALL_TESTS() reports it and returns 1.
+  if (chdir(TEST_ROOT_DIRECTORY) != 0) ADD_FAILURE() << "chdir failed";
+#endif
+  return RUN_ALL_TESTS();
+}