blob: 4e72af830c9fdea54f1f334d84041297a755bd7b [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include "exprs/geo/common.h"
namespace impala::geo {
// This file is responsible for handling the header of the "esri shape" format used for
// geometries encoded as BINARY. This format is fully compatible with Java framework
// https://github.com/Esri/spatial-framework-for-hadoop
// A 5 byte "OGC" header followed by the same format as the one used in shape files:
// https://www.esri.com/content/dam/esrisites/sitecore-archive/Files/Pdfs/library/whitepapers/pdfs/shapefile.pdf
//
// The OGC header contains:
// - 4 byte big endian SRID (reference system id)
// - 1 byte type id (OGCType)
// - no padding
// see https://github.com/Esri/spatial-framework-for-hadoop/blob/v2.2.0/hive/src/main/java/com/esri/hadoop/hive/GeometryUtils.java#L16
//
// The header of the shape file format ("ESRI header") contains:
// - 4 byte type id
// for POINT type store the coordinates:
// - dimension * 8 byte to store x/y/z/m as doubles
// for other types store the bounding box:
// - min coordinates: 2 * 8 byte to store x/y as doubles
// - max coordinates: 2 * 8 byte to store x/y as doubles
// - min/max z/m are stored later in the headers - this file doesn't access those
// - no padding
//
// For some types this is followed by a variable length part, which is not handled here.
// A POC example for handling a full type:
// https://gerrit.cloudera.org/#/c/20602/6/be/src/exprs/geo/poly-line-shape-format.cc
//
// Currently only 2 dimensions are handled (xy), min/max for z/m has to be accessed
// with Java functions. The xy bounding box has the same offset and format in
// xyz/xym/xyzm geometries so x/y accessors work in this case too.
//
// Functions are defined in the header to allow inlining bounding box check in codegen.
// Layout of the OGC header: the SRID comes first and is immediately followed by the
// one byte OGC type id.
constexpr int SRID_OFFSET = 0;
constexpr int SRID_SIZE = 4;
constexpr int OGC_TYPE_OFFSET = SRID_OFFSET + SRID_SIZE;
constexpr int OGC_TYPE_SIZE = 1;
static_assert(OGC_TYPE_OFFSET == 4);

// Layout of the ESRI header: the type id starts right after the OGC header and is
// followed (without padding) by the x/y doubles.
constexpr int ESRI_TYPE_OFFSET = OGC_TYPE_OFFSET + OGC_TYPE_SIZE;
constexpr int ESRI_TYPE_SIZE = 4;
static_assert(ESRI_TYPE_OFFSET == 5);

// X1/Y1 are the coordinates of a point or the minimum corner of the bounding box,
// X2/Y2 are the maximum corner of the bounding box (non-point types only).
constexpr int X1_OFFSET = ESRI_TYPE_OFFSET + ESRI_TYPE_SIZE;
constexpr int Y1_OFFSET = X1_OFFSET + sizeof(double);
constexpr int X2_OFFSET = Y1_OFFSET + sizeof(double);
constexpr int Y2_OFFSET = X2_OFFSET + sizeof(double);
static_assert(X1_OFFSET == 9);

// Smallest valid buffer sizes: headers only, headers + one x/y pair, and
// headers + two x/y pairs.
constexpr int MIN_GEOM_SIZE = SRID_SIZE + OGC_TYPE_SIZE + ESRI_TYPE_SIZE;
constexpr int MIN_POINT_SIZE = MIN_GEOM_SIZE + 2 * sizeof(double);
constexpr int MIN_NON_POINT_SIZE = MIN_POINT_SIZE + 2 * sizeof(double);
static_assert(MIN_GEOM_SIZE == 9);
static_assert(MIN_POINT_SIZE == 25);
static_assert(MIN_NON_POINT_SIZE == 41);
// Shape type ids stored in the ESRI header. For the origin of the values see
// https://github.com/Esri/geometry-api-java/blob/v2.2.4/src/main/java/com/esri/core/geometry/ShapeType.java#L27
// Enumerators are grouped by geometry family; every value is spelled out explicitly,
// so the declaration order carries no meaning.
enum EsriType : uint32_t {
  ShapeNull = 0,

  // Point variants.
  ShapePoint = 1,
  ShapePointZ = 9,
  ShapePointZM = 11,
  ShapePointM = 21,

  // Multi point variants.
  ShapeMultiPoint = 8,
  ShapeMultiPointZM = 18,
  ShapeMultiPointZ = 20,
  ShapeMultiPointM = 28,

  // Polyline variants.
  ShapePolyline = 3,
  ShapePolylineZ = 10,
  ShapePolylineZM = 13,
  ShapePolylineM = 23,

  // Polygon variants.
  ShapePolygon = 5,
  ShapePolygonZM = 15,
  ShapePolygonZ = 19,
  ShapePolygonM = 25,

  // Multi patch variants.
  ShapeMultiPatchM = 31,
  ShapeMultiPatch = 32,

  // "General" variants.
  ShapeGeneralPolyline = 50,
  ShapeGeneralPolygon = 51,
  ShapeGeneralPoint = 52,
  ShapeGeneralMultiPoint = 53,
  ShapeGeneralMultiPatch = 54,

  ShapeTypeLast = 55
};
// Lookup table indexed by OGCType that gives the 2D (xy) EsriType associated with it.
// The multi line string and multi polygon OGC types share the ESRI type of their
// single counterparts.
constexpr std::array<EsriType, ST_MULTIPOLYGON + 1> OGCTypeToEsriType = {{
  /* UNKNOWN            */ ShapeNull,
  /* ST_POINT           */ ShapePoint,
  /* ST_LINESTRING      */ ShapePolyline,
  /* ST_POLYGON         */ ShapePolygon,
  /* ST_MULTIPOINT      */ ShapeMultiPoint,
  /* ST_MULTILINESTRING */ ShapePolyline,
  /* ST_MULTIPOLYGON    */ ShapePolygon
}};
// Reads a value of type T from the buffer of 'geom', starting at byte 'offset'.
// The bytes are copied out with memcpy instead of dereferencing a casted pointer:
// fields are not necessarily aligned for T (e.g. the doubles start at X1_OFFSET == 9),
// and accessing the byte buffer through a T* would also violate strict aliasing rules.
// The caller is responsible for ensuring that 'geom' holds at least
// 'offset' + sizeof(T) bytes; this is only verified in debug builds.
template <class T>
T readFromGeom(const StringVal& geom, int offset) {
  DCHECK_GE(offset, 0);
  DCHECK_GE(geom.len, offset + static_cast<int>(sizeof(T)));
  T val;
  std::memcpy(&val, geom.ptr + offset, sizeof(T));
  return val;
}
// Writes 'val' into the buffer of 'geom', starting at byte 'offset'.
// The bytes are copied in with memcpy instead of assigning through a casted pointer:
// fields are not necessarily aligned for T (e.g. the doubles start at X1_OFFSET == 9),
// and accessing the byte buffer through a T* would also violate strict aliasing rules.
// The caller is responsible for ensuring that 'geom' holds at least
// 'offset' + sizeof(T) bytes; this is only verified in debug builds.
template <class T>
void writeToGeom(const T& val, StringVal& geom, int offset) {
  DCHECK_GE(offset, 0);
  DCHECK_GE(geom.len, offset + static_cast<int>(sizeof(T)));
  std::memcpy(geom.ptr + offset, &val, sizeof(T));
}
// getters/setters for OGC header:
// Returns the SRID stored in the OGC header of 'geom' in host byte order.
inline uint32_t getSrid(const StringVal& geom) {
  // The SRID is stored big endian while Impala only runs on little endian hosts, so
  // the bytes always have to be swapped after reading them.
#ifndef IS_LITTLE_ENDIAN
  static_assert(false, "Only the little endian byte order is supported.");
#endif
  static_assert(sizeof(uint32_t) == SRID_SIZE);
  return BitUtil::ByteSwap(readFromGeom<uint32_t>(geom, SRID_OFFSET));
}
// Returns the type id byte of the OGC header of 'geom' converted to OGCType. The
// value is not validated here, see ParseHeader() for the range check.
inline OGCType getOGCType(const StringVal& geom) {
  static_assert(sizeof(char) == OGC_TYPE_SIZE);
  return static_cast<OGCType>(readFromGeom<char>(geom, OGC_TYPE_OFFSET));
}
// Returns the entry of OGCTypeToStr that belongs to 'ogc_type'. 'ogc_type' is used as
// an index without any range check, so it has to be a valid index of that table.
inline constexpr const char* getGeometryType(OGCType ogc_type) {
  const char* const type_name = OGCTypeToStr[ogc_type];
  return type_name;
}
// Stores 'srid' in the OGC header of 'geom'.
inline void setSrid(StringVal& geom, uint32_t srid) {
  // The SRID is stored big endian while Impala only runs on little endian hosts, so
  // the bytes always have to be swapped before writing them.
#ifndef IS_LITTLE_ENDIAN
  static_assert(false, "Only the little endian byte order is supported.");
#endif
  static_assert(sizeof(uint32_t) == SRID_SIZE);
  writeToGeom<uint32_t>(BitUtil::ByteSwap(srid), geom, SRID_OFFSET);
}
// Stores 'ogc_type' as the single type id byte of the OGC header of 'geom'.
inline void setOGCType(StringVal& geom, OGCType ogc_type) {
  const char type_byte = static_cast<char>(ogc_type);
  writeToGeom<char>(type_byte, geom, OGC_TYPE_OFFSET);
}
// getters/setters for ESRI header:
// Returns the 4 byte type id of the ESRI header of 'geom'. The value is returned as
// it is stored, without checking that it is a known EsriType.
inline EsriType getEsriType(const StringVal& geom) {
  static_assert(sizeof(EsriType) == ESRI_TYPE_SIZE);
  const EsriType esri_type = readFromGeom<EsriType>(geom, ESRI_TYPE_OFFSET);
  return esri_type;
}
// Returns the double stored at X1_OFFSET: the x coordinate of a point or the minimum x
// of the bounding box of other geometries.
inline double getMinX(const StringVal& geom) {
  const double min_x = readFromGeom<double>(geom, X1_OFFSET);
  return min_x;
}
// Returns the double stored at Y1_OFFSET: the y coordinate of a point or the minimum y
// of the bounding box of other geometries.
inline double getMinY(const StringVal& geom) {
  const double min_y = readFromGeom<double>(geom, Y1_OFFSET);
  return min_y;
}
// Returns the double stored at X2_OFFSET: the maximum x of the bounding box. Only
// non-point geometries store a bounding box, a minimal point buffer ends before this
// offset.
inline double getMaxX(const StringVal& geom) {
  const double max_x = readFromGeom<double>(geom, X2_OFFSET);
  return max_x;
}
// Returns the double stored at Y2_OFFSET: the maximum y of the bounding box. Only
// non-point geometries store a bounding box, a minimal point buffer ends before this
// offset.
inline double getMaxY(const StringVal& geom) {
  const double max_y = readFromGeom<double>(geom, Y2_OFFSET);
  return max_y;
}
// Stores 'esri_type' as the 4 byte type id of the ESRI header of 'geom'.
inline void setEsriType(StringVal& geom, EsriType esri_type) {
  static_assert(sizeof(EsriType) == ESRI_TYPE_SIZE);
  writeToGeom(esri_type, geom, ESRI_TYPE_OFFSET);
}
// Stores 'x' at X1_OFFSET: the x coordinate of a point or the minimum x of the
// bounding box of other geometries.
inline void setMinX(StringVal& geom, double x) {
  writeToGeom(x, geom, X1_OFFSET);
}
// Stores 'y' at Y1_OFFSET: the y coordinate of a point or the minimum y of the
// bounding box of other geometries.
inline void setMinY(StringVal& geom, double y) {
  writeToGeom(y, geom, Y1_OFFSET);
}
// Stores 'x' at X2_OFFSET: the maximum x of the bounding box of a non-point geometry.
inline void setMaxX(StringVal& geom, double x) {
  writeToGeom(x, geom, X2_OFFSET);
}
// Stores 'y' at Y2_OFFSET: the maximum y of the bounding box of a non-point geometry.
inline void setMaxY(StringVal& geom, double y) {
  writeToGeom(y, geom, Y2_OFFSET);
}
// Validates the header of 'geom' and returns its OGC type in '*ogc_type'.
// Returns true and sets '*ogc_type' if the header is valid. Returns false and leaves
// '*ogc_type' untouched if:
// - 'geom' is NULL (no warning is added in this case)
// - 'geom' is shorter than the OGC header + ESRI type id (MIN_GEOM_SIZE)
// - the OGC type id is out of range or UNKNOWN
// - 'geom' is too short to hold the coordinates of a point (MIN_POINT_SIZE) or the
//   bounding box of a non-point geometry (MIN_NON_POINT_SIZE)
// In all cases but the first one a warning is added to 'ctx'.
inline bool ParseHeader(FunctionContext* ctx, const StringVal& geom, OGCType* ogc_type) {
  DCHECK(ogc_type != nullptr);
  if (UNLIKELY(geom.is_null)) return false;
  if (UNLIKELY(geom.len < MIN_GEOM_SIZE)) {
    ctx->AddWarning("Geometry size too small.");
    return false;
  }
  const OGCType unchecked_ogc_type = getOGCType(geom);
  if (UNLIKELY(unchecked_ogc_type < UNKNOWN || unchecked_ogc_type > ST_MULTIPOLYGON)) {
    ctx->AddWarning("Invalid geometry type.");
    return false;
  }
  if (UNLIKELY(unchecked_ogc_type == UNKNOWN)) {
    ctx->AddWarning("Geometry type UNKNOWN.");
    return false;
  }
  // Which geometry type is stored depends on the data, so no branch hint is given for
  // the type itself. Only the error paths (buffer too small) are marked as unlikely,
  // consistently with the checks above.
  if (unchecked_ogc_type == ST_POINT) {
    if (UNLIKELY(geom.len < MIN_POINT_SIZE)) {
      ctx->AddWarning("Geometry size too small for ST_POINT type.");
      return false;
    }
  } else {
    if (UNLIKELY(geom.len < MIN_NON_POINT_SIZE)) {
      ctx->AddWarning("Geometry size too small for non ST_POINT type.");
      return false;
    }
  }
  // TODO: fix Z/M/ZM types and move to a function called from DCHECK
  // ogc vs ESRI type checking can be useful during development, but it
  // is unnecessary overhead in production
  /*const EsriType esri_type = getEsriType(geom);
  DCHECK_LT(unchecked_ogc_type, OGCTypeToEsriType.size());
  const EsriType expected_esri_type = OGCTypeToEsriType[unchecked_ogc_type];
  if (expected_esri_type != esri_type) {
    // TODO: To test it we need to create a table with 3D types, we cannot create them
    // with native constructors.
    ctx->SetError(strings::Substitute(
        "Invalid geometry: OGCType and EsriType do not match. "
        "Because the OGCType is $0, expected EsriType $1, found $2.",
        OGCTypeToStr[unchecked_ogc_type], expected_esri_type, esri_type).c_str());
  }*/
  *ogc_type = unchecked_ogc_type;
  return true;
}
// Bounding box check for x/y coordinates of two geometries. z/m are ignored, which
// is consistent with the original Java functions.
// 'lhs_type' / 'rhs_type' are the OGC types of 'lhs_geom' / 'rhs_geom'. A point stores
// a single x/y pair instead of a bounding box, so for ST_POINT that pair is used both
// as the minimum and the maximum corner and the max offsets are never read.
// Returns false only if the boxes are provably disjoint. Every comparison involving a
// NaN coordinate evaluates to false, so boxes with NaN coordinates are reported as
// intersecting.
// The buffer sizes are not validated here (apart from the DCHECKs of the accessors).
// NOTE(review): presumably callers validate both geometries with ParseHeader() first -
// confirm against the call sites.
inline bool bBoxIntersects(const StringVal& lhs_geom, const StringVal& rhs_geom,
    OGCType lhs_type, OGCType rhs_type) {
  const bool is_lhs_point = lhs_type == ST_POINT;
  const double xmin1 = getMinX(lhs_geom);
  const double ymin1 = getMinY(lhs_geom);
  const double xmax1 = is_lhs_point ? xmin1 : getMaxX(lhs_geom);
  const double ymax1 = is_lhs_point ? ymin1 : getMaxY(lhs_geom);

  const bool is_rhs_point = rhs_type == ST_POINT;
  const double xmin2 = getMinX(rhs_geom);
  const double ymin2 = getMinY(rhs_geom);
  const double xmax2 = is_rhs_point ? xmin2 : getMaxX(rhs_geom);
  const double ymax2 = is_rhs_point ? ymin2 : getMaxY(rhs_geom);

  // Keep the negated "disjoint" form: rewriting it with >= would change the result
  // for NaN coordinates.
  const bool disjoint =
      xmax1 < xmin2 || xmax2 < xmin1 || ymax1 < ymin2 || ymax2 < ymin1;
  return !disjoint;
}
// Creates a 2D ST_POINT geometry with coordinates 'x' / 'y' and reference system id
// 'srid'. The result buffer has exactly MIN_POINT_SIZE bytes (OGC header, ESRI type
// id and the two coordinates) and is allocated through 'ctx'.
// Returns a NULL StringVal if the allocation of the buffer failed.
inline StringVal createStPoint(FunctionContext* ctx, double x, double y,
    uint32_t srid = 0) {
  StringVal res(ctx, MIN_POINT_SIZE);
  // Per the Impala UDF API the allocating StringVal constructor produces a NULL value
  // (and sets an error on 'ctx') when the allocation fails. There is no buffer to
  // write into in that case, so return the NULL value as is.
  if (UNLIKELY(res.is_null)) return res;
  setSrid(res, srid);
  setOGCType(res, ST_POINT);
  setEsriType(res, ShapePoint);
  setMinX(res, x);
  setMinY(res, y);
  return res;
}
} // namespace impala::geo