blob: aa2c4a124d8f0e2891a8cee4ff0950c2c8ce6981 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#pragma once
#include <cstdint>
#include <cstring>
#include <string>
#include "common/logging.h"
#include "util/ubsan.h"
namespace impala {
/// String implementation that can apply the Small String Optimization on an
/// on-demand basis: every SmallableString object starts with the representation
/// that uses an 8-byte pointer and a 4-byte length. But if users invoke the
/// Smallify() method on this object and the length is at most 'SMALL_LIMIT' then
/// the representation switches to the small string layout, i.e. a SMALL_LIMIT-size
/// buffer and a single byte to store the (small) length. An indicator bit is set
/// in the last byte so the methods of this class can know which representation is
/// being used. On little-endian architectures this is the most significant bit (MSB) of
/// both LONG_STRING.len and SMALL_STRING.len.
/// TODO: On big endian architectures this will be the least significant bit (LSB) of
/// both LONG_STRING.len and SMALL_STRING.len.
/// We can have the indicator bit in 'len' because StringValues are limited to 1GB
/// character data.
class __attribute__((__packed__)) SmallableString {
public:
static_assert(__BYTE_ORDER == __LITTLE_ENDIAN,
"Current implementation of SmallableString assumes little-endianness");
static constexpr int SMALL_LIMIT = 11;
static constexpr unsigned char MSB_CHAR = 0b10000000;
static constexpr unsigned char SMALLSTRING_MASK = 0b01111111;
struct SimpleString {
char* ptr;
uint32_t len;
};
SmallableString() {
rep.long_rep.ptr = nullptr;
rep.long_rep.len = 0;
}
/// Constructs SmallableString based on 'other'. If 'other' uses the long
/// representation, then so will the newly created SmallableString, and it will point
/// to the same data. If 'other' uses the small representation, then newly created
/// object will also be small.
SmallableString(const SmallableString& other) {
memcpy(this, &other, sizeof(*this));
}
/// Creates smallable string from 'ptr' and 'len'. It will use the long representation.
SmallableString(char* ptr, int len) {
// Invalid string values might have negative lengths. We also have backend tests
// for this.
if (UNLIKELY(len < 0)) len = 0;
rep.long_rep.ptr = ptr;
rep.long_rep.len = len;
DCHECK(!IsSmall());
}
/// Construct a SmallableString from 's'. 's' must be valid for as long as
/// this object is valid.
explicit SmallableString(const std::string& s) :
SmallableString(const_cast<char*>(s.c_str()), s.size()) {}
/// Constructs a SmallableString from 's'. 's' must be valid for as long as
/// this object is valid.
/// s must be a null-terminated string. This constructor is to prevent
/// accidental use of the version taking an std::string.
explicit SmallableString(const char* s) :
SmallableString(const_cast<char*>(s), strlen(s)) {}
SmallableString& operator=(const SmallableString& other) {
Assign(other);
return *this;
}
void Assign(const SmallableString& other) {
memcpy(this, &other, sizeof(*this));
}
/// Assigns 'ptr' and 'len' to this string's long representation. Negative 'len'
/// is overwritten with 0.
void Assign(char* ptr, int len) {
// Invalid string values might have negative lengths. We also have backend tests
// for this.
if (UNLIKELY(len < 0)) len = 0;
rep.long_rep.ptr = ptr;
rep.long_rep.len = len;
DCHECK(!IsSmall());
}
/// Assigns 'ptr' and 'len' to this string's long representation without any checks.
void UnsafeAssign(char* ptr, int len) {
rep.long_rep.ptr = ptr;
rep.long_rep.len = len;
}
void Clear() { memset(this, 0, sizeof(*this)); }
bool IsSmall() const {
char last_char = reinterpret_cast<const char*>(this)[sizeof(*this) - 1];
return last_char & MSB_CHAR;
}
bool Smallify() {
if (IsSmall()) return true;
if (rep.long_rep.len > SMALL_LIMIT) return false;
char* s = rep.long_rep.ptr;
uint32_t len = rep.long_rep.len;
// Let's zero out the object so compression algorithms will be more efficient
// on small strings (there will be no garbage between string data and len).
memset(this, 0, sizeof(*this));
Ubsan::MemCpy(rep.small_rep.buf, s, len);
SetSmallLen(len);
return true;
}
int Len() const {
if (IsSmall()) {
return GetSmallLen();
} else {
return rep.long_rep.len;
}
}
/// Returns the number of bytes needed outside the slot itself:
/// - if the length is too long to smallify, return length
/// - if the length is small enough to smallify:
/// - if assume_smallify is true or the string is already smallified return 0
/// - otherwise (not already smallified and assume_smallify is false) return length
int ExternalLen(bool assume_smallify) const {
if (IsSmall() || (assume_smallify && rep.long_rep.len <= SMALL_LIMIT)) return 0;
return rep.long_rep.len;
}
char* Ptr() const {
if (IsSmall()) {
return const_cast<char*>(rep.small_rep.buf);
} else {
return rep.long_rep.ptr;
}
}
/// Sets a new length for this object. To keep it safe, the length can only be
/// decreased.
void SetLen(int len) {
DCHECK_LE(len, Len());
if (IsSmall()) {
SetSmallLen(len);
} else {
rep.long_rep.len = len;
DCHECK(!IsSmall());
}
}
void SetPtr(char* ptr) {
DCHECK(!IsSmall());
rep.long_rep.ptr = ptr;
}
SimpleString ToSimpleString() const {
SimpleString ret;
if (IsSmall()) {
ret.ptr = const_cast<char*>(rep.small_rep.buf);
ret.len = GetSmallLen();
} else {
ret.ptr = rep.long_rep.ptr;
ret.len = rep.long_rep.len;
}
return ret;
}
private:
int GetSmallLen() const {
return rep.small_rep.len & SMALLSTRING_MASK;
}
void SetSmallLen(int len) {
rep.small_rep.len = len;
rep.small_rep.len |= MSB_CHAR;
}
struct SmallStringRep {
char buf[SMALL_LIMIT];
unsigned char len;
};
struct __attribute__((__packed__)) LongStringRep {
/// TODO: change ptr to an offset relative to a contiguous memory block,
/// so that we can send row batches between nodes without having to swizzle
/// pointers
char* ptr;
uint32_t len;
};
static_assert(sizeof(SmallStringRep) == sizeof(LongStringRep),
"sizeof(SmallStringRep) must be equal to sizeof(LongStringRep)");
union {
SmallStringRep small_rep;
LongStringRep long_rep;
} rep;
};
}