blob: aa48a692d6718a157c26387a988a3991937084b1 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#ifndef IMPALA_RUNTIME_STRING_VALUE_H
#define IMPALA_RUNTIME_STRING_VALUE_H
#include <string.h>
#include <string>
#include "common/logging.h"
#include "udf/udf.h"
#include "util/hash-util.h"
#include "runtime/types.h"
#include "runtime/smallable-string.h"
namespace impala {
/// Forward declarations only; both types are named as friends of StringValue.
class StringValueTest;
class Tuple;
/// In-memory format of a string-typed slot.
/// Whenever a member function returns a StringValue, the returned object shares its
/// buffer with the StringValue it was derived from (the parent).
/// Storage is delegated to a SmallableString, which adds on-demand Small String
/// Optimization.
/// TODO: rename this to be less confusing with impala_udf::StringVal.
class __attribute__((__packed__)) StringValue {
 public:
  /// A single string instance is currently limited to 1GB of character data.
  /// See IMPALA-1619 for more details.
  static constexpr int MAX_LENGTH = (1 << 30);

  using SimpleString = SmallableString::SimpleString;

  StringValue() {}

  StringValue(const StringValue& other) : string_impl_(other.string_impl_) {}

  /// Builds the value from a pointer/length pair, which is forwarded to the
  /// SmallableString constructor. DCHECKs that 'len' lies within [0, MAX_LENGTH].
  StringValue(char* ptr, int len) : string_impl_(ptr, len) {
    DCHECK_GE(len, 0);
    DCHECK_LE(len, MAX_LENGTH);
  }

  /// Construct a StringValue from 's'. Unless 's' is short, 's' has to stay valid for
  /// as long as this object is valid.
  explicit StringValue(const std::string& s) : string_impl_(s) {}

  /// Construct a StringValue from the null-terminated string 's'. 's' has to stay
  /// valid for as long as this object is valid.
  /// This overload exists to prevent accidental use of the constructor taking an
  /// std::string.
  explicit StringValue(const char* s) : string_impl_(s) {}

  /// Returns a smallified StringValue made from 'source'. Only valid to call if the
  /// length of 'source' is small enough (at most SmallableString::SMALL_LIMIT).
  static StringValue MakeSmallStringFrom(const StringValue& source) {
    DCHECK_LE(source.Len(), SmallableString::SMALL_LIMIT);
    StringValue small_copy(source);
    small_copy.Smallify();
    DCHECK(small_copy.IsSmall());
    return small_copy;
  }

  /// Forwards to SmallableString::Assign() with the implementation object of 'other'.
  void Assign(const StringValue& other) {
    string_impl_.Assign(other.string_impl_);
  }

  /// Forwards to SmallableString::Assign(ptr, len).
  void Assign(char* ptr, int len) {
    string_impl_.Assign(ptr, len);
  }

  /// Forwards to SmallableString::UnsafeAssign(ptr, len).
  void UnsafeAssign(char* ptr, int len) {
    string_impl_.UnsafeAssign(ptr, len);
  }

  /// Forwards to SmallableString::Clear().
  void Clear() {
    string_impl_.Clear();
  }

  /// Forwards to SmallableString::IsSmall().
  bool IsSmall() const {
    return string_impl_.IsSmall();
  }

  /// Forwards to SmallableString::Len().
  int Len() const {
    return string_impl_.Len();
  }

  /// Returns how many bytes are needed outside of the slot itself:
  /// - the length, if the string is too long to be smallified;
  /// - otherwise (the string is short enough to be smallified):
  ///   - 0, if 'assume_smallify' is true or the string is already smallified;
  ///   - the length, if the string is not smallified yet and 'assume_smallify' is
  ///     false.
  int ExternalLen(bool assume_smallify) const {
    return string_impl_.ExternalLen(assume_smallify);
  }

  /// Sets the length of this String object. Length can only be decreased.
  void SetLen(int len) {
    string_impl_.SetLen(len);
  }

  /// Forwards to SmallableString::Ptr().
  char* Ptr() const {
    return string_impl_.Ptr();
  }

  /// We can only call this if the string is not smallified.
  void SetPtr(char* ptr) {
    string_impl_.SetPtr(ptr);
  }

  // START IR FUNCTIONS
  int IrLen() const;
  char* IrPtr() const;
  void IrSetLen(int len);
  void IrAssign(char* ptr, int len);
  void IrUnsafeAssign(char* ptr, int len);
  void IrClear();
  // END IR FUNCTIONS

  /// Forwards to SmallableString::ToSimpleString().
  SimpleString ToSimpleString() const {
    return string_impl_.ToSimpleString();
  }

  /// Byte-by-byte comparison. The result is:
  ///   -1 if this < other
  ///    0 if this == other
  ///    1 if this > other
  inline int Compare(const StringValue& other) const;

  /// Equality (==).
  inline bool Eq(const StringValue& other) const;
  inline bool operator==(const StringValue& other) const;

  /// Inequality (!=).
  inline bool Ne(const StringValue& other) const;
  inline bool operator!=(const StringValue& other) const;

  /// Less than or equal (<=).
  inline bool Le(const StringValue& other) const;
  inline bool operator<=(const StringValue& other) const;

  /// Greater than or equal (>=).
  inline bool Ge(const StringValue& other) const;
  inline bool operator>=(const StringValue& other) const;

  /// Less than (<).
  inline bool Lt(const StringValue& other) const;
  inline bool operator<(const StringValue& other) const;

  /// Greater than (>).
  inline bool Gt(const StringValue& other) const;
  inline bool operator>(const StringValue& other) const;

  std::string DebugString() const;

  /// Returns the substring that starts at 'start_pos' and runs to the end of the
  /// string.
  inline StringValue Substring(int start_pos) const;

  /// Returns the substring that starts at 'start_pos' and has the given length.
  /// A negative 'new_len' selects everything from 'start_pos' to the end of the
  /// string. If new_len > len, len is extended to new_len.
  /// TODO: len should never be extended. This is not a trivial fix because UrlParser
  /// depends on the current behavior.
  inline StringValue Substring(int start_pos, int new_len) const;

  /// Trims leading and trailing spaces.
  inline StringValue Trim() const;

  /// Stores into '*sv' a StringVal built from this string's pointer and length.
  void ToStringVal(impala_udf::StringVal* sv) const {
    uint8_t* const bytes = reinterpret_cast<uint8_t*>(Ptr());
    *sv = impala_udf::StringVal(bytes, Len());
  }

  /// Treats up to the first 8 bytes of the string as a 64-bit unsigned integer. If
  /// len is less than 8, 8-len bytes of value '\0' are appended.
  uint64_t ToUInt64() const;

  /// Builds a StringValue from the pointer and length stored in 'sv'.
  static StringValue FromStringVal(const impala_udf::StringVal& sv) {
    char* const chars = reinterpret_cast<char*>(sv.ptr);
    return StringValue(chars, sv.len);
  }

  /// Pads the end of the char pointer with spaces. 'num_chars' is the number of used
  /// characters, 'cptr_len' is the length of 'cptr'.
  static inline void PadWithSpaces(char* cptr, int64_t cptr_len, int64_t num_chars);

  /// Returns the number of characters in a char array, ignoring trailing spaces.
  static inline int64_t UnpaddedCharLength(const char* cptr, int64_t len);

  /// Returns the largest smaller string 'result' such that 'result' < 'this'. An
  /// empty string is returned if no such string exists. The smallest non-empty string
  /// is "\x00" and the absolute smallest string is the empty string.
  std::string LargestSmallerString() const;

  /// Returns the least larger string 'result' such that 'this' < 'result'.
  std::string LeastLargerString() const;

  /// For C++/IR interop, we need to be able to look up types by name.
  static const char* LLVM_CLASS_NAME;

 private:
  friend Tuple;
  friend StringValueTest;

  /// !!! THIS IS UNSAFE TO CALL ON EXISTING STRINGVALUE OBJECTS !!!
  /// Only invoke it on newly created StringValues, e.g. on the target StringValue
  /// object of a deep copy operation.
  /// Tries to apply Small String Optimization if possible. Returns 'true' on success
  /// and 'false' otherwise; in the latter case the object remains unmodified.
  /// !!! THIS IS UNSAFE TO CALL ON EXISTING STRINGVALUE OBJECTS !!!
  bool Smallify() {
    return string_impl_.Smallify();
  }

  SmallableString string_impl_;
};
/// Boost picks this overload up by name, so it must be called 'hash_value'.
/// Feeds the string's pointer and length to HashUtil::Hash(), passing 0 as the third
/// argument (presumably the seed -- see HashUtil for its exact meaning).
inline std::size_t hash_value(const StringValue& v) {
  char* const bytes = v.Ptr();
  const int num_bytes = v.Len();
  return HashUtil::Hash(bytes, num_bytes, 0);
}
/// Stream-insertion operator for StringValue. Only declared here; the definition is
/// not part of this header.
std::ostream& operator<<(std::ostream& os, const StringValue& string_value);
}
/// With this specialization it is possible to use StringValues in hash-based std
/// containers (unordered_set, unordered_map) without the need of explicitly
/// specifying the Hash template parameter.
namespace std {

template <>
struct hash<impala::StringValue> {
  /// Delegates to impala::hash_value(), so std containers and boost hash the same
  /// StringValue through the same function.
  size_t operator()(const impala::StringValue& str) const {
    return impala::hash_value(str);
  }
};

}  // namespace std
#endif