be/src/util/symbols-util.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include "util/symbols-util.h"
 #include <cxxabi.h>
 #include <sstream>
 #include <boost/algorithm/string.hpp>
 #include <boost/algorithm/string/regex.hpp>
 #include <boost/preprocessor/stringize.hpp>

 #include "common/names.h"

 using boost::algorithm::split_regex;
 using boost::regex;
 using namespace impala;

 // For the rules about gcc-compatible name mangling, see:
 // http://mentorembedded.github.io/cxx-abi/abi.html#mangling
 // This implementation is *not* generally compatible. It is hard-coded to
 // only work with functions that implement the UDF or UDA signature. That is,
 // functions of the form:
 //   namespace::Function(impala_udf::FunctionContext*, const impala_udf::AnyVal&, etc)
 //
 // The general idea is to walk the types left to right and output them. This happens
 // in a single pass. User literals are output as <len><literal>. There are many reserved,
 // usually single character tokens for native types and specifying if something is a
 // pointer.
 //
 // One additional piece of complexity is that repeated literals are compressed out.
 // As literals are output, they are associated with an ID. The next time that
 // we encounter the literal, we output the ID instead.
 // We don't implement this generally since the way the literals are added to the
 // dictionary is much more general than we need.
 // e.g. for the literal ns1::ns2::class::type,
 // the dictionary would add 4 literals: 'ns1', 'ns1::ns2', 'ns1::ns2::class',
 //    'ns1::ns2::class::type'
 // We instead take some shortcuts since we know all the argument types are
 // types we define.

 // Mangled symbols must start with this. IsMangled() tests for this prefix and the
 // Mangle*() functions in this file write it as the first part of their output.
 // NOTE(review): this is a mutable pointer with external linkage; confirm no other
 // translation unit refers to it before tightening it to a file-local constant.
 const char* MANGLE_PREFIX = "_Z";

 bool SymbolsUtil::IsMangled(const string& symbol) {
   return strncmp(symbol.c_str(), MANGLE_PREFIX, strlen(MANGLE_PREFIX)) == 0;
 }

 // Returns the demangled form of 'name' as produced by abi::__cxa_demangle(). If
 // demangling reports a non-zero status, 'name' is returned unchanged.
 string SymbolsUtil::Demangle(const string& name) {
   int rc = 0;
   char* buf = abi::__cxa_demangle(name.c_str(), NULL, NULL, &rc);
   if (rc == 0) {
     // Copy the result out first; the buffer handed back by __cxa_demangle() is
     // released with free() below.
     string demangled(buf);
     free(buf);
     return demangled;
   }
   return name;
 }

 // Demangles 'symbol' and drops everything from the first '(' onwards, i.e. the
 // argument list (e.g. "foo(int)" => "foo"). If there is no '(' the whole demangled
 // string is returned.
 string SymbolsUtil::DemangleNoArgs(const string& symbol) {
   const string demangled = Demangle(symbol);
   const string::size_type args_begin = demangled.find('(');
   return demangled.substr(0, args_begin);
 }

 // Demangles 'symbol', drops the argument list and then keeps only the text after the
 // last ':' so that any namespace and/or class qualification is removed
 // (e.g. "impala::foo" => "foo").
 // TODO: fix for templates
 string SymbolsUtil::DemangleNameOnly(const string& symbol) {
   const string qualified = DemangleNoArgs(symbol);
   const string::size_type last_colon = qualified.find_last_of(':');
   // No scope separator at all: the name is already unqualified.
   if (last_colon == string::npos) return qualified;
   return qualified.substr(last_colon + 1);
 }

 // Appends <Length><String> to the stream.
 // e.g. Hello --> "5Hello"
 static void AppendMangledToken(const string& s, stringstream* out) {
   DCHECK(!s.empty());
   (*out) << s.size() << s;
 }

 // Writes the substitution reference for 'seq_id' to 'out', in the form
 // "S" <digits> "_". Id 0 has no digits at all; every other id is written as
 // (seq_id - 1) in base 36 using the characters 0-9 and A-Z.
 // e.g. seq_id 0: "S_"
 //      seq_id 1: "S0_"
 //      seq_id 2: "S1_"
 // 'seq_id' must not be negative (checked in debug builds).
 static void AppendSeqId(int seq_id, stringstream* out) {
   DCHECK_GE(seq_id, 0);
   if (seq_id == 0) {
     (*out) << "S_";
     return;
   }

   // Produce the base 36 digits of (seq_id - 1), least significant digit first,
   // filling the scratch buffer from its end towards its start.
   int remaining = seq_id - 1;
   char digits[10];
   char* const digits_end = digits + 10;
   char* cursor = digits_end;
   do {
     DCHECK(cursor > digits);
     const char digit = static_cast<char>(remaining % 36);
     *--cursor = (digit < 10 ? '0' + digit : 'A' + digit - 10);
     remaining /= 36;
   } while (remaining != 0);

   (*out) << "S";
   out->write(cursor, digits_end - cursor);
   (*out) << "_";
 }

 #define CASE_TYPE_APPEND_MANGLED_TOKEN(type_lit, type_val) \
     case type_lit: AppendMangledToken(#type_val, s); break;

 static void AppendAnyValType(int namespace_id, const ColumnType& type, stringstream* s) {
   (*s) << "N";
   // All the AnyVal types are in the impala_udf namespace, that token
   // already came with impala_udf::FunctionContext
   AppendSeqId(namespace_id, s);

   switch (type.type) {
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_BOOLEAN, BooleanVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_TINYINT, TinyIntVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_SMALLINT, SmallIntVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_INT, IntVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_DATE, DateVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_BIGINT, BigIntVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_FLOAT, FloatVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_DOUBLE, DoubleVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_STRING, StringVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_VARCHAR, StringVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_CHAR, StringVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_TIMESTAMP, TimestampVal)
     CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_DECIMAL, DecimalVal)

     default:
       DCHECK(false) << "NYI: " << type.DebugString();
   }
   (*s) << "E"; // end impala_udf namespace
 }

 // Builds the mangled symbol for a UDF/UDA style function named 'fn_name' (optionally
 // scoped with "::"). The parameter list that gets encoded is, in order:
 //   - impala_udf::FunctionContext*
 //   - for each entry of 'arg_types', a const reference to its impala_udf *Val class
 //   - if 'has_var_args' is true, the last entry of 'arg_types' is instead encoded as
 //     an int (the number of varargs) followed by a pointer to the const *Val class
 //   - if 'ret_arg_type' is not NULL, a trailing pointer to its *Val class
 // Returns the mangled name; it always starts with MANGLE_PREFIX.
 string SymbolsUtil::MangleUserFunction(const string& fn_name,
     const vector<ColumnType>& arg_types, bool has_var_args,
     ColumnType* ret_arg_type) {
   // We need to split fn_name by :: to separate scoping from tokens
   vector<string> name_tokens;
   split_regex(name_tokens, fn_name, regex("::"));

   // Mangled names use substitution as a builtin compression. The first time a token
   // is seen, we output the raw token string and store the index ("seq_id"). The
   // next time we see the same token, we output the index instead.
   // 'seq_id' is advanced by hand below as each piece is written; the values saved
   // from it are what later get passed to AppendSeqId().
   int seq_id = 0;

   // Sequence id for the impala_udf namespace token
   int impala_udf_seq_id = -1;

   stringstream ss;
   ss << MANGLE_PREFIX;
   if (name_tokens.size() > 1) {
     ss << "N";  // Start namespace
     seq_id += name_tokens.size() - 1; // Append for all the name space tokens.
   }
   for (int i = 0; i < name_tokens.size(); ++i) {
     AppendMangledToken(name_tokens[i], &ss);
   }
   if (name_tokens.size() > 1) ss << "E"; // End fn namespace
   ss << "PN"; // First argument and start of FunctionContext namespace
   AppendMangledToken("impala_udf", &ss);
   impala_udf_seq_id = seq_id++;
   AppendMangledToken("FunctionContext", &ss);
   ++seq_id;
   ss << "E"; // E indicates end of namespace

   // Maps each argument type that has been written out in full to the value 'seq_id'
   // had right after it was written, so a later occurrence can be emitted as a
   // substitution reference instead.
   // NOTE(review): the key is the PrimitiveType, so types that AppendAnyValType() maps
   // to the same *Val class (STRING/VARCHAR/CHAR) are tracked as distinct entries;
   // confirm whether callers can pass such a mix.
   map<PrimitiveType, int> argument_map;
   for (int i = 0; i < arg_types.size(); ++i) {
     int repeated_symbol_idx = -1; // Set to >0, if we've seen the symbol.
     if (argument_map.find(arg_types[i].type) != argument_map.end()) {
       repeated_symbol_idx = argument_map[arg_types[i].type];
     }

     if (has_var_args && i == arg_types.size() - 1) {
       // We always specify varargs as int32 followed by the type.
       ss << "i"; // The argument for the number of varargs.
       ss << "P"; // This indicates what follows is a ptr (that is the array of varargs)
       ++seq_id; // For "P"
       if (repeated_symbol_idx > 0) {
         // NOTE(review): the stored id appears to denote the complete "const T&" (or
         // "const T*") entry, which would make id - 1 the "const T" entry that the "P"
         // above applies to -- confirm against the ABI substitution rules.
         AppendSeqId(repeated_symbol_idx - 1, &ss);
         continue;
       }
     } else {
       if (repeated_symbol_idx > 0) {
         // The whole argument is replaced by the reference to the earlier occurrence;
         // nothing else is written and 'seq_id' is left untouched.
         AppendSeqId(repeated_symbol_idx, &ss);
         continue;
       }
       ss << "R"; // This indicates it is a reference type
       ++seq_id; // For R.
     }

     ss << "K"; // This indicates it is const
     seq_id += 2; // For impala_udf::*Val, which is two tokens.
     AppendAnyValType(impala_udf_seq_id, arg_types[i], &ss);
     argument_map[arg_types[i].type] = seq_id;
   }

   // Output return argument.
   if (ret_arg_type != NULL) {
     int repeated_symbol_idx = -1;
     if (argument_map.find(ret_arg_type->type) != argument_map.end()) {
       repeated_symbol_idx = argument_map[ret_arg_type->type];
     }
     ss << "P"; // Return argument is a pointer

     if (repeated_symbol_idx != -1) {
       // This is always last and a pointer type.
       // NOTE(review): the "- 2" presumably steps from the stored "const T&" entry
       // back to the plain "T" entry, since no "K" is written for the return
       // argument -- confirm.
       AppendSeqId(argument_map[ret_arg_type->type] - 2, &ss);
     } else {
       AppendAnyValType(impala_udf_seq_id, *ret_arg_type, &ss);
     }
   }

   return ss.str();
 }

 // Builds the mangled symbol for a prepare/close style function named 'fn_name'
 // (optionally scoped with "::"). The encoded parameter list is a pointer to
 // impala_udf::FunctionContext followed by a FunctionStateScope that is nested
 // inside a previously written name, which is referred to through a substitution.
 // Returns the mangled name; it always starts with MANGLE_PREFIX.
 string SymbolsUtil::ManglePrepareOrCloseFunction(const string& fn_name) {
   // We need to split fn_name by :: to separate scoping from tokens
   vector<string> name_tokens;
   split_regex(name_tokens, fn_name, regex("::"));

   // Mangled names use substitution as a builtin compression. 'seq_id' counts the
   // entries contributed by the scopes that enclose the function name, i.e. every
   // name token except the last one.
   int seq_id = 0;

   stringstream ss;
   ss << MANGLE_PREFIX;
   const bool is_scoped = name_tokens.size() > 1;
   if (is_scoped) {
     ss << "N";  // Start namespace
     seq_id += name_tokens.size() - 1; // Append for all the name space tokens.
   }
   for (size_t i = 0; i < name_tokens.size(); ++i) {
     AppendMangledToken(name_tokens[i], &ss);
   }
   if (is_scoped) ss << "E"; // End fn namespace

   ss << "PN"; // FunctionContext* argument and start of FunctionContext namespace
   AppendMangledToken("impala_udf", &ss);
   AppendMangledToken("FunctionContext", &ss);
   ss << "E"; // E indicates end of namespace

   // FunctionStateScope argument. The scope it is nested in was written just above,
   // so refer to it by substitution: the impala_udf token took id 'seq_id' and the
   // name built on top of it took 'seq_id + 1'. The id has to go through
   // AppendSeqId() because substitution ids are base 36; printing 'seq_id' as a
   // decimal number only gives the same text while it is below 10.
   ss << "N";
   AppendSeqId(seq_id + 1, &ss);
   AppendMangledToken("FunctionStateScope", &ss);
   ss << "E"; // E indicates end of namespace

   return ss.str();
 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include "util/symbols-util.h"
	#include <cxxabi.h>
	#include <sstream>
	#include <boost/algorithm/string.hpp>
	#include <boost/algorithm/string/regex.hpp>
	#include <boost/preprocessor/stringize.hpp>

	#include "common/names.h"

	using boost::algorithm::split_regex;
	using boost::regex;
	using namespace impala;

	// For the rules about gcc-compatible name mangling, see:
	// http://mentorembedded.github.io/cxx-abi/abi.html#mangling
	// This implementation is not generally compatible. It is hard-coded to
	// only work with functions that implement the UDF or UDA signature. That is,
	// functions of the form:
	// namespace::Function(impala_udf::FunctionContext*, const impala_udf::AnyVal&, etc)
	//
	// The general idea is to walk the types left to right and output them. This happens
	// in a single pass. User literals are output as <len><literal>. There are many reserved,
	// usually single character tokens for native types and specifying if something is a
	// pointer.
	//
	// One additional piece of complexity is that repeated literals are compressed out.
	// As literals are output, they are associated with an ID. The next time that
	// we encounter the literal, we output the ID instead.
	// We don't implement this generally since the way the literals are added to the
	// dictionary is much more general than we need.
	// e.g. for the literal ns1::ns2::class::type,
	// the dictionary would add 4 literals: 'ns1', 'ns1::ns2', 'ns1::ns2::class',
	// 'ns1::ns2::class::type'
	// We instead take some shortcuts since we know all the argument types are
	// types we define.

	// Mangled symbols must start with this. IsMangled() tests for this prefix and the
	// Mangle*() functions in this file write it as the first part of their output.
	// NOTE(review): this is a mutable pointer with external linkage; confirm no other
	// translation unit refers to it before tightening it to a file-local constant.
	const char* MANGLE_PREFIX = "_Z";

	bool SymbolsUtil::IsMangled(const string& symbol) {
	return strncmp(symbol.c_str(), MANGLE_PREFIX, strlen(MANGLE_PREFIX)) == 0;
	}

	// Returns the demangled form of 'name' as produced by abi::__cxa_demangle(). If
	// demangling reports a non-zero status, 'name' is returned unchanged.
	string SymbolsUtil::Demangle(const string& name) {
	  int rc = 0;
	  char* buf = abi::__cxa_demangle(name.c_str(), NULL, NULL, &rc);
	  if (rc == 0) {
	    // Copy the result out first; the buffer handed back by __cxa_demangle() is
	    // released with free() below.
	    string demangled(buf);
	    free(buf);
	    return demangled;
	  }
	  return name;
	}

	// Demangles 'symbol' and drops everything from the first '(' onwards, i.e. the
	// argument list (e.g. "foo(int)" => "foo"). If there is no '(' the whole demangled
	// string is returned.
	string SymbolsUtil::DemangleNoArgs(const string& symbol) {
	  const string demangled = Demangle(symbol);
	  const string::size_type args_begin = demangled.find('(');
	  return demangled.substr(0, args_begin);
	}

	// Demangles 'symbol', drops the argument list and then keeps only the text after the
	// last ':' so that any namespace and/or class qualification is removed
	// (e.g. "impala::foo" => "foo").
	// TODO: fix for templates
	string SymbolsUtil::DemangleNameOnly(const string& symbol) {
	  const string qualified = DemangleNoArgs(symbol);
	  const string::size_type last_colon = qualified.find_last_of(':');
	  // No scope separator at all: the name is already unqualified.
	  if (last_colon == string::npos) return qualified;
	  return qualified.substr(last_colon + 1);
	}

	// Appends <Length><String> to the stream.
	// e.g. Hello --> "5Hello"
	static void AppendMangledToken(const string& s, stringstream* out) {
	DCHECK(!s.empty());
	(*out) << s.size() << s;
	}

	// Writes the substitution reference for 'seq_id' to 'out', in the form
	// "S" <digits> "_". Id 0 has no digits at all; every other id is written as
	// (seq_id - 1) in base 36 using the characters 0-9 and A-Z.
	// e.g. seq_id 0: "S_"
	//      seq_id 1: "S0_"
	//      seq_id 2: "S1_"
	// 'seq_id' must not be negative (checked in debug builds).
	static void AppendSeqId(int seq_id, stringstream* out) {
	  DCHECK_GE(seq_id, 0);
	  if (seq_id == 0) {
	    (*out) << "S_";
	    return;
	  }

	  // Produce the base 36 digits of (seq_id - 1), least significant digit first,
	  // filling the scratch buffer from its end towards its start.
	  int remaining = seq_id - 1;
	  char digits[10];
	  char* const digits_end = digits + 10;
	  char* cursor = digits_end;
	  do {
	    DCHECK(cursor > digits);
	    const char digit = static_cast<char>(remaining % 36);
	    *--cursor = (digit < 10 ? '0' + digit : 'A' + digit - 10);
	    remaining /= 36;
	  } while (remaining != 0);

	  (*out) << "S";
	  out->write(cursor, digits_end - cursor);
	  (*out) << "_";
	}

	#define CASE_TYPE_APPEND_MANGLED_TOKEN(type_lit, type_val) \
	case type_lit: AppendMangledToken(#type_val, s); break;

	static void AppendAnyValType(int namespace_id, const ColumnType& type, stringstream* s) {
	(*s) << "N";
	// All the AnyVal types are in the impala_udf namespace, that token
	// already came with impala_udf::FunctionContext
	AppendSeqId(namespace_id, s);

	switch (type.type) {
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_BOOLEAN, BooleanVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_TINYINT, TinyIntVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_SMALLINT, SmallIntVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_INT, IntVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_DATE, DateVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_BIGINT, BigIntVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_FLOAT, FloatVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_DOUBLE, DoubleVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_STRING, StringVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_VARCHAR, StringVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_CHAR, StringVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_TIMESTAMP, TimestampVal)
	CASE_TYPE_APPEND_MANGLED_TOKEN(TYPE_DECIMAL, DecimalVal)

	default:
	DCHECK(false) << "NYI: " << type.DebugString();
	}
	(*s) << "E"; // end impala_udf namespace
	}

	// Builds the mangled symbol for a UDF/UDA style function named 'fn_name' (optionally
	// scoped with "::"). The parameter list that gets encoded is, in order:
	//   - impala_udf::FunctionContext*
	//   - for each entry of 'arg_types', a const reference to its impala_udf *Val class
	//   - if 'has_var_args' is true, the last entry of 'arg_types' is instead encoded as
	//     an int (the number of varargs) followed by a pointer to the const *Val class
	//   - if 'ret_arg_type' is not NULL, a trailing pointer to its *Val class
	// Returns the mangled name; it always starts with MANGLE_PREFIX.
	string SymbolsUtil::MangleUserFunction(const string& fn_name,
	const vector<ColumnType>& arg_types, bool has_var_args,
	ColumnType* ret_arg_type) {
	// We need to split fn_name by :: to separate scoping from tokens
	vector<string> name_tokens;
	split_regex(name_tokens, fn_name, regex("::"));

	// Mangled names use substitution as a builtin compression. The first time a token
	// is seen, we output the raw token string and store the index ("seq_id"). The
	// next time we see the same token, we output the index instead.
	// 'seq_id' is advanced by hand below as each piece is written; the values saved
	// from it are what later get passed to AppendSeqId().
	int seq_id = 0;

	// Sequence id for the impala_udf namespace token
	int impala_udf_seq_id = -1;

	stringstream ss;
	ss << MANGLE_PREFIX;
	if (name_tokens.size() > 1) {
	ss << "N"; // Start namespace
	seq_id += name_tokens.size() - 1; // Append for all the name space tokens.
	}
	for (int i = 0; i < name_tokens.size(); ++i) {
	AppendMangledToken(name_tokens[i], &ss);
	}
	if (name_tokens.size() > 1) ss << "E"; // End fn namespace
	ss << "PN"; // First argument and start of FunctionContext namespace
	AppendMangledToken("impala_udf", &ss);
	impala_udf_seq_id = seq_id++;
	AppendMangledToken("FunctionContext", &ss);
	++seq_id;
	ss << "E"; // E indicates end of namespace

	// Maps each argument type that has been written out in full to the value 'seq_id'
	// had right after it was written, so a later occurrence can be emitted as a
	// substitution reference instead.
	// NOTE(review): the key is the PrimitiveType, so types that AppendAnyValType() maps
	// to the same *Val class (STRING/VARCHAR/CHAR) are tracked as distinct entries;
	// confirm whether callers can pass such a mix.
	map<PrimitiveType, int> argument_map;
	for (int i = 0; i < arg_types.size(); ++i) {
	int repeated_symbol_idx = -1; // Set to >0, if we've seen the symbol.
	if (argument_map.find(arg_types[i].type) != argument_map.end()) {
	repeated_symbol_idx = argument_map[arg_types[i].type];
	}

	if (has_var_args && i == arg_types.size() - 1) {
	// We always specify varargs as int32 followed by the type.
	ss << "i"; // The argument for the number of varargs.
	ss << "P"; // This indicates what follows is a ptr (that is the array of varargs)
	++seq_id; // For "P"
	if (repeated_symbol_idx > 0) {
	// NOTE(review): the stored id appears to denote the complete "const T&" (or
	// "const T*") entry, which would make id - 1 the "const T" entry that the "P"
	// above applies to -- confirm against the ABI substitution rules.
	AppendSeqId(repeated_symbol_idx - 1, &ss);
	continue;
	}
	} else {
	if (repeated_symbol_idx > 0) {
	// The whole argument is replaced by the reference to the earlier occurrence;
	// nothing else is written and 'seq_id' is left untouched.
	AppendSeqId(repeated_symbol_idx, &ss);
	continue;
	}
	ss << "R"; // This indicates it is a reference type
	++seq_id; // For R.
	}

	ss << "K"; // This indicates it is const
	seq_id += 2; // For impala_udf::*Val, which is two tokens.
	AppendAnyValType(impala_udf_seq_id, arg_types[i], &ss);
	argument_map[arg_types[i].type] = seq_id;
	}

	// Output return argument.
	if (ret_arg_type != NULL) {
	int repeated_symbol_idx = -1;
	if (argument_map.find(ret_arg_type->type) != argument_map.end()) {
	repeated_symbol_idx = argument_map[ret_arg_type->type];
	}
	ss << "P"; // Return argument is a pointer

	if (repeated_symbol_idx != -1) {
	// This is always last and a pointer type.
	// NOTE(review): the "- 2" presumably steps from the stored "const T&" entry
	// back to the plain "T" entry, since no "K" is written for the return
	// argument -- confirm.
	AppendSeqId(argument_map[ret_arg_type->type] - 2, &ss);
	} else {
	AppendAnyValType(impala_udf_seq_id, *ret_arg_type, &ss);
	}
	}

	return ss.str();
	}

	// Builds the mangled symbol for a prepare/close style function named 'fn_name'
	// (optionally scoped with "::"). The encoded parameter list is a pointer to
	// impala_udf::FunctionContext followed by a FunctionStateScope that is nested
	// inside a previously written name, which is referred to through a substitution.
	// Returns the mangled name; it always starts with MANGLE_PREFIX.
	string SymbolsUtil::ManglePrepareOrCloseFunction(const string& fn_name) {
	  // We need to split fn_name by :: to separate scoping from tokens
	  vector<string> name_tokens;
	  split_regex(name_tokens, fn_name, regex("::"));

	  // Mangled names use substitution as a builtin compression. 'seq_id' counts the
	  // entries contributed by the scopes that enclose the function name, i.e. every
	  // name token except the last one.
	  int seq_id = 0;

	  stringstream ss;
	  ss << MANGLE_PREFIX;
	  const bool is_scoped = name_tokens.size() > 1;
	  if (is_scoped) {
	    ss << "N"; // Start namespace
	    seq_id += name_tokens.size() - 1; // Append for all the name space tokens.
	  }
	  for (size_t i = 0; i < name_tokens.size(); ++i) {
	    AppendMangledToken(name_tokens[i], &ss);
	  }
	  if (is_scoped) ss << "E"; // End fn namespace

	  ss << "PN"; // FunctionContext* argument and start of FunctionContext namespace
	  AppendMangledToken("impala_udf", &ss);
	  AppendMangledToken("FunctionContext", &ss);
	  ss << "E"; // E indicates end of namespace

	  // FunctionStateScope argument. The scope it is nested in was written just above,
	  // so refer to it by substitution: the impala_udf token took id 'seq_id' and the
	  // name built on top of it took 'seq_id + 1'. The id has to go through
	  // AppendSeqId() because substitution ids are base 36; printing 'seq_id' as a
	  // decimal number only gives the same text while it is below 10.
	  ss << "N";
	  AppendSeqId(seq_id + 1, &ss);
	  AppendMangledToken("FunctionStateScope", &ss);
	  ss << "E"; // E indicates end of namespace

	  return ss.str();
	}