// Source: be/src/util/string-util.cc - impala - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one
 // or more contributor license agreements.  See the NOTICE file
 // distributed with this work for additional information
 // regarding copyright ownership.  The ASF licenses this file
 // to you under the Apache License, Version 2.0 (the
 // "License"); you may not use this file except in compliance
 // with the License.  You may obtain a copy of the License at
 //
 //   http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing,
 // software distributed under the License is distributed on an
 // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 // KIND, either express or implied.  See the License for the
 // specific language governing permissions and limitations
 // under the License.

 #include <algorithm>

 #include "gutil/strings/substitute.h"
 #include "util/bit-util.h"
 #include "util/string-util.h"

 #include "common/names.h"

 namespace impala {

 // Stores in '*result' the prefix of 'str' that is at most 'max_length' bytes long.
 // A negative 'max_length' copies 'str' unchanged, which is what the previous
 // implementation did as a side effect of its integer conversions.
 // Always returns Status::OK().
 Status TruncateDown(const string& str, int32_t max_length, string* result) {
   DCHECK(result != nullptr);
   // Clamp in the size_t domain. Narrowing str.length() to int32_t (as was done
   // before) produces a bogus length for strings of 2 GiB or more, which could then
   // be returned without any truncation. substr() itself caps the count at the
   // string's length, so no explicit min() is needed.
   const size_t limit =
       max_length < 0 ? string::npos : static_cast<size_t>(max_length);
   *result = str.substr(0, limit);
   return Status::OK();
 }

 // Stores in '*result' a string of at most 'max_length' bytes that is not smaller than
 // 'str' when bytes are compared as unsigned values.
 //  - If 'str' already fits, or 'max_length' is negative (which has always behaved as
 //    "no limit" here), '*result' is a copy of 'str'.
 //  - Otherwise the 'max_length'-byte prefix is taken, its trailing 0xFF bytes are
 //    dropped, and the last remaining byte is incremented by one.
 // Returns an error Status when every byte of the prefix is 0xFF. That includes the
 // empty prefix produced by 'max_length' == 0, since no suitable string exists then.
 // '*result' is not modified on error.
 Status TruncateUp(const string& str, int32_t max_length, string* result) {
   DCHECK(result != nullptr);
   if (max_length < 0 || str.length() <= static_cast<size_t>(max_length)) {
     *result = str;
     return Status::OK();
   }

   // Find how many leading bytes of the prefix survive once the trailing run of 0xFF
   // bytes is removed. Bytes are inspected as unsigned char so the check does not
   // depend on whether plain char is signed on the target platform.
   size_t keep = static_cast<size_t>(max_length);
   while (keep > 0 && static_cast<unsigned char>(str[keep - 1]) == 0xFF) --keep;
   if (keep == 0) {
     // Nothing left to increment; this also covers max_length == 0, which previously
     // indexed the result at position -1.
     return Status("TruncateUp() couldn't increase string.");
   }

   string bumped = str.substr(0, keep);
   // The last kept byte is known not to be 0xFF, so the increment cannot wrap. The
   // arithmetic is done on unsigned char to stay clear of signed overflow.
   const unsigned char last = static_cast<unsigned char>(bumped[keep - 1]);
   bumped[keep - 1] = static_cast<char>(last + 1);
   result->swap(bumped);
   return Status::OK();
 }

 // Returns true if 'item' is exactly equal to one of the comma-delimited elements of
 // 'cs_list'. Elements are compared verbatim (no trimming). Scanning stops once the
 // start of the next element would be at or past the end of 'cs_list', so an empty
 // list, or the empty element after a trailing comma, never matches.
 bool CommaSeparatedContains(const std::string& cs_list, const std::string& item) {
   const size_t list_len = cs_list.size();
   size_t elem_begin = 0;
   while (elem_begin < list_len) {
     const size_t delim = cs_list.find(',', elem_begin);
     const bool is_last_elem = (delim == std::string::npos);
     // The last element runs to the end of the list; others stop before the comma.
     const size_t elem_len =
         is_last_elem ? list_len - elem_begin : delim - elem_begin;
     if (cs_list.compare(elem_begin, elem_len, item) == 0) return true;
     if (is_last_elem) break;
     elem_begin = delim + 1;
   }
   return false;
 }

 // Returns true if the last end.size() bytes of 'full_string' are equal to 'end'.
 // An empty 'end' matches every string.
 bool EndsWith(const std::string& full_string, const std::string& end) {
   const size_t total_len = full_string.size();
   const size_t suffix_len = end.size();
   // A suffix longer than the string itself can never match.
   if (total_len < suffix_len) return false;
   const auto tail_begin = full_string.begin() +
       static_cast<std::string::difference_type>(total_len - suffix_len);
   return std::equal(end.begin(), end.end(), tail_begin);
 }

 // Scans the byte range ['start', 'end') for a leading run of identifier bytes, i.e.
 // ASCII letters, ASCII digits and '_'.
 // Returns nullptr if the range is empty or its first byte is not an identifier byte.
 // Otherwise returns a pointer to the first byte after the run, which is 'end' when
 // the run extends to the end of the range.
 const uint8_t* FindEndOfIdentifier(const uint8_t* start, const uint8_t* end) {
   const auto is_identifier_byte = [](uint8_t b) {
     const bool lower = b >= 'a' && b <= 'z';
     const bool upper = b >= 'A' && b <= 'Z';
     const bool digit = b >= '0' && b <= '9';
     return lower || upper || digit || b == '_';
   };

   if (start == end || !is_identifier_byte(*start)) return nullptr;
   const uint8_t* cursor = start + 1;
   while (cursor != end && is_identifier_byte(*cursor)) ++cursor;
   return cursor;
 }

 // Walks 'ptr[0, len)' from the front, skipping 'index' characters, and returns the
 // byte offset reached. A byte for which BitUtil::IsUtf8StartByte() is true begins a
 // character spanning BitUtil::NumBytesInUtf8Encoding() bytes; any other byte found
 // where a character should begin is counted as one character by itself.
 // Returns 'len' if the buffer ends before 'index' characters have been skipped, or
 // if the last skipped character claims more bytes than remain.
 // 'index' must not be negative.
 int FindUtf8PosForward(const uint8_t* ptr, const int len, int index) {
   DCHECK_GE(index, 0);
   int pos = 0;
   // 'pos < len' is checked before every read of ptr[pos]. The previous nested loop
   // read ptr[pos] first and could touch one byte past the buffer when trailing
   // non-start bytes ran up to the end.
   while (index > 0 && pos < len) {
     const uint8_t lead = ptr[pos];
     pos += BitUtil::IsUtf8StartByte(lead) ? BitUtil::NumBytesInUtf8Encoding(lead) : 1;
     --index;
   }
   // A start byte near the end may announce more bytes than exist; clamp to 'len'.
   return std::min(pos, len);
 }

 // Scans 'ptr[0, len)' from the end and returns the byte offset of the character whose
 // 0-based position counted from the end is 'index' (index 0 is the last character).
 // Returns -1 if the buffer holds fewer than 'index' + 1 characters.
 // Bytes that cannot be attributed to a character are each counted as one character:
 //  - bytes for which BitUtil::IsUtf8StartByte() is false with no start byte anywhere
 //    before them, and
 //  - bytes that follow a start byte but lie beyond the length reported for it by
 //    BitUtil::NumBytesInUtf8Encoding().
 // 'index' must not be negative.
 int FindUtf8PosBackward(const uint8_t* ptr, const int len, int index) {
   DCHECK_GE(index, 0);
   // 'pos' is the byte currently being examined; 'last_pos' is one past the last byte
   // that has not yet been counted.
   int pos = len - 1;
   int last_pos = len;
   while (pos >= 0) {
     // Point to the start byte of the last character.
     while (pos >= 0 && !BitUtil::IsUtf8StartByte(ptr[pos])) --pos;
     if (pos < 0) {
       // Can't find any legal characters. Count each byte from last_pos as one character.
       // Note that index is 0-based.
       if (index < last_pos) return last_pos - index - 1;
       return -1;
     }
     // Get bytes length of the located character.
     int bytes_len = BitUtil::NumBytesInUtf8Encoding(ptr[pos]);
     // If there are not enough bytes after the first byte, i.e. last_pos-pos < bytes_len,
     // we consider the bytes belong to a malformed character, and count them as one
     // character.
     // 'malformed_bytes' is the number of bytes between the end of this character and
     // 'last_pos' that the start byte does not account for; max() clamps it to 0 when
     // the character is truncated.
     int malformed_bytes = max(last_pos - pos - bytes_len, 0);
     if (index < malformed_bytes) {
       // Count each malformed bytes as one character.
       return last_pos - index - 1;
     }
     // We found a legal character and 'malformed_bytes' malformed characters.
     // At this point, index >= malformed_bytes. So the lowest value of the updated index
     // is -1, which means 'pos' points at what we want.
     index -= malformed_bytes + 1;
     if (index < 0) return pos;
     // Everything from 'pos' onwards has been counted; continue with the bytes before it.
     last_pos = pos;
     --pos;
   }
   // The buffer was exhausted exactly at a character boundary without reaching 'index'.
   DCHECK_EQ(pos, -1);
   return -1;
 }

 // Moves the stream's put position one character backwards. Does nothing when
 // tellp() does not report a position greater than zero.
 void StringStreamPop::move_back() {
   const bool can_step_back = tellp() > 0;
   if (!can_step_back) return;
   seekp(-1, std::ios_base::cur);
 }

 }
	// Licensed to the Apache Software Foundation (ASF) under one
	// or more contributor license agreements. See the NOTICE file
	// distributed with this work for additional information
	// regarding copyright ownership. The ASF licenses this file
	// to you under the Apache License, Version 2.0 (the
	// "License"); you may not use this file except in compliance
	// with the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing,
	// software distributed under the License is distributed on an
	// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	// KIND, either express or implied. See the License for the
	// specific language governing permissions and limitations
	// under the License.

	#include <algorithm>

	#include "gutil/strings/substitute.h"
	#include "util/bit-util.h"
	#include "util/string-util.h"

	#include "common/names.h"

	namespace impala {

	// Stores in '*result' the prefix of 'str' that is at most 'max_length' bytes long.
	// A negative 'max_length' copies 'str' unchanged, which is what the previous
	// implementation did as a side effect of its integer conversions.
	// Always returns Status::OK().
	Status TruncateDown(const string& str, int32_t max_length, string* result) {
	  DCHECK(result != nullptr);
	  // Clamp in the size_t domain. Narrowing str.length() to int32_t (as was done
	  // before) produces a bogus length for strings of 2 GiB or more, which could then
	  // be returned without any truncation. substr() itself caps the count at the
	  // string's length, so no explicit min() is needed.
	  const size_t limit =
	      max_length < 0 ? string::npos : static_cast<size_t>(max_length);
	  *result = str.substr(0, limit);
	  return Status::OK();
	}

	// Stores in '*result' a string of at most 'max_length' bytes that is not smaller than
	// 'str' when bytes are compared as unsigned values.
	//  - If 'str' already fits, or 'max_length' is negative (which has always behaved as
	//    "no limit" here), '*result' is a copy of 'str'.
	//  - Otherwise the 'max_length'-byte prefix is taken, its trailing 0xFF bytes are
	//    dropped, and the last remaining byte is incremented by one.
	// Returns an error Status when every byte of the prefix is 0xFF. That includes the
	// empty prefix produced by 'max_length' == 0, since no suitable string exists then.
	// '*result' is not modified on error.
	Status TruncateUp(const string& str, int32_t max_length, string* result) {
	  DCHECK(result != nullptr);
	  if (max_length < 0 || str.length() <= static_cast<size_t>(max_length)) {
	    *result = str;
	    return Status::OK();
	  }

	  // Find how many leading bytes of the prefix survive once the trailing run of 0xFF
	  // bytes is removed. Bytes are inspected as unsigned char so the check does not
	  // depend on whether plain char is signed on the target platform.
	  size_t keep = static_cast<size_t>(max_length);
	  while (keep > 0 && static_cast<unsigned char>(str[keep - 1]) == 0xFF) --keep;
	  if (keep == 0) {
	    // Nothing left to increment; this also covers max_length == 0, which previously
	    // indexed the result at position -1.
	    return Status("TruncateUp() couldn't increase string.");
	  }

	  string bumped = str.substr(0, keep);
	  // The last kept byte is known not to be 0xFF, so the increment cannot wrap. The
	  // arithmetic is done on unsigned char to stay clear of signed overflow.
	  const unsigned char last = static_cast<unsigned char>(bumped[keep - 1]);
	  bumped[keep - 1] = static_cast<char>(last + 1);
	  result->swap(bumped);
	  return Status::OK();
	}

	// Returns true if 'item' is exactly equal to one of the comma-delimited elements of
	// 'cs_list'. Elements are compared verbatim (no trimming). Scanning stops once the
	// start of the next element would be at or past the end of 'cs_list', so an empty
	// list, or the empty element after a trailing comma, never matches.
	bool CommaSeparatedContains(const std::string& cs_list, const std::string& item) {
	  const size_t list_len = cs_list.size();
	  size_t elem_begin = 0;
	  while (elem_begin < list_len) {
	    const size_t delim = cs_list.find(',', elem_begin);
	    const bool is_last_elem = (delim == std::string::npos);
	    // The last element runs to the end of the list; others stop before the comma.
	    const size_t elem_len =
	        is_last_elem ? list_len - elem_begin : delim - elem_begin;
	    if (cs_list.compare(elem_begin, elem_len, item) == 0) return true;
	    if (is_last_elem) break;
	    elem_begin = delim + 1;
	  }
	  return false;
	}

	// Returns true if the last end.size() bytes of 'full_string' are equal to 'end'.
	// An empty 'end' matches every string.
	bool EndsWith(const std::string& full_string, const std::string& end) {
	  const size_t total_len = full_string.size();
	  const size_t suffix_len = end.size();
	  // A suffix longer than the string itself can never match.
	  if (total_len < suffix_len) return false;
	  const auto tail_begin = full_string.begin() +
	      static_cast<std::string::difference_type>(total_len - suffix_len);
	  return std::equal(end.begin(), end.end(), tail_begin);
	}

	// Scans the byte range ['start', 'end') for a leading run of identifier bytes, i.e.
	// ASCII letters, ASCII digits and '_'.
	// Returns nullptr if the range is empty or its first byte is not an identifier byte.
	// Otherwise returns a pointer to the first byte after the run, which is 'end' when
	// the run extends to the end of the range.
	const uint8_t* FindEndOfIdentifier(const uint8_t* start, const uint8_t* end) {
	  // The logical-or operators had been mangled into the escaped form "\|\|", which
	  // is not valid C++; the predicate below restores plain "||".
	  const auto is_identifier_byte = [](uint8_t b) {
	    const bool lower = b >= 'a' && b <= 'z';
	    const bool upper = b >= 'A' && b <= 'Z';
	    const bool digit = b >= '0' && b <= '9';
	    return lower || upper || digit || b == '_';
	  };

	  if (start == end || !is_identifier_byte(*start)) return nullptr;
	  const uint8_t* cursor = start + 1;
	  while (cursor != end && is_identifier_byte(*cursor)) ++cursor;
	  return cursor;
	}

	// Walks 'ptr[0, len)' from the front, skipping 'index' characters, and returns the
	// byte offset reached. A byte for which BitUtil::IsUtf8StartByte() is true begins a
	// character spanning BitUtil::NumBytesInUtf8Encoding() bytes; any other byte found
	// where a character should begin is counted as one character by itself.
	// Returns 'len' if the buffer ends before 'index' characters have been skipped, or
	// if the last skipped character claims more bytes than remain.
	// 'index' must not be negative.
	int FindUtf8PosForward(const uint8_t* ptr, const int len, int index) {
	  DCHECK_GE(index, 0);
	  int pos = 0;
	  // 'pos < len' is checked before every read of ptr[pos]. The previous nested loop
	  // read ptr[pos] first and could touch one byte past the buffer when trailing
	  // non-start bytes ran up to the end. It also contained the escaped token "\|\|",
	  // which is not valid C++.
	  while (index > 0 && pos < len) {
	    const uint8_t lead = ptr[pos];
	    pos += BitUtil::IsUtf8StartByte(lead) ? BitUtil::NumBytesInUtf8Encoding(lead) : 1;
	    --index;
	  }
	  // A start byte near the end may announce more bytes than exist; clamp to 'len'.
	  return std::min(pos, len);
	}

	// Scans 'ptr[0, len)' from the end and returns the byte offset of the character whose
	// 0-based position counted from the end is 'index' (index 0 is the last character).
	// Returns -1 if the buffer holds fewer than 'index' + 1 characters.
	// Bytes that cannot be attributed to a character are each counted as one character:
	//  - bytes for which BitUtil::IsUtf8StartByte() is false with no start byte anywhere
	//    before them, and
	//  - bytes that follow a start byte but lie beyond the length reported for it by
	//    BitUtil::NumBytesInUtf8Encoding().
	// 'index' must not be negative.
	int FindUtf8PosBackward(const uint8_t* ptr, const int len, int index) {
	DCHECK_GE(index, 0);
	// 'pos' is the byte currently being examined; 'last_pos' is one past the last byte
	// that has not yet been counted.
	int pos = len - 1;
	int last_pos = len;
	while (pos >= 0) {
	// Point to the start byte of the last character.
	while (pos >= 0 && !BitUtil::IsUtf8StartByte(ptr[pos])) --pos;
	if (pos < 0) {
	// Can't find any legal characters. Count each byte from last_pos as one character.
	// Note that index is 0-based.
	if (index < last_pos) return last_pos - index - 1;
	return -1;
	}
	// Get bytes length of the located character.
	int bytes_len = BitUtil::NumBytesInUtf8Encoding(ptr[pos]);
	// If there are not enough bytes after the first byte, i.e. last_pos-pos < bytes_len,
	// we consider the bytes belong to a malformed character, and count them as one
	// character.
	// 'malformed_bytes' is the number of bytes between the end of this character and
	// 'last_pos' that the start byte does not account for; max() clamps it to 0 when
	// the character is truncated.
	int malformed_bytes = max(last_pos - pos - bytes_len, 0);
	if (index < malformed_bytes) {
	// Count each malformed bytes as one character.
	return last_pos - index - 1;
	}
	// We found a legal character and 'malformed_bytes' malformed characters.
	// At this point, index >= malformed_bytes. So the lowest value of the updated index
	// is -1, which means 'pos' points at what we want.
	index -= malformed_bytes + 1;
	if (index < 0) return pos;
	// Everything from 'pos' onwards has been counted; continue with the bytes before it.
	last_pos = pos;
	--pos;
	}
	// The buffer was exhausted exactly at a character boundary without reaching 'index'.
	DCHECK_EQ(pos, -1);
	return -1;
	}

	// Moves the stream's put position one character backwards. Does nothing when
	// tellp() does not report a position greater than zero.
	void StringStreamPop::move_back() {
	  const bool can_step_back = tellp() > 0;
	  if (!can_step_back) return;
	  seekp(-1, std::ios_base::cur);
	}

	}