| // Copyright 2008 and onwards Google Inc. All rights reserved. |
| // |
| // Maintainer: Greg Miller <jgm@google.com> |
| |
| #include "gutil/strings/split.h" |
| |
| #include <assert.h> |
| #include <stdlib.h> |
| #include <string.h> |
| #include <iterator> |
| using std::back_insert_iterator; |
| using std::iterator_traits; |
| #include <limits> |
| using std::numeric_limits; |
| |
| using std::unordered_map; |
| using std::unordered_set; |
| |
| #include "gutil/integral_types.h" |
| #include <common/logging.h> |
| #include "gutil/logging-inl.h" |
| #include "gutil/macros.h" |
| #include "gutil/strtoint.h" |
| #include "gutil/strings/ascii_ctype.h" |
| #include "gutil/strings/util.h" |
| #include "gutil/hash/hash.h" |
| |
| // Implementations for some of the Split2 API. Much of the Split2 API is |
| // templated so it exists in header files, either strings/split.h or |
| // strings/split_iternal.h. |
| namespace strings { |
| namespace delimiter { |
| |
| namespace { |
| |
| // This GenericFind() template function encapsulates the finding algorithm |
| // shared between the Literal and AnyOf delimiters. The FindPolicy template |
| // parameter allows each delimiter to customize the actual find function to use |
| // and the length of the found delimiter. For example, the Literal delimiter |
| // will ultimately use StringPiece::find(), and the AnyOf delimiter will use |
| // StringPiece::find_first_of(). |
| template <typename FindPolicy> |
| StringPiece GenericFind( |
| StringPiece text, |
| StringPiece delimiter, |
| FindPolicy find_policy) { |
| if (delimiter.empty() && text.length() > 0) { |
| // Special case for empty string delimiters: always return a zero-length |
| // StringPiece referring to the item at position 1. |
| return StringPiece(text.begin() + 1, 0); |
| } |
| int found_pos = StringPiece::npos; |
| StringPiece found(text.end(), 0); // By default, not found |
| found_pos = find_policy.Find(text, delimiter); |
| if (found_pos != StringPiece::npos) { |
| found.set(text.data() + found_pos, find_policy.Length(delimiter)); |
| } |
| return found; |
| } |
| |
| // Finds using StringPiece::find(), therefore the length of the found delimiter |
| // is delimiter.length(). |
| struct LiteralPolicy { |
| int Find(StringPiece text, StringPiece delimiter) { |
| return text.find(delimiter); |
| } |
| int Length(StringPiece delimiter) { |
| return delimiter.length(); |
| } |
| }; |
| |
| // Finds using StringPiece::find_first_of(), therefore the length of the found |
| // delimiter is 1. |
| struct AnyOfPolicy { |
| size_t Find(StringPiece text, StringPiece delimiter) { |
| return text.find_first_of(delimiter); |
| } |
| int Length(StringPiece delimiter) { |
| return 1; |
| } |
| }; |
| |
| } // namespace |
| |
| // |
| // Literal |
| // |
| |
| Literal::Literal(StringPiece sp) : delimiter_(sp.ToString()) { |
| } |
| |
| StringPiece Literal::Find(StringPiece text) const { |
| return GenericFind(text, delimiter_, LiteralPolicy()); |
| } |
| |
| // |
| // AnyOf |
| // |
| |
| AnyOf::AnyOf(StringPiece sp) : delimiters_(sp.ToString()) { |
| } |
| |
| StringPiece AnyOf::Find(StringPiece text) const { |
| return GenericFind(text, delimiters_, AnyOfPolicy()); |
| } |
| |
| } // namespace delimiter |
| } // namespace strings |
| |
| // |
| // ==================== LEGACY SPLIT FUNCTIONS ==================== |
| // |
| |
| using ::strings::SkipEmpty; |
| using ::strings::delimiter::AnyOf; |
| using ::strings::delimiter::Limit; |
| |
| namespace { |
| |
| // Appends the results of a split to the specified container. This function has |
| // the following overloads: |
| // - vector<string> - for better performance |
| // - map<string, string> - to change append semantics |
| // - unordered_map<string, string> - to change append semantics |
| template <typename Container, typename Splitter> |
| void AppendToImpl(Container* container, Splitter splitter) { |
| Container c = splitter; // Calls implicit conversion operator. |
| std::copy(c.begin(), c.end(), std::inserter(*container, container->end())); |
| } |
| |
| // Overload of AppendToImpl() that is optimized for appending to vector<string>. |
| // This version eliminates a couple string copies by using a vector<StringPiece> |
| // as the intermediate container. |
| template <typename Splitter> |
| void AppendToImpl(vector<string>* container, Splitter splitter) { |
| vector<StringPiece> vsp = splitter; // Calls implicit conversion operator. |
| size_t container_size = container->size(); |
| container->resize(container_size + vsp.size()); |
| for (const auto& sp : vsp) { |
| sp.CopyToString(&(*container)[container_size++]); |
| } |
| } |
| |
| // Here we define two AppendToImpl() overloads for map<> and unordered_map<>. Both of |
| // these overloads call through to this AppendToMap() function. This is needed |
| // because inserting a duplicate key into a map does NOT overwrite the previous |
| // value, which was not the behavior of the split1 Split*() functions. Consider |
| // this example: |
| // |
| // map<string, string> m; |
| // m.insert(std::make_pair("a", "1")); |
| // m.insert(std::make_pair("a", "2")); // <-- doesn't actually insert. |
| // ASSERT_EQ(m["a"], "1"); // <-- "a" has value "1" not "2". |
| // |
| // Due to this behavior of map::insert, we can't rely on a normal std::inserter |
| // for a maps. Instead, maps and unordered_maps need to be special cased to implement |
| // the desired append semantic of inserting an existing value overwrites the |
| // previous value. |
| // |
| // This same issue is true with sets as well. However, since sets don't have a |
| // separate key and value, failing to overwrite an existing value in a set is |
| // fine because the value already exists in the set. |
| // |
| template <typename Map, typename Splitter> |
| void AppendToMap(Map* m, Splitter splitter) { |
| Map tmp = splitter; // Calls implicit conversion operator. |
| for (typename Map::const_iterator it = tmp.begin(); it != tmp.end(); ++it) { |
| (*m)[it->first] = it->second; |
| } |
| } |
| |
| template <typename Splitter> |
| void AppendToImpl(map<string, string>* map_container, Splitter splitter) { |
| AppendToMap(map_container, splitter); |
| } |
| |
| template <typename Splitter> |
| void AppendToImpl(unordered_map<string, string>* map_container, Splitter splitter) { |
| AppendToMap(map_container, splitter); |
| } |
| |
| // Appends the results of a call to strings::Split() to the specified container. |
| // This function is used with the new strings::Split() API to implement the |
| // append semantics of the legacy Split*() functions. |
| // |
| // The "Splitter" template parameter is intended to be a |
| // ::strings::internal::Splitter<>, which is the return value of a call to |
| // strings::Split(). Sample usage: |
| // |
| // vector<string> v; |
| // ... add stuff to "v" ... |
| // AppendTo(&v, strings::Split("a,b,c", ",")); |
| // |
| template <typename Container, typename Splitter> |
| void AppendTo(Container* container, Splitter splitter) { |
| if (container->empty()) { |
| // "Appending" to an empty container is by far the common case. For this we |
| // assign directly to the output container, which is more efficient than |
| // explicitly appending. |
| *container = splitter; // Calls implicit conversion operator. |
| } else { |
| AppendToImpl(container, splitter); |
| } |
| } |
| |
| } // anonymous namespace |
| |
| // Constants for ClipString() |
| static const int kMaxOverCut = 12; |
| // The ellipsis to add to strings that are too long |
| static const char kCutStr[] = "..."; |
| static const int kCutStrSize = sizeof(kCutStr) - 1; |
| |
| // ---------------------------------------------------------------------- |
| // Return the place to clip the string at, or -1 |
| // if the string doesn't need to be clipped. |
| // ---------------------------------------------------------------------- |
| static int ClipStringHelper(const char* str, int max_len, bool use_ellipsis) { |
| if (strlen(str) <= max_len) |
| return -1; |
| |
| int max_substr_len = max_len; |
| |
| if (use_ellipsis && max_len > kCutStrSize) { |
| max_substr_len -= kCutStrSize; |
| } |
| |
| const char* cut_by = |
| (max_substr_len < kMaxOverCut ? str : str + max_len - kMaxOverCut); |
| const char* cut_at = str + max_substr_len; |
| while (!ascii_isspace(*cut_at) && cut_at > cut_by) |
| cut_at--; |
| |
| if (cut_at == cut_by) { |
| // No space was found |
| return max_substr_len; |
| } else { |
| return cut_at-str; |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // ClipString |
| // Clip a string to a max length. We try to clip on a word boundary |
| // if this is possible. If the string is clipped, we append an |
| // ellipsis. |
| // ---------------------------------------------------------------------- |
| |
| void ClipString(char* str, int max_len) { |
| int cut_at = ClipStringHelper(str, max_len, true); |
| if (cut_at != -1) { |
| if (max_len > kCutStrSize) { |
| strcpy(str+cut_at, kCutStr); |
| } else { |
| strcpy(str+cut_at, ""); |
| } |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // ClipString |
| // Version of ClipString() that uses string instead of char*. |
| // ---------------------------------------------------------------------- |
| void ClipString(string* full_str, int max_len) { |
| int cut_at = ClipStringHelper(full_str->c_str(), max_len, true); |
| if (cut_at != -1) { |
| full_str->erase(cut_at); |
| if (max_len > kCutStrSize) { |
| full_str->append(kCutStr); |
| } |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitStringToIteratorAllowEmpty() |
| // Split a string using a character delimiter. Append the components |
| // to 'result'. If there are consecutive delimiters, this function |
| // will return corresponding empty strings. The string is split into |
| // at most the specified number of pieces greedily. This means that the |
| // last piece may possibly be split further. To split into as many pieces |
| // as possible, specify 0 as the number of pieces. |
| // |
| // If "full" is the empty string, yields an empty string as the only value. |
| // |
| // If "pieces" is negative for some reason, it returns the whole string |
| // ---------------------------------------------------------------------- |
| template <typename StringType, typename ITR> |
| static inline |
| void SplitStringToIteratorAllowEmpty(const StringType& full, |
| const char* delim, |
| int pieces, |
| ITR& result) { |
| string::size_type begin_index, end_index; |
| begin_index = 0; |
| |
| for (int i = 0; (i < pieces-1) || (pieces == 0); i++) { |
| end_index = full.find_first_of(delim, begin_index); |
| if (end_index == string::npos) { |
| *result++ = full.substr(begin_index); |
| return; |
| } |
| *result++ = full.substr(begin_index, (end_index - begin_index)); |
| begin_index = end_index + 1; |
| } |
| *result++ = full.substr(begin_index); |
| } |
| |
| void SplitStringIntoNPiecesAllowEmpty(const string& full, |
| const char* delim, |
| int pieces, |
| vector<string>* result) { |
| if (pieces == 0) { |
| // No limit when pieces is 0. |
| AppendTo(result, strings::Split(full, AnyOf(delim))); |
| } else { |
| // The input argument "pieces" specifies the max size that *result should |
| // be. However, the argument to the Limit() delimiter is the max number of |
| // delimiters, which should be one less than "pieces". Example: "a,b,c" has |
| // 3 pieces and two comma delimiters. |
| int limit = std::max(pieces - 1, 0); |
| AppendTo(result, strings::Split(full, Limit(AnyOf(delim), limit))); |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitStringAllowEmpty |
| // Split a string using a character delimiter. Append the components |
| // to 'result'. If there are consecutive delimiters, this function |
| // will return corresponding empty strings. |
| // ---------------------------------------------------------------------- |
| void SplitStringAllowEmpty(const string& full, const char* delim, |
| vector<string>* result) { |
| AppendTo(result, strings::Split(full, AnyOf(delim))); |
| } |
| |
| // If we know how much to allocate for a vector of strings, we can |
| // allocate the vector<string> only once and directly to the right size. |
| // This saves in between 33-66 % of memory space needed for the result, |
| // and runs faster in the microbenchmarks. |
| // |
| // The reserve is only implemented for the single character delim. |
| // |
| // The implementation for counting is cut-and-pasted from |
| // SplitStringToIteratorUsing. I could have written my own counting iterator, |
| // and use the existing template function, but probably this is more clear |
| // and more sure to get optimized to reasonable code. |
| static int CalculateReserveForVector(const string& full, const char* delim) { |
| int count = 0; |
| if (delim[0] != '\0' && delim[1] == '\0') { |
| // Optimize the common case where delim is a single character. |
| char c = delim[0]; |
| const char* p = full.data(); |
| const char* end = p + full.size(); |
| while (p != end) { |
| if (*p == c) { // This could be optimized with hasless(v,1) trick. |
| ++p; |
| } else { |
| while (++p != end && *p != c) { |
| // Skip to the next occurence of the delimiter. |
| } |
| ++count; |
| } |
| } |
| } |
| return count; |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitStringUsing() |
| // SplitStringToHashsetUsing() |
| // SplitStringToSetUsing() |
| // SplitStringToMapUsing() |
| // SplitStringToHashmapUsing() |
| // Split a string using a character delimiter. Append the components |
| // to 'result'. |
| // |
| // Note: For multi-character delimiters, this routine will split on *ANY* of |
| // the characters in the string, not the entire string as a single delimiter. |
| // ---------------------------------------------------------------------- |
| template <typename StringType, typename ITR> |
| static inline |
| void SplitStringToIteratorUsing(const StringType& full, |
| const char* delim, |
| ITR& result) { |
| // Optimize the common case where delim is a single character. |
| if (delim[0] != '\0' && delim[1] == '\0') { |
| char c = delim[0]; |
| const char* p = full.data(); |
| const char* end = p + full.size(); |
| while (p != end) { |
| if (*p == c) { |
| ++p; |
| } else { |
| const char* start = p; |
| while (++p != end && *p != c) { |
| // Skip to the next occurence of the delimiter. |
| } |
| *result++ = StringType(start, p - start); |
| } |
| } |
| return; |
| } |
| |
| string::size_type begin_index, end_index; |
| begin_index = full.find_first_not_of(delim); |
| while (begin_index != string::npos) { |
| end_index = full.find_first_of(delim, begin_index); |
| if (end_index == string::npos) { |
| *result++ = full.substr(begin_index); |
| return; |
| } |
| *result++ = full.substr(begin_index, (end_index - begin_index)); |
| begin_index = full.find_first_not_of(delim, end_index); |
| } |
| } |
| |
| void SplitStringUsing(const string& full, |
| const char* delim, |
| vector<string>* result) { |
| result->reserve(result->size() + CalculateReserveForVector(full, delim)); |
| std::back_insert_iterator< vector<string> > it(*result); |
| SplitStringToIteratorUsing(full, delim, it); |
| } |
| |
| void SplitStringToHashsetUsing(const string& full, const char* delim, |
| unordered_set<string>* result) { |
| AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
| } |
| |
| void SplitStringToSetUsing(const string& full, const char* delim, |
| set<string>* result) { |
| AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
| } |
| |
| void SplitStringToMapUsing(const string& full, const char* delim, |
| map<string, string>* result) { |
| AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
| } |
| |
| void SplitStringToHashmapUsing(const string& full, const char* delim, |
| unordered_map<string, string>* result) { |
| AppendTo(result, strings::Split(full, AnyOf(delim), strings::SkipEmpty())); |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitStringPieceToVector() |
| // Split a StringPiece into sub-StringPieces based on delim |
| // and appends the pieces to 'vec'. |
| // If omit empty strings is true, empty strings are omitted |
| // from the resulting vector. |
| // ---------------------------------------------------------------------- |
| void SplitStringPieceToVector(const StringPiece& full, |
| const char* delim, |
| vector<StringPiece>* vec, |
| bool omit_empty_strings) { |
| if (omit_empty_strings) { |
| AppendTo(vec, strings::Split(full, AnyOf(delim), SkipEmpty())); |
| } else { |
| AppendTo(vec, strings::Split(full, AnyOf(delim))); |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitUsing() |
| // Split a string using a string of delimiters, returning vector |
| // of strings. The original string is modified to insert nulls. |
| // ---------------------------------------------------------------------- |
| |
| vector<char*>* SplitUsing(char* full, const char* delim) { |
| auto vec = new vector<char*>; |
| SplitToVector(full, delim, vec, true); // Omit empty strings |
| return vec; |
| } |
| |
| void SplitToVector(char* full, const char* delim, vector<char*>* vec, |
| bool omit_empty_strings) { |
| char* next = full; |
| while ((next = gstrsep(&full, delim)) != nullptr) { |
| if (omit_empty_strings && next[0] == '\0') continue; |
| vec->push_back(next); |
| } |
| // Add last element (or full string if no delimeter found): |
| if (full != nullptr) { |
| vec->push_back(full); |
| } |
| } |
| |
| void SplitToVector(char* full, const char* delim, vector<const char*>* vec, |
| bool omit_empty_strings) { |
| char* next = full; |
| while ((next = gstrsep(&full, delim)) != nullptr) { |
| if (omit_empty_strings && next[0] == '\0') continue; |
| vec->push_back(next); |
| } |
| // Add last element (or full string if no delimeter found): |
| if (full != nullptr) { |
| vec->push_back(full); |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitOneStringToken() |
| // Mainly a stringified wrapper around strpbrk() |
| // ---------------------------------------------------------------------- |
| string SplitOneStringToken(const char ** source, const char * delim) { |
| assert(source); |
| assert(delim); |
| if (!*source) { |
| return string(); |
| } |
| const char * begin = *source; |
| // Optimize the common case where delim is a single character. |
| if (delim[0] != '\0' && delim[1] == '\0') { |
| *source = strchr(*source, delim[0]); |
| } else { |
| *source = strpbrk(*source, delim); |
| } |
| if (*source) { |
| return string(begin, (*source)++); |
| } else { |
| return string(begin); |
| } |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitStringWithEscaping() |
| // SplitStringWithEscapingAllowEmpty() |
| // SplitStringWithEscapingToSet() |
| // SplitStringWithWithEscapingToHashset() |
| // Split the string using the specified delimiters, taking escaping into |
| // account. '\' is not allowed as a delimiter. |
| // ---------------------------------------------------------------------- |
| template <typename ITR> |
| static inline |
| void SplitStringWithEscapingToIterator(const string& src, |
| const strings::CharSet& delimiters, |
| const bool allow_empty, |
| ITR* result) { |
| CHECK(!delimiters.Test('\\')) << "\\ is not allowed as a delimiter."; |
| CHECK(result); |
| string part; |
| |
| for (uint32 i = 0; i < src.size(); ++i) { |
| char current_char = src[i]; |
| if (delimiters.Test(current_char)) { |
| // Push substrings when we encounter delimiters. |
| if (allow_empty || !part.empty()) { |
| *(*result)++ = part; |
| part.clear(); |
| } |
| } else if (current_char == '\\' && ++i < src.size()) { |
| // If we see a backslash, the next delimiter or backslash is literal. |
| current_char = src[i]; |
| if (current_char != '\\' && !delimiters.Test(current_char)) { |
| // Don't honour unknown escape sequences: emit \f for \f. |
| part.push_back('\\'); |
| } |
| part.push_back(current_char); |
| } else { |
| // Otherwise, we have a normal character or trailing backslash. |
| part.push_back(current_char); |
| } |
| } |
| |
| // Push the trailing part. |
| if (allow_empty || !part.empty()) { |
| *(*result)++ = part; |
| } |
| } |
| |
| void SplitStringWithEscaping(const string &full, |
| const strings::CharSet& delimiters, |
| vector<string> *result) { |
| std::back_insert_iterator< vector<string> > it(*result); |
| SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
| } |
| |
| void SplitStringWithEscapingAllowEmpty(const string &full, |
| const strings::CharSet& delimiters, |
| vector<string> *result) { |
| std::back_insert_iterator< vector<string> > it(*result); |
| SplitStringWithEscapingToIterator(full, delimiters, true, &it); |
| } |
| |
| void SplitStringWithEscapingToSet(const string &full, |
| const strings::CharSet& delimiters, |
| set<string> *result) { |
| std::insert_iterator< set<string> > it(*result, result->end()); |
| SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
| } |
| |
| void SplitStringWithEscapingToHashset(const string &full, |
| const strings::CharSet& delimiters, |
| unordered_set<string> *result) { |
| std::insert_iterator< unordered_set<string> > it(*result, result->end()); |
| SplitStringWithEscapingToIterator(full, delimiters, false, &it); |
| } |
| |
| |
| // ---------------------------------------------------------------------- |
| // SplitOneIntToken() |
| // SplitOneInt32Token() |
| // SplitOneUint32Token() |
| // SplitOneInt64Token() |
| // SplitOneUint64Token() |
| // SplitOneDoubleToken() |
| // SplitOneFloatToken() |
| // SplitOneDecimalIntToken() |
| // SplitOneDecimalInt32Token() |
| // SplitOneDecimalUint32Token() |
| // SplitOneDecimalInt64Token() |
| // SplitOneDecimalUint64Token() |
| // SplitOneHexUint32Token() |
| // SplitOneHexUint64Token() |
| // Mainly a stringified wrapper around strtol/strtoul/strtod |
| // ---------------------------------------------------------------------- |
| // Curried functions for the macro below |
| static inline long strto32_0(const char * source, char ** end) { |
| return strto32(source, end, 0); } |
| static inline unsigned long strtou32_0(const char * source, char ** end) { |
| return strtou32(source, end, 0); } |
| static inline int64 strto64_0(const char * source, char ** end) { |
| return strto64(source, end, 0); } |
| static inline uint64 strtou64_0(const char * source, char ** end) { |
| return strtou64(source, end, 0); } |
| static inline long strto32_10(const char * source, char ** end) { |
| return strto32(source, end, 10); } |
| static inline unsigned long strtou32_10(const char * source, char ** end) { |
| return strtou32(source, end, 10); } |
| static inline int64 strto64_10(const char * source, char ** end) { |
| return strto64(source, end, 10); } |
| static inline uint64 strtou64_10(const char * source, char ** end) { |
| return strtou64(source, end, 10); } |
| static inline uint32 strtou32_16(const char * source, char ** end) { |
| return strtou32(source, end, 16); } |
| static inline uint64 strtou64_16(const char * source, char ** end) { |
| return strtou64(source, end, 16); } |
| |
| #define DEFINE_SPLIT_ONE_NUMBER_TOKEN(name, type, function) \ |
| bool SplitOne##name##Token(const char ** source, const char * delim, \ |
| type * value) { \ |
| assert(source); \ |
| assert(delim); \ |
| assert(value); \ |
| if (!*source) \ |
| return false; \ |
| /* Parse int */ \ |
| char * end; \ |
| *value = function(*source, &end); \ |
| if (end == *source) \ |
| return false; /* number not present at start of string */ \ |
| if (end[0] && !strchr(delim, end[0])) \ |
| return false; /* Garbage characters after int */ \ |
| /* Advance past token */ \ |
| if (*end != '\0') \ |
| *source = const_cast<const char *>(end+1); \ |
| else \ |
| *source = NULL; \ |
| return true; \ |
| } |
| |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int, int, strto32_0) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int32, int32, strto32_0) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint32, uint32, strtou32_0) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Int64, int64, strto64_0) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Uint64, uint64, strtou64_0) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Double, double, strtod) |
| #ifdef _MSC_VER // has no strtof() |
| // Note: does an implicit cast to float. |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtod) |
| #else |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(Float, float, strtof) |
| #endif |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt, int, strto32_10) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt32, int32, strto32_10) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint32, uint32, strtou32_10) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalInt64, int64, strto64_10) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(DecimalUint64, uint64, strtou64_10) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint32, uint32, strtou32_16) |
| DEFINE_SPLIT_ONE_NUMBER_TOKEN(HexUint64, uint64, strtou64_16) |
| |
| |
| // ---------------------------------------------------------------------- |
| // SplitRange() |
| // Splits a string of the form "<from>-<to>". Either or both can be |
| // missing. A raw number (<to>) is interpreted as "<to>-". Modifies |
| // parameters insofar as they're specified by the string. RETURNS |
| // true iff the input is a well-formed range. If it RETURNS false, |
| // from and to remain unchanged. The range in rangestr should be |
| // terminated either by "\0" or by whitespace. |
| // ---------------------------------------------------------------------- |
| |
| #define EOS(ch) ( (ch) == '\0' || ascii_isspace(ch) ) |
| bool SplitRange(const char* rangestr, int* from, int* to) { |
| // We need to do the const-cast because strol takes a char**, not const char** |
| char* val = const_cast<char*>(rangestr); |
| if (val == nullptr || EOS(*val)) return true; // we'll say nothingness is ok |
| |
| if ( val[0] == '-' && EOS(val[1]) ) // CASE 1: - |
| return true; // nothing changes |
| |
| if ( val[0] == '-' ) { // CASE 2: -<i2> |
| const int int2 = strto32(val+1, &val, 10); |
| if ( !EOS(*val) ) return false; // not a valid integer |
| *to = int2; // only "to" changes |
| return true; |
| |
| } else { |
| const int int1 = strto32(val, &val, 10); |
| if ( EOS(*val) || (*val == '-' && EOS(*(val+1))) ) { |
| *from = int1; // CASE 3: <i1>, same as <i1>- |
| return true; // only "from" changes |
| } else if (*val != '-') { // not a valid range |
| return false; |
| } |
| const int int2 = strto32(val+1, &val, 10); |
| if ( !EOS(*val) ) return false; // not a valid integer |
| *from = int1; // CASE 4: <i1>-<i2> |
| *to = int2; |
| return true; |
| } |
| } |
| |
| void SplitCSVLineWithDelimiter(char* line, char delimiter, |
| vector<char*>* cols) { |
| char* end_of_line = line + strlen(line); |
| char* end; |
| char* start; |
| |
| for (; line < end_of_line; line++) { |
| // Skip leading whitespace, unless said whitespace is the delimiter. |
| while (ascii_isspace(*line) && *line != delimiter) |
| ++line; |
| |
| if (*line == '"' && delimiter == ',') { // Quoted value... |
| start = ++line; |
| end = start; |
| for (; *line; line++) { |
| if (*line == '"') { |
| line++; |
| if (*line != '"') // [""] is an escaped ["] |
| break; // but just ["] is end of value |
| } |
| *end++ = *line; |
| } |
| // All characters after the closing quote and before the comma |
| // are ignored. |
| line = strchr(line, delimiter); |
| if (!line) line = end_of_line; |
| } else { |
| start = line; |
| line = strchr(line, delimiter); |
| if (!line) line = end_of_line; |
| // Skip all trailing whitespace, unless said whitespace is the delimiter. |
| for (end = line; end > start; --end) { |
| if (!ascii_isspace(end[-1]) || end[-1] == delimiter) |
| break; |
| } |
| } |
| const bool need_another_column = |
| (*line == delimiter) && (line == end_of_line - 1); |
| *end = '\0'; |
| cols->push_back(start); |
| // If line was something like [paul,] (comma is the last character |
| // and is not proceeded by whitespace or quote) then we are about |
| // to eliminate the last column (which is empty). This would be |
| // incorrect. |
| if (need_another_column) |
| cols->push_back(end); |
| |
| assert(*line == '\0' || *line == delimiter); |
| } |
| } |
| |
| void SplitCSVLine(char* line, vector<char*>* cols) { |
| SplitCSVLineWithDelimiter(line, ',', cols); |
| } |
| |
| void SplitCSVLineWithDelimiterForStrings(const string &line, |
| char delimiter, |
| vector<string> *cols) { |
| // Unfortunately, the interface requires char* instead of const char* |
| // which requires copying the string. |
| char *cline = strndup_with_new(line.c_str(), line.size()); |
| vector<char *> v; |
| SplitCSVLineWithDelimiter(cline, delimiter, &v); |
| for (vector<char*>::const_iterator ci = v.begin(); ci != v.end(); ++ci) { |
| cols->push_back(*ci); |
| } |
| delete[] cline; |
| } |
| |
| // ---------------------------------------------------------------------- |
| namespace { |
| |
| // Helper class used by SplitStructuredLineInternal. |
| class ClosingSymbolLookup { |
| public: |
| explicit ClosingSymbolLookup(const char* symbol_pairs) |
| : closing_(), |
| valid_closing_() { |
| // Initialize the opening/closing arrays. |
| for (const char* symbol = symbol_pairs; *symbol != 0; ++symbol) { |
| unsigned char opening = *symbol; |
| ++symbol; |
| // If the string ends before the closing character has been found, |
| // use the opening character as the closing character. |
| unsigned char closing = *symbol != 0 ? *symbol : opening; |
| closing_[opening] = closing; |
| valid_closing_[closing] = true; |
| if (*symbol == 0) break; |
| } |
| } |
| |
| // Returns the closing character corresponding to an opening one, |
| // or 0 if the argument is not an opening character. |
| char GetClosingChar(char opening) const { |
| return closing_[static_cast<unsigned char>(opening)]; |
| } |
| |
| // Returns true if the argument is a closing character. |
| bool IsClosing(char c) const { |
| return valid_closing_[static_cast<unsigned char>(c)]; |
| } |
| |
| private: |
| // Maps an opening character to its closing. If the entry contains 0, |
| // the character is not in the opening set. |
| char closing_[256]; |
| // Valid closing characters. |
| bool valid_closing_[256]; |
| |
| DISALLOW_COPY_AND_ASSIGN(ClosingSymbolLookup); |
| }; |
| |
| char* SplitStructuredLineInternal(char* line, |
| char delimiter, |
| const char* symbol_pairs, |
| vector<char*>* cols, |
| bool with_escapes) { |
| ClosingSymbolLookup lookup(symbol_pairs); |
| |
| // Stack of symbols expected to close the current opened expressions. |
| vector<char> expected_to_close; |
| bool in_escape = false; |
| |
| CHECK(cols); |
| cols->push_back(line); |
| char* current; |
| for (current = line; *current; ++current) { |
| char c = *current; |
| if (in_escape) { |
| in_escape = false; |
| } else if (with_escapes && c == '\\') { |
| // We are escaping the next character. Note the escape still appears |
| // in the output. |
| in_escape = true; |
| } else if (expected_to_close.empty() && c == delimiter) { |
| // We don't have any open expression, this is a valid separator. |
| *current = 0; |
| cols->push_back(current + 1); |
| } else if (!expected_to_close.empty() && c == expected_to_close.back()) { |
| // Can we close the currently open expression? |
| expected_to_close.pop_back(); |
| } else if (lookup.GetClosingChar(c)) { |
| // If this is an opening symbol, we open a new expression and push |
| // the expected closing symbol on the stack. |
| expected_to_close.push_back(lookup.GetClosingChar(c)); |
| } else if (lookup.IsClosing(c)) { |
| // Error: mismatched closing symbol. |
| return current; |
| } |
| } |
| if (!expected_to_close.empty()) { |
| return current; // Missing closing symbol(s) |
| } |
| return nullptr; // Success |
| } |
| |
| bool SplitStructuredLineInternal(StringPiece line, |
| char delimiter, |
| const char* symbol_pairs, |
| vector<StringPiece>* cols, |
| bool with_escapes) { |
| ClosingSymbolLookup lookup(symbol_pairs); |
| |
| // Stack of symbols expected to close the current opened expressions. |
| vector<char> expected_to_close; |
| bool in_escape = false; |
| |
| CHECK_NOTNULL(cols); |
| cols->push_back(line); |
| for (int i = 0; i < line.size(); ++i) { |
| char c = line[i]; |
| if (in_escape) { |
| in_escape = false; |
| } else if (with_escapes && c == '\\') { |
| // We are escaping the next character. Note the escape still appears |
| // in the output. |
| in_escape = true; |
| } else if (expected_to_close.empty() && c == delimiter) { |
| // We don't have any open expression, this is a valid separator. |
| cols->back().remove_suffix(line.size() - i); |
| cols->push_back(StringPiece(line, i + 1)); |
| } else if (!expected_to_close.empty() && c == expected_to_close.back()) { |
| // Can we close the currently open expression? |
| expected_to_close.pop_back(); |
| } else if (lookup.GetClosingChar(c)) { |
| // If this is an opening symbol, we open a new expression and push |
| // the expected closing symbol on the stack. |
| expected_to_close.push_back(lookup.GetClosingChar(c)); |
| } else if (lookup.IsClosing(c)) { |
| // Error: mismatched closing symbol. |
| return false; |
| } |
| } |
| if (!expected_to_close.empty()) { |
| return false; // Missing closing symbol(s) |
| } |
| return true; // Success |
| } |
| |
| } // anonymous namespace |
| |
| char* SplitStructuredLine(char* line, |
| char delimiter, |
| const char *symbol_pairs, |
| vector<char*>* cols) { |
| return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
| false); |
| } |
| |
| bool SplitStructuredLine(StringPiece line, |
| char delimiter, |
| const char* symbol_pairs, |
| vector<StringPiece>* cols) { |
| return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
| false); |
| } |
| |
| char* SplitStructuredLineWithEscapes(char* line, |
| char delimiter, |
| const char *symbol_pairs, |
| vector<char*>* cols) { |
| return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
| true); |
| } |
| |
| bool SplitStructuredLineWithEscapes(StringPiece line, |
| char delimiter, |
| const char* symbol_pairs, |
| vector<StringPiece>* cols) { |
| return SplitStructuredLineInternal(line, delimiter, symbol_pairs, cols, |
| true); |
| } |
| |
| |
| // ---------------------------------------------------------------------- |
| // SplitStringIntoKeyValues() |
| // ---------------------------------------------------------------------- |
| bool SplitStringIntoKeyValues(const string& line, |
| const string& key_value_delimiters, |
| const string& value_value_delimiters, |
| string *key, vector<string> *values) { |
| key->clear(); |
| values->clear(); |
| |
| // find the key string |
| size_t end_key_pos = line.find_first_of(key_value_delimiters); |
| if (end_key_pos == string::npos) { |
| VLOG(1) << "cannot parse key from line: " << line; |
| return false; // no key |
| } |
| key->assign(line, 0, end_key_pos); |
| |
| // find the values string |
| string remains(line, end_key_pos, line.size() - end_key_pos); |
| size_t begin_values_pos = remains.find_first_not_of(key_value_delimiters); |
| if (begin_values_pos == string::npos) { |
| VLOG(1) << "cannot parse value from line: " << line; |
| return false; // no value |
| } |
| string values_string(remains, |
| begin_values_pos, |
| remains.size() - begin_values_pos); |
| |
| // construct the values vector |
| if (value_value_delimiters.empty()) { // one value |
| values->push_back(values_string); |
| } else { // multiple values |
| SplitStringUsing(values_string, value_value_delimiters.c_str(), values); |
| if (values->size() < 1) { |
| VLOG(1) << "cannot parse value from line: " << line; |
| return false; // no value |
| } |
| } |
| return true; |
| } |
| |
| bool SplitStringIntoKeyValuePairs(const string& line, |
| const string& key_value_delimiters, |
| const string& key_value_pair_delimiters, |
| vector<pair<string, string> >* kv_pairs) { |
| kv_pairs->clear(); |
| |
| vector<string> pairs; |
| SplitStringUsing(line, key_value_pair_delimiters.c_str(), &pairs); |
| |
| bool success = true; |
| for (const auto& pair : pairs) { |
| string key; |
| vector<string> value; |
| if (!SplitStringIntoKeyValues(pair, |
| key_value_delimiters, |
| "", &key, &value)) { |
| // Don't return here, to allow for keys without associated |
| // values; just record that our split failed. |
| success = false; |
| } |
| // we expect atmost one value because we passed in an empty vsep to |
| // SplitStringIntoKeyValues |
| DCHECK_LE(value.size(), 1); |
| kv_pairs->push_back(make_pair(key, value.empty()? "" : value[0])); |
| } |
| return success; |
| } |
| |
| // ---------------------------------------------------------------------- |
| // SplitLeadingDec32Values() |
| // SplitLeadingDec64Values() |
| // A simple parser for space-separated decimal int32/int64 values. |
| // Appends parsed integers to the end of the result vector, stopping |
| // at the first unparsable spot. Skips past leading and repeated |
| // whitespace (does not consume trailing whitespace), and returns |
| // a pointer beyond the last character parsed. |
| // -------------------------------------------------------------------- |
| const char* SplitLeadingDec32Values(const char *str, vector<int32> *result) { |
| for (;;) { |
| char *end = nullptr; |
| long value = strtol(str, &end, 10); |
| if (end == str) |
| break; |
| // Limit long values to int32 min/max. Needed for lp64. |
| if (value > numeric_limits<int32>::max()) { |
| value = numeric_limits<int32>::max(); |
| } else if (value < numeric_limits<int32>::min()) { |
| value = numeric_limits<int32>::min(); |
| } |
| result->push_back(value); |
| str = end; |
| if (!ascii_isspace(*end)) |
| break; |
| } |
| return str; |
| } |
| |
| const char* SplitLeadingDec64Values(const char *str, vector<int64> *result) { |
| for (;;) { |
| char *end = nullptr; |
| const int64 value = strtoll(str, &end, 10); |
| if (end == str) |
| break; |
| result->push_back(value); |
| str = end; |
| if (!ascii_isspace(*end)) |
| break; |
| } |
| return str; |
| } |
| |
| void SplitStringToLines(const char* full, |
| int max_len, |
| int num_lines, |
| vector<string>* result) { |
| if (max_len <= 0) { |
| return; |
| } |
| int pos = 0; |
| for (int i = 0; (i < num_lines || num_lines <= 0); i++) { |
| int cut_at = ClipStringHelper(full+pos, max_len, (i == num_lines - 1)); |
| if (cut_at == -1) { |
| result->push_back(string(full+pos)); |
| return; |
| } |
| result->push_back(string(full+pos, cut_at)); |
| if (i == num_lines - 1 && max_len > kCutStrSize) { |
| result->at(i).append(kCutStr); |
| } |
| pos += cut_at; |
| } |
| } |