blob: 6e48db7cc052476e5d1bd0826f90ee680a41051c [file] [log] [blame]
// Copyright 2008 and onwards Google, Inc.
//
// #status: RECOMMENDED
// #category: operations on strings
// #summary: Functions for splitting strings into substrings.
//
// This file contains functions for splitting strings. The new and recommended
// API for string splitting is the strings::Split() function. The old API is a
// large collection of standalone functions declared at the bottom of this file
// in the global scope.
//
// TODO(user): Rough migration plan from old API to new API
// (1) Add comments to old Split*() functions showing how to do the same things
// with the new API.
// (2) Reimplement some of the old Split*() functions in terms of the new
// Split() API. This will allow deletion of code in split.cc.
// (3) (Optional) Replace old Split*() API calls at call sites with calls to new
// Split() API.
//
#ifndef STRINGS_SPLIT_H_
#define STRINGS_SPLIT_H_
#include <stddef.h>
#include <algorithm>
using std::copy;
using std::max;
using std::min;
using std::reverse;
using std::sort;
using std::swap;
#include <iterator>
using std::back_insert_iterator;
using std::iterator_traits;
#include <map>
using std::map;
using std::multimap;
#include <set>
using std::multiset;
using std::set;
#include <string>
using std::string;
#include <utility>
using std::make_pair;
using std::pair;
#include <vector>
using std::vector;
#include <unordered_map>
#include <unordered_set>
#include <common/logging.h>
#include "gutil/integral_types.h"
#include "gutil/logging-inl.h"
#include "gutil/strings/charset.h"
#include "gutil/strings/split_internal.h"
#include "gutil/strings/stringpiece.h"
#include "gutil/strings/strip.h"
namespace strings {
// The new Split API
// aka Split2
// aka strings::Split()
//
// This string splitting API consists of a Split() function in the ::strings
// namespace and a handful of delimiter objects in the ::strings::delimiter
// namespace (more on delimiter objects below). The Split() function always
// takes two arguments: the text to be split and the delimiter on which to split
// the text. An optional third argument may also be given, which is a Predicate
// functor that will be used to filter the results, e.g., to skip empty strings
// (more on predicates below). The Split() function adapts the returned
// collection to the type specified by the caller.
//
// Example 1:
// // Splits the given string on commas. Returns the results in a
// // vector of strings.
// vector<string> v = strings::Split("a,b,c", ",");
// assert(v.size() == 3);
//
// Example 2:
// // By default, empty strings are *included* in the output. See the
// // strings::SkipEmpty predicate below to omit them.
// vector<string> v = strings::Split("a,b,,c", ",");
// assert(v.size() == 4); // "a", "b", "", "c"
// v = strings::Split("", ",");
// assert(v.size() == 1); // v contains a single ""
//
// Example 3:
// // Splits the string as in the previous example, except that the results
// // are returned as StringPiece objects. Note that because we are storing
// // the results within StringPiece objects, we have to ensure that the input
// // string outlives any results.
// vector<StringPiece> v = strings::Split("a,b,c", ",");
// assert(v.size() == 3);
//
// Example 4:
// // Stores results in a set<string>.
// set<string> a = strings::Split("a,b,c,a,b,c", ",");
// assert(a.size() == 3);
//
// Example 5:
// // Stores results in a map. The map implementation assumes that the input
// // is provided as a series of key/value pairs. For example, the 0th element
// // resulting from the split will be stored as a key to the 1st element. If
// // an odd number of elements are resolved, the last element is paired with
// // a default-constructed value (e.g., empty string).
// map<string, string> m = strings::Split("a,b,c", ",");
// assert(m.size() == 2);
// assert(m["a"] == "b");
// assert(m["c"] == ""); // last component value equals ""
//
// Example 6:
// // Splits on the empty string, which results in each character of the input
// // string becoming one element in the output collection.
// vector<string> v = strings::Split("abc", "");
// assert(v.size() == 3);
//
// Example 7:
// // Stores first two split strings as the members in an std::pair.
// std::pair<string, string> p = strings::Split("a,b,c", ",");
// EXPECT_EQ("a", p.first);
// EXPECT_EQ("b", p.second);
// // "c" is omitted because std::pair can hold only two elements.
//
// As illustrated above, the Split() function adapts the returned collection to
// the type specified by the caller. The returned collections may contain
// string, StringPiece, Cord, or any object that has a constructor (explicit or
// not) that takes a single StringPiece argument. This pattern works for all
// standard STL containers including vector, list, deque, set, multiset, map,
// multimap, unordered_set and unordered_map, and even std::pair which is not
// actually a container.
//
// Splitting to std::pair is an interesting case because it can hold only two
// elements and is not a collection type. When splitting to an std::pair the
// first two split strings become the std::pair's .first and .second members
// respectively. The remaining split substrings are discarded. If there are
// fewer than two split substrings, the empty string is used for the
// corresponding std::pair member.
//
// The strings::Split() function can be used multiple times to perform more
// complicated splitting logic, such as intelligently parsing key-value pairs.
// For example
//
// // The input string "a=b=c,d=e,f=,g" becomes
// // { "a" => "b=c", "d" => "e", "f" => "", "g" => "" }
// map<string, string> m;
// for (StringPiece sp : strings::Split("a=b=c,d=e,f=,g", ",")) {
// m.insert(strings::Split(sp, strings::delimiter::Limit("=", 1)));
// }
// EXPECT_EQ("b=c", m.find("a")->second);
// EXPECT_EQ("e", m.find("d")->second);
// EXPECT_EQ("", m.find("f")->second);
// EXPECT_EQ("", m.find("g")->second);
//
// The above example stores the results in an std::map. But depending on your
// data requirements, you can just as easily store the results in an
// std::multimap or even a vector<std::pair<>>.
//
//
// Delimiters
//
// The Split() function also takes a second argument that is a delimiter. This
// delimiter is actually an object that defines the boundaries between elements
// in the provided input. If a string (const char*, ::string, or StringPiece) is
// passed in place of an explicit Delimiter object, the argument is implicitly
// converted to a ::strings::delimiter::Literal.
//
// With this split API comes the formal concept of a Delimiter (big D). A
// Delimiter is an object with a Find() function that knows how to find the
// first occurrence of itself in a given StringPiece. Models of the Delimiter
// concept represent specific kinds of delimiters, such as single characters,
// substrings, or even regular expressions.
//
// The following Delimiter objects are provided as part of the Split() API:
//
// - Literal (default)
// - AnyOf
// - Limit
//
// The following are examples of using some provided Delimiter objects:
//
// Example 1:
// // Because a string literal is converted to a strings::delimiter::Literal,
// // the following two splits are equivalent.
// vector<string> v1 = strings::Split("a,b,c", ","); // (1)
// using ::strings::delimiter::Literal;
// vector<string> v2 = strings::Split("a,b,c", Literal(",")); // (2)
//
// Example 2:
// // Splits on any of the characters specified in the delimiter string.
// using ::strings::delimiter::AnyOf;
// vector<string> v = strings::Split("a,b;c-d", AnyOf(",;-"));
// assert(v.size() == 4);
//
// Example 3:
// // Uses the Limit meta-delimiter to limit the number of matches a delimiter
// // can have. In this case, the delimiter of a Literal comma is limited to
// // matching at most one time. The last element in the returned
// // collection will contain all unsplit pieces, which may contain instances
// // of the delimiter.
// using ::strings::delimiter::Limit;
// vector<string> v = strings::Split("a,b,c", Limit(",", 1));
// assert(v.size() == 2); // Limited to 1 delimiter; so two elements found
// assert(v[0] == "a");
// assert(v[1] == "b,c");
//
//
// Predicates
//
// Predicates can filter the results of a Split() operation by determining
// whether or not a resultant element is included in the result set. A predicate
// may be passed as an *optional* third argument to the Split() function.
//
// Predicates are unary functions (or functors) that take a single StringPiece
// argument and return bool indicating whether the argument should be included
// (true) or excluded (false).
//
// One example where this is useful is when filtering out empty substrings. By
// default, empty substrings may be returned by strings::Split(), which is
// similar to the way split functions work in other programming languages. For
// example:
//
// // Empty strings *are* included in the returned collection.
// vector<string> v = strings::Split(",a,,b,", ",");
// assert(v.size() == 5); // v[0] == "", v[1] == "a", v[2] == "", ...
//
// These empty strings can be filtered out of the results by simply passing the
// provided SkipEmpty predicate as the third argument to the Split() function.
// SkipEmpty does not consider a string containing all whitespace to be empty.
// For that behavior use the SkipWhitespace predicate. For example:
//
// Example 1:
// // Uses SkipEmpty to omit empty strings. Strings containing whitespace are
// // not empty and are therefore not skipped.
// using strings::SkipEmpty;
// vector<string> v = strings::Split(",a, ,b,", ",", SkipEmpty());
// assert(v.size() == 3);
// assert(v[0] == "a");
// assert(v[1] == " "); // <-- The whitespace makes the string not empty.
// assert(v[2] == "b");
//
// Example 2:
// // Uses SkipWhitespace to skip all strings that are either empty or contain
// // only whitespace.
// using strings::SkipWhitespace;
// vector<string> v = strings::Split(",a, ,b,", ",", SkipWhitespace());
// assert(v.size() == 2);
// assert(v[0] == "a");
// assert(v[1] == "b");
//
//
// Differences between Split1 and Split2
//
// Split2 is the strings::Split() API described above. Split1 is a name for the
// collection of legacy Split*() functions declared later in this file. Most of
// the Split1 functions follow a set of conventions that don't necessarily match
// the conventions used in Split2. The following are some of the important
// differences between Split1 and Split2:
//
// Split1 -> Split2
// ----------------
// Append -> Assign:
// The Split1 functions all returned their output collections via a pointer to
// an out parameter as is typical in Google code. In some cases the comments
// explicitly stated that results would be *appended* to the output
// collection. In some cases it was ambiguous whether results were appended.
// This ambiguity is gone in the Split2 API as results are always assigned to
// the output collection, never appended.
//
// AnyOf -> Literal:
// Most Split1 functions treated their delimiter argument as a string of
// individual byte delimiters. For example, a delimiter of ",;" would split on
// "," and ";", not the substring ",;". This behavior is equivalent to the
// Split2 delimiter strings::delimiter::AnyOf, which is *not* the default. By
// default, strings::Split() splits using strings::delimiter::Literal() which
// would treat the whole string ",;" as a single delimiter string.
//
// SkipEmpty -> allow empty:
// Most Split1 functions omitted empty substrings in the results. To keep
// empty substrings one would have to use an explicitly named
// Split*AllowEmpty() function. This behavior is reversed in Split2. By
// default, strings::Split() *allows* empty substrings in the output. To skip
// them, use the strings::SkipEmpty predicate.
//
// string -> user's choice:
// Most Split1 functions return collections of string objects. Some return
// char*, but the type returned is dictated by each Split1 function. With
// Split2 the caller can choose which string-like object to return. (Note:
// char* C-strings are not supported in Split2--use StringPiece instead).
//
// Definitions of the main Split() function.
template <typename Delimiter>
inline internal::Splitter<Delimiter> Split(StringPiece text, Delimiter d) {
  // Bundle the input text and the caller-supplied Delimiter object into the
  // Splitter that is handed back to the caller.
  typedef internal::Splitter<Delimiter> SplitterType;
  return SplitterType(text, d);
}
template <typename Delimiter, typename Predicate>
inline internal::Splitter<Delimiter, Predicate> Split(
    StringPiece text, Delimiter d, Predicate p) {
  // Same as the two-argument form, with the filtering predicate `p` carried
  // along inside the returned Splitter.
  typedef internal::Splitter<Delimiter, Predicate> SplitterType;
  return SplitterType(text, d, p);
}
namespace delimiter {
// A Delimiter object represents a single separator, such as a character,
// literal string, or regular expression. A Delimiter object must have the
// following member:
//
// StringPiece Find(StringPiece text);
//
// This Find() member function should return a StringPiece referring to the next
// occurrence of the represented delimiter within the given string text. If no
// delimiter is found in the given text, a zero-length StringPiece referring to
// text.end() should be returned (e.g., StringPiece(text.end(), 0)). It is
// important that the returned StringPiece always be within the bounds of the
// StringPiece given as an argument--it must not refer to a string that is
// physically located outside of the given string. The following example is a
// simple Delimiter object that is created with a single char and will look for
// that char in the text given to the Find() function:
//
// struct SimpleDelimiter {
// const char c_;
// explicit SimpleDelimiter(char c) : c_(c) {}
// StringPiece Find(StringPiece text) {
// int pos = text.find(c_);
// if (pos == StringPiece::npos) return StringPiece(text.end(), 0);
// return StringPiece(text, pos, 1);
// }
// };
// Represents a literal string delimiter. Examples:
//
// using ::strings::delimiter::Literal;
// vector<string> v = strings::Split("a=>b=>c", Literal("=>"));
// assert(v.size() == 3);
// assert(v[0] == "a");
// assert(v[1] == "b");
// assert(v[2] == "c");
//
// The next example uses the empty string as a delimiter.
//
// using ::strings::delimiter::Literal;
// vector<string> v = strings::Split("abc", Literal(""));
// assert(v.size() == 3);
// assert(v[0] == "a");
// assert(v[1] == "b");
// assert(v[2] == "c");
//
class Literal {
public:
// Builds a delimiter from the literal text `sp`. Marked explicit; Split() and
// Limit() accept plain strings through dedicated overloads in this header
// rather than through implicit conversion to Literal.
explicit Literal(StringPiece sp);
// Delimiter-concept entry point: reports where this delimiter occurs in
// `text`. Per the comment on AnyOf in this header, Literal matches with
// StringPiece::find(). The definition is not in this header.
StringPiece Find(StringPiece text) const;
private:
// Held as a `const string` rather than a StringPiece, so the object stores
// its own string. Presumably initialized from `sp` by the constructor --
// TODO confirm in the implementation file.
const string delimiter_;
};
// Represents a delimiter that will match any of the given byte-sized
// characters. AnyOf is similar to Literal, except that AnyOf uses
// StringPiece::find_first_of() and Literal uses StringPiece::find(). AnyOf
// examples:
//
// using ::strings::delimiter::AnyOf;
// vector<string> v = strings::Split("a,b=c", AnyOf(",="));
//
// assert(v.size() == 3);
// assert(v[0] == "a");
// assert(v[1] == "b");
// assert(v[2] == "c");
//
// If AnyOf is given the empty string, it behaves exactly like Literal and
// matches each individual character in the input string.
//
// Note: The string passed to AnyOf is assumed to be a string of single-byte
// ASCII characters. AnyOf does not work with multi-byte characters.
class AnyOf {
public:
// Builds a delimiter from `sp`, where each byte of `sp` is meant to act as a
// separate single-character delimiter. Marked explicit.
explicit AnyOf(StringPiece sp);
// Delimiter-concept entry point: reports where one of the delimiter bytes
// occurs in `text`. Per the class comment, matching uses
// StringPiece::find_first_of(). The definition is not in this header.
StringPiece Find(StringPiece text) const;
private:
// Held as a `const string` rather than a StringPiece, so the object stores
// its own string. Presumably initialized from `sp` by the constructor --
// TODO confirm in the implementation file.
const string delimiters_;
};
// Wraps another delimiter and sets a max number of matches for that delimiter.
// Create LimitImpls using the Limit() function. Example:
//
// using ::strings::delimiter::Limit;
// vector<string> v = strings::Split("a,b,c,d", Limit(",", 2));
//
// assert(v.size() == 3); // Split on 2 commas, giving a vector with 3 items
// assert(v[0] == "a");
// assert(v[1] == "b");
// assert(v[2] == "c,d");
//
template <typename Delimiter>
class LimitImpl {
 public:
  // Wraps `delimiter` so that at most `limit` Find() calls are forwarded to
  // it. A negative `limit` imposes no limit at all.
  LimitImpl(Delimiter delimiter, int limit)
      : delimiter_(std::move(delimiter)), limit_(limit), count_(0) {}

  // Forwards to the wrapped delimiter for the first `limit_` calls. Every
  // later call reports "no match" by returning a zero-length StringPiece at
  // text.end(). Each forwarded call counts against the limit, whether or not
  // the wrapped delimiter actually found a match.
  StringPiece Find(StringPiece text) {
    if (limit_ >= 0) {
      // Saturating check: once the limit has been reached it stays reached.
      // (Testing `count_++ == limit_` would let the counter run past the
      // limit and start forwarding again on the following call.)
      if (count_ >= limit_) {
        return StringPiece(text.end(), 0);  // No more matches.
      }
      ++count_;
    }
    // With a negative limit nothing is counted, so the counter can never
    // overflow no matter how many times Find() is called.
    return delimiter_.Find(text);
  }

 private:
  Delimiter delimiter_;  // The wrapped delimiter that does the real matching.
  const int limit_;      // Maximum number of forwarded calls; < 0 = unlimited.
  int count_;            // Number of calls forwarded so far (<= limit_).
};
// Overloaded Limit() function to create LimitImpl<> objects. Uses the Delimiter
// Literal as the default if string-like objects are passed as the delimiter
// parameter. This is similar to the overloads for Split() below.
template <typename Delimiter>
inline LimitImpl<Delimiter> Limit(Delimiter delim, int limit) {
  // `delim` is already a by-value copy owned by this call, and LimitImpl's
  // constructor takes its delimiter by value too, so move it along instead of
  // copying it a second time.
  return LimitImpl<Delimiter>(std::move(delim), limit);
}
inline LimitImpl<Literal> Limit(const char* s, int limit) {
  // A C string is treated as a Literal delimiter before being limited.
  typedef LimitImpl<Literal> LimitedLiteral;
  return LimitedLiteral(Literal(s), limit);
}
inline LimitImpl<Literal> Limit(const string& s, int limit) {
  // A string is treated as a Literal delimiter before being limited.
  typedef LimitImpl<Literal> LimitedLiteral;
  return LimitedLiteral(Literal(s), limit);
}
inline LimitImpl<Literal> Limit(StringPiece s, int limit) {
  // A StringPiece is treated as a Literal delimiter before being limited.
  typedef LimitImpl<Literal> LimitedLiteral;
  return LimitedLiteral(Literal(s), limit);
}
} // namespace delimiter
//
// Predicates are functors that return bool indicating whether the given
// StringPiece should be included in the split output. If the predicate returns
// false then the string will be excluded from the output from strings::Split().
//
// Always returns true, indicating that all strings--including empty
// strings--should be included in the split output. This predicate is not
// strictly needed because this is the default behavior of the strings::Split()
// function. But it might be useful at some call sites to make the intent
// explicit.
//
// vector<string> v = Split(" a , ,,b,", ",", AllowEmpty());
// EXPECT_THAT(v, ElementsAre(" a ", " ", "", "b", ""));
struct AllowEmpty {
  bool operator()(StringPiece sp) const {
    static_cast<void>(sp);  // Intentionally unused: every piece is kept.
    return true;
  }
};
// Returns false if the given StringPiece is empty, indicating that the
// strings::Split() API should omit the empty string.
//
// vector<string> v = Split(" a , ,,b,", ",", SkipEmpty());
// EXPECT_THAT(v, ElementsAre(" a ", " ", "b"));
struct SkipEmpty {
  bool operator()(StringPiece sp) const {
    // An empty piece is rejected; anything else is kept.
    if (sp.empty()) {
      return false;
    }
    return true;
  }
};
// Returns false if the given StringPiece is empty or contains only whitespace,
// indicating that the strings::Split() API should omit the string.
//
// vector<string> v = Split(" a , ,,b,", ",", SkipWhitespace());
// EXPECT_THAT(v, ElementsAre(" a ", "b"));
struct SkipWhitespace {
bool operator()(StringPiece sp) const {
StripWhiteSpace(&sp);
return !sp.empty();
}
};
// Split() function overloads to effectively give Split() a default Delimiter
// type of Literal. If Split() is called and a string is passed as the delimiter
// instead of an actual Delimiter object, then one of these overloads will be
// invoked and will create a Splitter<Literal> with the delimiter string.
//
// Since Split() is a function template above, these overload signatures need to
// be explicit about the string type so they match better than the templated
// version. These functions are overloaded for:
//
// - const char*
// - const string&
// - StringPiece
inline internal::Splitter<delimiter::Literal> Split(
    StringPiece text, const char* delimiter) {
  // A C-string delimiter is wrapped in a Literal delimiter object.
  typedef delimiter::Literal LiteralDelimiter;
  typedef internal::Splitter<LiteralDelimiter> SplitterType;
  return SplitterType(text, LiteralDelimiter(delimiter));
}
inline internal::Splitter<delimiter::Literal> Split(
    StringPiece text, const string& delimiter) {
  // A string delimiter is wrapped in a Literal delimiter object.
  typedef delimiter::Literal LiteralDelimiter;
  typedef internal::Splitter<LiteralDelimiter> SplitterType;
  return SplitterType(text, LiteralDelimiter(delimiter));
}
inline internal::Splitter<delimiter::Literal> Split(
    StringPiece text, StringPiece delimiter) {
  // A StringPiece delimiter is wrapped in a Literal delimiter object.
  typedef delimiter::Literal LiteralDelimiter;
  typedef internal::Splitter<LiteralDelimiter> SplitterType;
  return SplitterType(text, LiteralDelimiter(delimiter));
}
// Same overloads as above, but also including a Predicate argument.
template <typename Predicate>
inline internal::Splitter<delimiter::Literal, Predicate> Split(
    StringPiece text, const char* delimiter, Predicate p) {
  // C-string delimiter wrapped in a Literal, plus the filtering predicate.
  typedef delimiter::Literal LiteralDelimiter;
  typedef internal::Splitter<LiteralDelimiter, Predicate> SplitterType;
  return SplitterType(text, LiteralDelimiter(delimiter), p);
}
template <typename Predicate>
inline internal::Splitter<delimiter::Literal, Predicate> Split(
    StringPiece text, const string& delimiter, Predicate p) {
  // String delimiter wrapped in a Literal, plus the filtering predicate.
  typedef delimiter::Literal LiteralDelimiter;
  typedef internal::Splitter<LiteralDelimiter, Predicate> SplitterType;
  return SplitterType(text, LiteralDelimiter(delimiter), p);
}
template <typename Predicate>
inline internal::Splitter<delimiter::Literal, Predicate> Split(
    StringPiece text, StringPiece delimiter, Predicate p) {
  // StringPiece delimiter wrapped in a Literal, plus the filtering predicate.
  typedef delimiter::Literal LiteralDelimiter;
  typedef internal::Splitter<LiteralDelimiter, Predicate> SplitterType;
  return SplitterType(text, LiteralDelimiter(delimiter), p);
}
} // namespace strings
//
// ==================== LEGACY SPLIT FUNCTIONS ====================
//
// NOTE: The instruction below creates a Module titled
// GlobalSplitFunctions within the auto-generated Doxygen documentation.
// This instruction is needed to expose global functions that are not
// within a namespace.
//
// START DOXYGEN SplitFunctions grouping
/* @defgroup SplitFunctions
* @{ */
// ----------------------------------------------------------------------
// ClipString
// Clip a string to a max length. We try to clip on a word boundary
// if this is possible. If the string is clipped, we append an
// ellipsis.
//
// ***NOTE***
// ClipString counts length with strlen. If you have non-ASCII
// strings like UTF-8, this is wrong. If you are displaying the
// clipped strings to users in a frontend, consider using
// ClipStringOnWordBoundary in
// webserver/util/snippets/rewriteboldtags, which considers the width
// of the string, not just the number of bytes.
//
// TODO(user) Move ClipString back to strutil. The problem with this is
// that ClipStringHelper is used behind the scenes by SplitStringToLines, but
// probably shouldn't be exposed in the .h files.
// ----------------------------------------------------------------------
// str:     buffer to clip; it is modified through the non-const pointer and
//          its length is measured with strlen.
// max_len: maximum length to clip to.
void ClipString(char* str, int max_len);
// ----------------------------------------------------------------------
// ClipString
// Version of ClipString() that uses string instead of char*.
// NOTE: See comment above.
// ----------------------------------------------------------------------
// full_str: string to clip; it is modified through the pointer.
// max_len:  maximum length to clip to.
void ClipString(string* full_str, int max_len);
// ----------------------------------------------------------------------
// SplitStringToLines() Split a string into lines of maximum length
// 'max_len'. Append the resulting lines to 'result'. Will attempt
// to split on word boundaries. If 'num_lines'
// is zero it splits up the whole string regardless of length. If
// 'num_lines' is positive, it returns at most num_lines lines, and
// appends a "..." to the end of the last line if the string is too
// long to fit completely into 'num_lines' lines.
// ----------------------------------------------------------------------
// full:      text to split into lines.
// max_len:   maximum length of each produced line.
// num_lines: 0 splits the whole string; a positive value caps the number of
//            lines produced.
// result:    output vector; the produced lines are appended to it.
void SplitStringToLines(const char* full,
int max_len,
int num_lines,
vector<string>* result);
// ----------------------------------------------------------------------
// SplitOneStringToken()
// Returns the first "delim" delimited string from "*source" and modifies
// *source to point after the delimiter that was found. If no delimiter is
// found, *source is set to NULL.
//
// If the start of *source is a delimiter, an empty string is returned.
// If *source is NULL, an empty string is returned.
//
// "delim" is treated as a sequence of 1 or more character delimiters. Any one
// of the characters present in "delim" is considered to be a single
// delimiter; The delimiter is not "delim" as a whole. For example:
//
// const char* s = "abc=;de";
// string r = SplitOneStringToken(&s, ";=");
// // r = "abc"
// // s points to ";de"
// ----------------------------------------------------------------------
// source: in/out cursor into the text. It is moved past the delimiter that
//         ended the returned token, or set to NULL when no delimiter is found.
// delim:  set of single-character delimiters (not one multi-character token).
// Returns the token that precedes the first delimiter.
string SplitOneStringToken(const char** source, const char* delim);
// ----------------------------------------------------------------------
// SplitUsing()
// Split a string into substrings based on the nul-terminated list
// of bytes at delimiters (uses strsep) and return a vector of
// those strings. Modifies 'full'. We allocate the return vector,
// and you should free it. Note that empty fields are ignored.
// Use SplitToVector with last argument 'false' if you want the
// empty fields.
// ----------------------------------------------------------------------
// full:       buffer to split; it is modified by the call.
// delimiters: nul-terminated list of delimiter bytes.
// Returns a vector allocated by this function; the caller is expected to free
// it. Empty fields are not included.
vector<char*>* SplitUsing(char* full, const char* delimiters);
// ----------------------------------------------------------------------
// SplitToVector()
// Split a string into substrings based on the nul-terminated list
// of bytes at 'delimiters' (uses strsep) and appends the split
// strings to 'vec'. Modifies "full". If 'omit_empty_strings' is
// true, empty strings are omitted from the resulting vector.
// ----------------------------------------------------------------------
// full:               buffer to split; it is modified by the call.
// delimiters:         nul-terminated list of delimiter bytes.
// vec:                output vector; the split strings are appended to it.
// omit_empty_strings: when true, empty strings are left out of 'vec'.
// The two overloads differ only in the element type of 'vec'
// (char* versus const char*).
void SplitToVector(char* full, const char* delimiters,
vector<char*>* vec,
bool omit_empty_strings);
void SplitToVector(char* full, const char* delimiters,
vector<const char*>* vec,
bool omit_empty_strings);
// ----------------------------------------------------------------------
// SplitStringPieceToVector
// Split a StringPiece into sub-StringPieces based on the
// nul-terminated list of bytes at delim and appends the
// pieces to 'vec'. If 'omit_empty_strings' is true, empty strings
// are omitted from the resulting vector.
// Expects the original string (from which 'full' is derived) to exist
// for the full lifespan of 'vec'.
// ----------------------------------------------------------------------
// full:               piece to split; the string it was derived from must
//                     stay alive for as long as 'vec' is used.
// delim:              nul-terminated list of delimiter bytes.
// vec:                output vector; the sub-pieces are appended to it.
// omit_empty_strings: when true, empty pieces are left out of 'vec'.
void SplitStringPieceToVector(const StringPiece& full,
const char* delim,
vector<StringPiece>* vec,
bool omit_empty_strings);
// ----------------------------------------------------------------------
// SplitStringUsing()
// SplitStringToHashsetUsing()
// SplitStringToSetUsing()
// SplitStringToMapUsing()
// SplitStringToHashmapUsing()
// Splits a string using one or more byte delimiters, presented as a
// nul-terminated c string. Append the components to 'result'. If there are
// consecutive delimiters, this function skips over all of them: in other words,
// empty components are dropped. If you want to keep empty components, try
// SplitStringAllowEmpty().
//
// NOTE: Do not use this for multi-byte delimiters such as UTF-8 strings. Use
// strings::Split() with strings::delimiter::Literal as the delimiter.
//
// ==> NEW API: Consider using the new Split API defined above. <==
// Example:
//
// using strings::SkipEmpty;
// using strings::Split;
// using strings::delimiter::AnyOf;
//
// vector<string> v = Split(full, AnyOf(delimiter), SkipEmpty());
//
// For even better performance, store the result in a vector<StringPiece>
// to avoid string copies.
// ----------------------------------------------------------------------
// full:       string to split.
// delimiters: nul-terminated C string; each byte is a separate delimiter.
// result:     output collection that receives the components; empty
//             components are dropped.
// The three functions differ only in the container type of 'result'.
void SplitStringUsing(const string& full, const char* delimiters,
vector<string>* result);
void SplitStringToHashsetUsing(const string& full, const char* delimiters,
std::unordered_set<string>* result);
void SplitStringToSetUsing(const string& full, const char* delimiters,
set<string>* result);
// The even-positioned (0-based) components become the keys for the
// odd-positioned components that follow them. When there is an odd
// number of components, the value for the last key will be unchanged
// if the key was already present in the hash table, or will be the
// empty string if the key is a newly inserted key.
// full:   string to split.
// delim:  nul-terminated C string of delimiter bytes.
// result: output map; each even-positioned component is stored as the key of
//         the odd-positioned component that follows it.
void SplitStringToMapUsing(const string& full, const char* delim,
map<string, string>* result);
void SplitStringToHashmapUsing(const string& full, const char* delim,
std::unordered_map<string, string>* result);
// ----------------------------------------------------------------------
// SplitStringAllowEmpty()
//
// Split a string using one or more byte delimiters, presented as a
// nul-terminated c string. Append the components to 'result'. If there are
// consecutive delimiters, this function will return corresponding empty
// strings. If you want to drop the empty strings, try SplitStringUsing().
//
// If "full" is the empty string, yields an empty string as the only value.
//
// ==> NEW API: Consider using the new Split API defined above. <==
//
// using strings::Split;
// using strings::delimiter::AnyOf;
//
// vector<string> v = Split(full, AnyOf(delimiter));
//
// For even better performance, store the result in a vector<StringPiece> to
// avoid string copies.
// ----------------------------------------------------------------------
// full:   string to split; an empty 'full' yields a single empty string.
// delim:  nul-terminated C string; each byte is a separate delimiter.
// result: output vector; components, including empty ones, are appended.
void SplitStringAllowEmpty(const string& full, const char* delim,
vector<string>* result);
// ----------------------------------------------------------------------
// SplitStringWithEscaping()
// SplitStringWithEscapingAllowEmpty()
// SplitStringWithEscapingToSet()
// SplitStringWithEscapingToHashset()
// Split the string using the specified delimiters, taking escaping into
// account. '\' is not allowed as a delimiter.
//
// Within the string, preserve a delimiter preceded by a backslash as a
// literal delimiter. In addition, preserve two consecutive backslashes as
// a single literal backslash. Do not unescape any other backslash-character
// sequence.
//
// Eg. 'foo\=bar=baz\\qu\ux' split on '=' becomes ('foo=bar', 'baz\qu\ux')
//
// All versions other than "AllowEmpty" discard any empty substrings.
// ----------------------------------------------------------------------
// full:       string to split; a backslash in front of a delimiter or another
//             backslash marks that character as literal.
// delimiters: set of delimiter characters; '\' is not allowed as a delimiter.
// result:     output collection that receives the pieces. Only the
//             AllowEmpty variant keeps empty pieces.
// The four functions differ in the container type of 'result' and in whether
// empty pieces are kept.
void SplitStringWithEscaping(const string& full,
const strings::CharSet& delimiters,
vector<string>* result);
void SplitStringWithEscapingAllowEmpty(const string& full,
const strings::CharSet& delimiters,
vector<string>* result);
void SplitStringWithEscapingToSet(const string& full,
const strings::CharSet& delimiters,
set<string>* result);
void SplitStringWithEscapingToHashset(const string& full,
const strings::CharSet& delimiters,
std::unordered_set<string>* result);
// ----------------------------------------------------------------------
// SplitStringIntoNPiecesAllowEmpty()
// Split a string using a nul-terminated list of byte
// delimiters. Append the components to 'result'. If there are
// consecutive delimiters, this function will return corresponding
// empty strings. The string is split into at most the specified
// number of pieces greedily. This means that the last piece may
// possibly be split further. To split into as many pieces as
// possible, specify 0 as the number of pieces.
//
// If "full" is the empty string, yields an empty string as the only value.
// ----------------------------------------------------------------------
// full:       string to split; an empty 'full' yields a single empty string.
// delimiters: nul-terminated list of delimiter bytes.
// pieces:     maximum number of pieces to produce; 0 means as many as
//             possible.
// result:     output vector; components, including empty ones, are appended.
void SplitStringIntoNPiecesAllowEmpty(const string& full,
const char* delimiters,
int pieces,
vector<string>* result);
// ----------------------------------------------------------------------
// SplitStringAndParse()
// SplitStringAndParseToContainer()
// SplitStringAndParseToList()
// Split a string using a nul-terminated list of character
// delimiters. For each component, parse using the provided
// parsing function and if successful, append it to 'result'.
// Return true if and only if all components parse successfully.
// If there are consecutive delimiters, this function skips over
// all of them. This function will correctly handle parsing
// strings that have embedded \0s.
//
// SplitStringAndParse fills into a vector.
// SplitStringAndParseToContainer fills into any container that implements
// a single-argument insert function. (i.e. insert(const value_type& x) ).
// SplitStringAndParseToList fills into any container that implements a single-
// argument push_back function (i.e. push_back(const value_type& x) ), plus
// value_type& back() and pop_back().
// NOTE: This implementation relies on parsing in-place into the "back()"
// reference, so its performance may depend on the efficiency of back().
//
// Example Usage:
// vector<double> values;
// CHECK(SplitStringAndParse("1.0,2.0,3.0", ",", &safe_strtod, &values));
// CHECK_EQ(3, values.size());
//
// vector<int64> values;
// CHECK(SplitStringAndParse("1M,2M,3M", ",",
// &HumanReadableNumBytes::ToInt64, &values));
// CHECK_EQ(3, values.size());
//
// set<int64> values;
// CHECK(SplitStringAndParseToContainer("3,1,1,2", ",",
// &safe_strto64, &values));
// CHECK_EQ(3, values.size()); // the duplicate "1" collapses in a set
//
// deque<int64> values;
// CHECK(SplitStringAndParseToList("3,1,1,2", ",", &safe_strto64, &values));
// CHECK_EQ(4, values.size());
// ----------------------------------------------------------------------
// source: text to split; it may contain embedded \0 bytes.
// delim:  list of delimiter characters; runs of consecutive delimiters are
//         skipped.
// parse:  callback that converts one component into a value and reports
//         success through its bool return.
// result: output container; each successfully parsed value is added to it
//         (vector, insert()-style container, or push_back()-style list,
//         depending on the function).
// Returns true if and only if every component parsed successfully.
template <class T>
bool SplitStringAndParse(StringPiece source, StringPiece delim,
bool (*parse)(const string& str, T* value),
vector<T>* result);
template <class Container>
bool SplitStringAndParseToContainer(
StringPiece source, StringPiece delim,
bool (*parse)(const string& str, typename Container::value_type* value),
Container* result);
template <class List>
bool SplitStringAndParseToList(
StringPiece source, StringPiece delim,
bool (*parse)(const string& str, typename List::value_type* value),
List* result);
// ----------------------------------------------------------------------
// SplitRange()
// Splits a string of the form "<from>-<to>". Either or both can be
// missing. A raw number (<to>) is interpreted as "<to>-". Modifies
// parameters insofar as they're specified by the string. RETURNS
// true iff the input is a well-formed range. If it RETURNS false,
// from and to remain unchanged. The range in rangestr should be
// terminated either by "\0" or by whitespace.
// ----------------------------------------------------------------------
bool SplitRange(const char* rangestr, int* from, int* to);
// ----------------------------------------------------------------------
// SplitCSVLineWithDelimiter()
// CSV lines come in many guises. There's the Comma Separated Values
// variety, in which fields are separated by (surprise!) commas. There's
// also the tab-separated values variant, in which tabs separate the
// fields. This routine handles both, which makes it almost like
// SplitUsing(line, delimiter), but for some special processing. For both
// delimiters, whitespace is trimmed from either side of the field value.
// If the delimiter is ',', we play additional games with quotes. A
// field value surrounded by double quotes is allowed to contain commas,
// which are not treated as field separators. Within a double-quoted
// string, a series of two double quotes signals an escaped single double
// quote. It'll be clearer in the examples.
// Example:
// Google , x , "Buchheit, Paul", "string with "" quote in it"
// --> [Google], [x], [Buchheit, Paul], [string with " quote in it]
//
// SplitCSVLine()
// A convenience wrapper around SplitCSVLineWithDelimiter which uses
// ',' as the delimiter.
//
// The following variants of SplitCSVLine() are not recommended for new code.
// Please consider the CSV parser in //util/csv as an alternative. Examples:
// To parse a single line:
// #include "util/csv/parser.h"
// vector<string> fields = util::csv::ParseLine(line).fields();
//
// To parse an entire file:
// #include "util/csv/parser.h"
// for (Record rec : Parser(source)) {
// vector<string> fields = rec.fields();
// }
//
// See //util/csv/parser.h for more complete documentation.
//
// ----------------------------------------------------------------------
void SplitCSVLine(char* line, vector<char*>* cols);
void SplitCSVLineWithDelimiter(char* line, char delimiter,
vector<char*>* cols);
// SplitCSVLine string wrapper that internally makes a copy of string line.
void SplitCSVLineWithDelimiterForStrings(const string& line, char delimiter,
vector<string>* cols);
// ----------------------------------------------------------------------
// SplitStructuredLine()
// Splits a line using the given delimiter, and places the columns
// into 'cols'. This is unlike 'SplitUsing(line, ",")' because you can
// define pairs of opening closing symbols inside which the delimiter should
// be ignored. If the symbol_pair string has an odd number of characters,
// the last character (which cannot be paired) will be assumed to be both an
// opening and closing symbol.
// WARNING : The input string 'line' is destroyed in the process.
// The function returns 0 if the line was parsed correctly (i.e all the
// opened braces had their closing braces) otherwise, it returns the position
// of the error.
// Example:
// SplitStructuredLine("item1,item2,{subitem1,subitem2},item4,[5,{6,7}]",
// ',',
// "{}[]", &output)
// --> output = { "item1", "item2", "{subitem1,subitem2}", "item4",
// "[5,{6,7}]" }
// Example2: trying to split "item1,[item2,{4,5],5}" will fail and the
// function will return the position of the problem : ]
//
// ----------------------------------------------------------------------
char* SplitStructuredLine(char* line,
char delimiter,
const char* symbol_pairs,
vector<char*>* cols);
// Similar to the function with the same name above, but splits a StringPiece
// into StringPiece parts. Returns true if successful.
bool SplitStructuredLine(StringPiece line,
char delimiter,
const char* symbol_pairs,
vector<StringPiece>* cols);
// ----------------------------------------------------------------------
// SplitStructuredLineWithEscapes()
// Like SplitStructuredLine but also allows characters to be escaped.
//
// WARNING: the escape characters will be replicated in the output
// columns rather than being consumed, i.e. if {} were the opening and
// closing symbols, using \{ to quote a curly brace in the middle of
// an option would pass this unchanged.
//
// Example:
// SplitStructuredLineWithEscapes(
// "\{item1\},it\\em2,{\{subitem1\},sub\\item2},item4\,item5,[5,{6,7}]",
// ',',
// "{}[]",
// &output)
// --> output = { "\{item1\}", "it\\em2", "{\{subitem1\},sub\\item2}",
// "item4\,item5", "[5,{6,7}]" }
//
// ----------------------------------------------------------------------
char* SplitStructuredLineWithEscapes(char* line,
char delimiter,
const char* symbol_pairs,
vector<char*>* cols);
// Similar to the function with the same name above, but splits a StringPiece
// into StringPiece parts. Returns true if successful.
bool SplitStructuredLineWithEscapes(StringPiece line,
char delimiter,
const char* symbol_pairs,
vector<StringPiece>* cols);
// ----------------------------------------------------------------------
// DEPRECATED(jgm): See the "NEW API" comment about this function below for
// example code showing an alternative.
//
// SplitStringIntoKeyValues()
// Split a line into a key string and a vector of value strings. The line has
// the following format:
//
// <key><kvsep>+<vvsep>*<value1><vvsep>+<value2><vvsep>+<value3>...<vvsep>*
//
// where key and value are strings; */+ means zero/one or more; <kvsep> is
// a delimiter character to separate key and value; and <vvsep> is a delimiter
// character to separate between values. The user can specify a bunch of
// delimiter characters using a string. For example, if the user specifies
// the separator string as "\t ", then either ' ' or '\t' or any combination
// of them will be treated as separator. For <vvsep>, the user can specify an
// empty string to indicate there is only one value.
//
// Note: this function assumes the input string begins exactly with a
// key. Therefore, if you use whitespaces to separate key and value, you
// should not let whitespace precede the key in the input. Otherwise, you
// will get an empty string as the key.
//
// A line with no <kvsep> will return an empty string as the key, even if
// <key> is non-empty!
//
// The syntax makes it impossible for a value to be the empty string.
// It is possible for the number of values to be zero.
//
// Returns false if the line has no <kvsep> or if the number of values is
// zero.
//
// ==> NEW API: Consider using the new Split API defined above. <==
//
// The SplitStringIntoKeyValues() function has some subtle and surprising
// semantics in various corner cases. To avoid this the strings::Split API is
// recommended. The following example shows how to split a string of delimited
// key-value pairs into a vector of pairs using the strings::Split API.
//
// using strings::Split;
// using strings::delimiter::AnyOf;
// using strings::delimiter::Limit;
//
// pair<string, StringPiece> key_values =
// Split(line, Limit(AnyOf(kv_delim), 1));
// string key = key_values.first;
// vector<string> values = Split(key_values.second, AnyOf(vv_delim));
//
// ----------------------------------------------------------------------
bool SplitStringIntoKeyValues(const string& line,
const string& key_value_delimiters,
const string& value_value_delimiters,
string* key, vector<string>* values);
// ----------------------------------------------------------------------
// SplitStringIntoKeyValuePairs()
// Split a line into a vector of <key, value> pairs. The line has
// the following format:
//
// <kvpsep>*<key1><kvsep>+<value1><kvpsep>+<key2><kvsep>+<value2>...<kvpsep>*
//
// Where key and value are strings; */+ means zero/one or more. <kvsep> is
// a delimiter character to separate key and value and <kvpsep> is a delimiter
// character to separate key value pairs. The user can specify a bunch of
// delimiter characters using a string.
//
// Note: this function assumes each key-value pair begins exactly with a
// key. Therefore, if you use whitespaces to separate key and value, you
// should not let whitespace precede the key in the pair. Otherwise, you
// will get an empty string as the key.
//
// A pair with no <kvsep> will return empty strings as the key and value,
// even if <key> is non-empty!
//
// Returns false for pairs with no <kvsep> specified and for pairs with
// empty strings as values.
//
// ==> NEW API: Consider using the new Split API defined above. <==
//
// The SplitStringIntoKeyValuePairs() function has some subtle and surprising
// semantics in various corner cases. To avoid this the strings::Split API is
// recommended. The following example shows how to split a string of delimited
// key-value pairs into a vector of pairs using the strings::Split API.
//
// using strings::SkipEmpty;
// using strings::Split;
// using strings::delimiter::AnyOf;
// using strings::delimiter::Limit;
//
// vector<pair<string, string>> pairs; // or even map<string, string>
// for (StringPiece sp : Split(line, AnyOf(pair_delim), SkipEmpty())) {
// pairs.push_back(Split(sp, Limit(AnyOf(kv_delim), 1), SkipEmpty()));
// }
//
// ----------------------------------------------------------------------
bool SplitStringIntoKeyValuePairs(const string& line,
const string& key_value_delimiters,
const string& key_value_pair_delimiters,
vector<pair<string, string> >* kv_pairs);
// ----------------------------------------------------------------------
// SplitLeadingDec32Values()
// SplitLeadingDec64Values()
// A simple parser for space-separated decimal int32/int64 values.
// Appends parsed integers to the end of the result vector, stopping
// at the first unparsable spot. Skips past leading and repeated
// whitespace (does not consume trailing whitespace), and returns
// a pointer beyond the last character parsed.
// --------------------------------------------------------------------
const char* SplitLeadingDec32Values(const char* next, vector<int32>* result);
const char* SplitLeadingDec64Values(const char* next, vector<int64>* result);
// ----------------------------------------------------------------------
// SplitOneIntToken()
// SplitOneInt32Token()
// SplitOneUint32Token()
// SplitOneInt64Token()
// SplitOneUint64Token()
// SplitOneDoubleToken()
// SplitOneFloatToken()
// Parse a single "delim" delimited number from "*source" into "*value".
// Modify *source to point after the delimiter.
// If no delimiter is present after the number, set *source to NULL.
//
// If the start of *source is not a number, return false.
// If the int is followed by the null character, return true.
// If the int is not followed by a character from delim, return false.
// If *source is NULL, return false.
//
// They cannot handle decimal numbers with leading 0s, since they will be
// treated as octal.
// ----------------------------------------------------------------------
bool SplitOneIntToken(const char** source, const char* delim,
int* value);
bool SplitOneInt32Token(const char** source, const char* delim,
int32* value);
bool SplitOneUint32Token(const char** source, const char* delim,
uint32* value);
bool SplitOneInt64Token(const char** source, const char* delim,
int64* value);
bool SplitOneUint64Token(const char** source, const char* delim,
uint64* value);
bool SplitOneDoubleToken(const char** source, const char* delim,
double* value);
bool SplitOneFloatToken(const char** source, const char* delim,
float* value);
// Some aliases, so that the function names are standardized against the names
// of the reflection setters/getters in proto2. This makes it easier to use
// certain macros with reflection when creating custom text formats for protos.
// Alternate spelling ("UInt32") of SplitOneUint32Token(). Forwards every
// argument unchanged and returns that function's result.
inline bool SplitOneUInt32Token(const char** source, const char* delim,
                                uint32* value) {
  const bool parsed = SplitOneUint32Token(source, delim, value);
  return parsed;
}
// Alternate spelling ("UInt64") of SplitOneUint64Token(). Forwards every
// argument unchanged and returns that function's result.
inline bool SplitOneUInt64Token(const char** source, const char* delim,
                                uint64* value) {
  const bool parsed = SplitOneUint64Token(source, delim, value);
  return parsed;
}
// ----------------------------------------------------------------------
// SplitOneDecimalIntToken()
// SplitOneDecimalInt32Token()
// SplitOneDecimalUint32Token()
// SplitOneDecimalInt64Token()
// SplitOneDecimalUint64Token()
// Parse a single "delim"-delimited number from "*source" into "*value".
// Unlike SplitOneIntToken, etc., this function always interprets
// the numbers as decimal.
bool SplitOneDecimalIntToken(const char** source, const char* delim,
int* value);
bool SplitOneDecimalInt32Token(const char** source, const char* delim,
int32* value);
bool SplitOneDecimalUint32Token(const char** source, const char* delim,
uint32* value);
bool SplitOneDecimalInt64Token(const char** source, const char* delim,
int64* value);
bool SplitOneDecimalUint64Token(const char** source, const char* delim,
uint64* value);
// ----------------------------------------------------------------------
// SplitOneHexUint32Token()
// SplitOneHexUint64Token()
// Once more, for hexadecimal numbers (unsigned only).
bool SplitOneHexUint32Token(const char** source, const char* delim,
uint32* value);
bool SplitOneHexUint64Token(const char** source, const char* delim,
uint64* value);
// ######################## TEMPLATE DEFINITIONS BELOW ########################
// SplitStringAndParse() -- vector flavor; see the full description above.
// Delegates to SplitStringAndParseToList() and returns its result.
template <class T>
bool SplitStringAndParse(StringPiece source, StringPiece delim,
                         bool (*parse)(const string& str, T* value),
                         vector<T>* result) {
  const bool all_parsed =
      SplitStringAndParseToList(source, delim, parse, result);
  return all_parsed;
}
namespace strings {
namespace internal {
// Generic split-and-parse loop parameterized on how values are inserted.
//
// Splits 'source' on any character in 'delim' (empty pieces are skipped),
// runs 'parse' on a string copy of each piece, and passes every successfully
// parsed value to 'insert_policy(result, value)'. A piece that fails to parse
// is not inserted, but the remaining pieces are still processed.
//
// Returns true only if every piece parsed successfully.
// CHECK-fails if 'parse', 'result' or delim.data() is NULL, or if 'delim' is
// empty.
template <class Container, class InsertPolicy>
bool SplitStringAndParseToInserter(
    StringPiece source, StringPiece delim,
    bool (*parse)(const string& str, typename Container::value_type* value),
    Container* result, InsertPolicy insert_policy) {
  CHECK(NULL != parse);
  CHECK(NULL != result);
  CHECK(NULL != delim.data());
  CHECK_GT(delim.size(), 0);

  vector<StringPiece> tokens = strings::Split(
      source, strings::delimiter::AnyOf(delim), strings::SkipEmpty());

  bool all_parsed = true;
  for (const StringPiece& token : tokens) {
    // A fresh value per token: a failed parse never leaks into 'result'.
    typename Container::value_type parsed;
    if (!parse(token.as_string(), &parsed)) {
      all_parsed = false;
      continue;
    }
    insert_policy(result, parsed);
  }
  return all_parsed;
}
// Cannot use output iterator here (e.g. std::inserter, std::back_inserter)
// because some callers use non-standard containers that don't have iterators,
// only an insert() or push_back() method.
// Insertion policy for containers that expose a single-argument insert():
// invoking the policy calls container->insert(value).
struct BasicInsertPolicy {
  template <class ContainerT, class ValueT>
  void operator()(ContainerT* container, const ValueT& value) const {
    container->insert(value);
  }
};
// Insertion policy for containers that expose a single-argument push_back():
// invoking the policy calls container->push_back(value).
struct BackInsertPolicy {
  template <class ContainerT, class ValueT>
  void operator()(ContainerT* container, const ValueT& value) const {
    container->push_back(value);
  }
};
} // namespace internal
} // namespace strings
// SplitStringAndParseToContainer() -- see the full description above.
// Parsed values are added to 'result' through its single-argument insert().
template <class Container>
bool SplitStringAndParseToContainer(
    StringPiece source, StringPiece delim,
    bool (*parse)(const string& str, typename Container::value_type* value),
    Container* result) {
  typedef strings::internal::BasicInsertPolicy Policy;
  const bool all_parsed = strings::internal::SplitStringAndParseToInserter(
      source, delim, parse, result, Policy());
  return all_parsed;
}
// SplitStringAndParseToList() -- see the full description above.
// Parsed values are appended to 'result' through its push_back().
template <class List>
bool SplitStringAndParseToList(
    StringPiece source, StringPiece delim,
    bool (*parse)(const string& str, typename List::value_type* value),
    List* result) {
  typedef strings::internal::BackInsertPolicy Policy;
  const bool all_parsed = strings::internal::SplitStringAndParseToInserter(
      source, delim, parse, result, Policy());
  return all_parsed;
}
// END DOXYGEN SplitFunctions grouping
/* @} */
#endif // STRINGS_SPLIT_H_