blob: 898ae57439616283796d2ff27be968eb6e69bcb5 [file] [log] [blame]
// Copyright 2010 Google Inc. All Rights Reserved.
// Refactored from contributions of various authors in strings/strutil.cc
//
// This file contains string processing functions related to
// numeric values.
#include "gutil/strings/numbers.h"
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <float.h> // for DBL_DIG and FLT_DIG
#include <math.h> // for HUGE_VAL
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <inttypes.h>
#include <sys/types.h>
#include <limits>
#include <ostream>
#include "common/exception.h"
using std::numeric_limits;
#include <string>
using std::string;
#include <fmt/compile.h>
#include <fmt/format.h>
#include "common/logging.h"
#include "gutil/integral_types.h"
#include "gutil/strings/ascii_ctype.h"
#include "gutil/strtoint.h"
namespace {
// Represents integer values of digits.
// Uses 36 to indicate an invalid character since we support
// bases up to 36.
static const int8 kAsciiToInt[256] = {
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, // 16 36s.
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 36, 36,
36, 36, 36, 36, 36, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26,
27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36, 36, 36, 36, 10, 11, 12, 13, 14, 15, 16,
17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36,
36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36, 36};
// Input format based on POSIX.1-2008 strtol
// http://pubs.opengroup.org/onlinepubs/9699919799/functions/strtol.html
template <typename IntType>
bool safe_int_internal(const char* start, const char* end, int base, IntType* value_p) {
// Consume whitespace.
while (start < end && ascii_isspace(start[0])) {
++start;
}
while (start < end && ascii_isspace(end[-1])) {
--end;
}
if (start >= end) {
return false;
}
// Consume sign.
const bool negative = (start[0] == '-');
if (negative || start[0] == '+') {
++start;
if (start >= end) {
return false;
}
}
// Consume base-dependent prefix.
// base 0: "0x" -> base 16, "0" -> base 8, default -> base 10
// base 16: "0x" -> base 16
// Also validate the base.
if (base == 0) {
if (end - start >= 2 && start[0] == '0' && (start[1] == 'x' || start[1] == 'X')) {
base = 16;
start += 2;
} else if (end - start >= 1 && start[0] == '0') {
base = 8;
start += 1;
} else {
base = 10;
}
} else if (base == 16) {
if (end - start >= 2 && start[0] == '0' && (start[1] == 'x' || start[1] == 'X')) {
start += 2;
}
} else if (base >= 2 && base <= 36) {
// okay
} else {
return false;
}
// Consume digits.
//
// The classic loop:
//
// for each digit
// value = value * base + digit
// value *= sign
//
// The classic loop needs overflow checking. It also fails on the most
// negative integer, -2147483648 in 32-bit two's complement representation.
//
// My improved loop:
//
// if (!negative)
// for each digit
// value = value * base
// value = value + digit
// else
// for each digit
// value = value * base
// value = value - digit
//
// Overflow checking becomes simple.
//
// I present the positive code first for easier reading.
IntType value = 0;
if (!negative) {
const IntType vmax = std::numeric_limits<IntType>::max();
assert(vmax > 0);
assert(vmax >= base);
const IntType vmax_over_base = vmax / base;
// loop over digits
// loop body is interleaved for perf, not readability
for (; start < end; ++start) {
unsigned char c = static_cast<unsigned char>(start[0]);
int digit = kAsciiToInt[c];
if (value > vmax_over_base) return false;
value *= base;
if (digit >= base) return false;
if (value > vmax - digit) return false;
value += digit;
}
} else {
const IntType vmin = std::numeric_limits<IntType>::min();
assert(vmin < 0);
assert(vmin <= 0 - base);
IntType vmin_over_base = vmin / base;
// 2003 c++ standard [expr.mul]
// "... the sign of the remainder is implementation-defined."
// Although (vmin/base)*base + vmin%base is always vmin.
// 2011 c++ standard tightens the spec but we cannot rely on it.
if (vmin % base > 0) {
vmin_over_base += 1;
}
// loop over digits
// loop body is interleaved for perf, not readability
for (; start < end; ++start) {
unsigned char c = static_cast<unsigned char>(start[0]);
int digit = kAsciiToInt[c];
if (value < vmin_over_base) return false;
value *= base;
if (digit >= base) return false;
if (value < vmin + digit) return false;
value -= digit;
}
}
// Store output.
*value_p = value;
return true;
}
} // anonymous namespace
bool safe_strto32_base(const char* startptr, const int buffer_size, int32* v, int base) {
return safe_int_internal<int32>(startptr, startptr + buffer_size, base, v);
}
bool safe_strto64_base(const char* startptr, const int buffer_size, int64* v, int base) {
return safe_int_internal<int64>(startptr, startptr + buffer_size, base, v);
}
bool safe_strto32(const char* startptr, const int buffer_size, int32* value) {
return safe_int_internal<int32>(startptr, startptr + buffer_size, 10, value);
}
bool safe_strto64(const char* startptr, const int buffer_size, int64* value) {
return safe_int_internal<int64>(startptr, startptr + buffer_size, 10, value);
}
bool safe_strto32_base(const char* str, int32* value, int base) {
char* endptr;
errno = 0; // errno only gets set on errors
*value = strto32(str, &endptr, base);
if (endptr != str) {
while (ascii_isspace(*endptr)) ++endptr;
}
return *str != '\0' && *endptr == '\0' && errno == 0;
}
bool safe_strto64_base(const char* str, int64* value, int base) {
char* endptr;
errno = 0; // errno only gets set on errors
*value = strto64(str, &endptr, base);
if (endptr != str) {
while (ascii_isspace(*endptr)) ++endptr;
}
return *str != '\0' && *endptr == '\0' && errno == 0;
}
bool safe_strtou32_base(const char* str, uint32* value, int base) {
// strtoul does not give any errors on negative numbers, so we have to
// search the string for '-' manually.
while (ascii_isspace(*str)) ++str;
if (*str == '-') return false;
char* endptr;
errno = 0; // errno only gets set on errors
*value = strtou32(str, &endptr, base);
if (endptr != str) {
while (ascii_isspace(*endptr)) ++endptr;
}
return *str != '\0' && *endptr == '\0' && errno == 0;
}
bool safe_strtou64_base(const char* str, uint64* value, int base) {
// strtou64 does not give any errors on negative numbers, so we have to
// search the string for '-' manually.
while (ascii_isspace(*str)) ++str;
if (*str == '-') return false;
char* endptr;
errno = 0; // errno only gets set on errors
*value = strtou64(str, &endptr, base);
if (endptr != str) {
while (ascii_isspace(*endptr)) ++endptr;
}
return *str != '\0' && *endptr == '\0' && errno == 0;
}
// ----------------------------------------------------------------------
// u64tostr_base36()
// Converts unsigned number to string representation in base-36.
// --------------------------------------------------------------------
size_t u64tostr_base36(uint64 number, size_t buf_size, char* buffer) {
CHECK_GT(buf_size, 0);
CHECK(buffer);
static const char kAlphabet[] = "0123456789abcdefghijklmnopqrstuvwxyz";
buffer[buf_size - 1] = '\0';
size_t result_size = 1;
do {
if (buf_size == result_size) { // Ran out of space.
return 0;
}
int remainder = number % 36;
number /= 36;
buffer[buf_size - result_size - 1] = kAlphabet[remainder];
result_size++;
} while (number);
memmove(buffer, buffer + buf_size - result_size, result_size);
return result_size - 1;
}
// Generate functions that wrap safe_strtoXXX_base.
#define GEN_SAFE_STRTO(name, type) \
bool name##_base(const string& str, type* value, int base) { \
return name##_base(str.c_str(), value, base); \
} \
bool name(const char* str, type* value) { return name##_base(str, value, 10); } \
bool name(const string& str, type* value) { return name##_base(str.c_str(), value, 10); }
GEN_SAFE_STRTO(safe_strto32, int32);
GEN_SAFE_STRTO(safe_strtou32, uint32);
GEN_SAFE_STRTO(safe_strto64, int64);
GEN_SAFE_STRTO(safe_strtou64, uint64);
#undef GEN_SAFE_STRTO
bool safe_strtof(const char* str, float* value) {
char* endptr;
#ifdef _MSC_VER // has no strtof()
*value = strtod(str, &endptr);
#else
*value = strtof(str, &endptr);
#endif
if (endptr != str) {
while (ascii_isspace(*endptr)) ++endptr;
}
// Ignore range errors from strtod/strtof.
// The values it returns on underflow and
// overflow are the right fallback in a
// robust setting.
return *str != '\0' && *endptr == '\0';
}
bool safe_strtod(const char* str, double* value) {
char* endptr;
*value = strtod(str, &endptr);
if (endptr != str) {
while (ascii_isspace(*endptr)) ++endptr;
}
// Ignore range errors from strtod. The values it
// returns on underflow and overflow are the right
// fallback in a robust setting.
return *str != '\0' && *endptr == '\0';
}
bool safe_strtof(const string& str, float* value) {
return safe_strtof(str.c_str(), value);
}
bool safe_strtod(const string& str, double* value) {
return safe_strtod(str.c_str(), value);
}
// ----------------------------------------------------------------------
// SimpleDtoa()
// SimpleFtoa()
// DoubleToBuffer()
// FloatToBuffer()
// We want to print the value without losing precision, but we also do
// not want to print more digits than necessary. This turns out to be
// trickier than it sounds. Numbers like 0.2 cannot be represented
// exactly in binary. If we print 0.2 with a very large precision,
// e.g. "%.50g", we get "0.2000000000000000111022302462515654042363167".
// On the other hand, if we set the precision too low, we lose
// significant digits when printing numbers that actually need them.
// It turns out there is no precision value that does the right thing
// for all numbers.
//
// Our strategy is to first try printing with a precision that is never
// over-precise, then parse the result with strtod() to see if it
// matches. If not, we print again with a precision that will always
// give a precise result, but may use more digits than necessary.
//
// An arguably better strategy would be to use the algorithm described
// in "How to Print Floating-Point Numbers Accurately" by Steele &
// White, e.g. as implemented by David M. Gay's dtoa(). It turns out,
// however, that the following implementation is about as fast as
// DMG's code. Furthermore, DMG's code locks mutexes, which means it
// will not scale well on multi-core machines. DMG's code is slightly
// more accurate (in that it will never use more digits than
// necessary), but this is probably irrelevant for most users.
//
// Rob Pike and Ken Thompson also have an implementation of dtoa() in
// third_party/fmt/fltfmt.cc. Their implementation is similar to this
// one in that it makes guesses and then uses strtod() to check them.
// Their implementation is faster because they use their own code to
// generate the digits in the first place rather than use snprintf(),
// thus avoiding format string parsing overhead. However, this makes
// it considerably more complicated than the following implementation,
// and it is embedded in a larger library. If speed turns out to be
// an issue, we could re-implement this in terms of their
// implementation.
// ----------------------------------------------------------------------
int DoubleToBuffer(double value, int width, char* buffer) {
// DBL_DIG is 15 for IEEE-754 doubles, which are used on almost all
// platforms these days. Just in case some system exists where DBL_DIG
// is significantly larger -- and risks overflowing our buffer -- we have
// this assert.
COMPILE_ASSERT(DBL_DIG < 20, DBL_DIG_is_too_big);
int snprintf_result = snprintf(buffer, width, "%.*g", DBL_DIG, value);
// The snprintf should never overflow because the buffer is significantly
// larger than the precision we asked for.
DCHECK(snprintf_result > 0 && snprintf_result < width);
if (strtod(buffer, nullptr) != value) {
snprintf_result = snprintf(buffer, width, "%.*g", DBL_DIG + 2, value);
// Should never overflow; see above.
DCHECK(snprintf_result > 0 && snprintf_result < width);
}
return snprintf_result;
}
int FloatToBuffer(float value, int width, char* buffer) {
// FLT_DIG is 6 for IEEE-754 floats, which are used on almost all
// platforms these days. Just in case some system exists where FLT_DIG
// is significantly larger -- and risks overflowing our buffer -- we have
// this assert.
COMPILE_ASSERT(FLT_DIG < 10, FLT_DIG_is_too_big);
int snprintf_result = snprintf(buffer, width, "%.*g", FLT_DIG, value);
// The snprintf should never overflow because the buffer is significantly
// larger than the precision we asked for.
DCHECK(snprintf_result > 0 && snprintf_result < width);
float parsed_value;
if (!safe_strtof(buffer, &parsed_value) || parsed_value != value) {
snprintf_result = snprintf(buffer, width, "%.*g", FLT_DIG + 2, value);
// Should never overflow; see above.
DCHECK(snprintf_result > 0 && snprintf_result < width);
}
return snprintf_result;
}
int FastDoubleToBuffer(double value, char* buffer) {
auto end = fmt::format_to(buffer, FMT_COMPILE("{}"), value);
*end = '\0';
return end - buffer;
}
int FastFloatToBuffer(float value, char* buffer) {
auto* end = fmt::format_to(buffer, FMT_COMPILE("{}"), value);
*end = '\0';
return end - buffer;
}
// ----------------------------------------------------------------------
// SimpleItoaWithCommas()
// Description: converts an integer to a string.
// Puts commas every 3 spaces.
// Faster than printf("%d")?
//
// Return value: string
// ----------------------------------------------------------------------
char* SimpleItoaWithCommas(int64_t i, char* buffer, int32_t buffer_size) {
// 19 digits, 6 commas, and sign are good for 64-bit or smaller ints.
char* p = buffer + buffer_size;
// Need to use uint64 instead of int64 to correctly handle
// -9,223,372,036,854,775,808.
uint64 n = i;
if (i < 0) n = 0 - n;
*--p = '0' + n % 10; // this case deals with the number "0"
n /= 10;
while (n) {
*--p = '0' + n % 10;
n /= 10;
if (n == 0) break;
*--p = '0' + n % 10;
n /= 10;
if (n == 0) break;
*--p = ',';
*--p = '0' + n % 10;
n /= 10;
// For this unrolling, we check if n == 0 in the main while loop
}
if (i < 0) *--p = '-';
return p;
}
char* SimpleItoaWithCommas(__int128_t i, char* buffer, int32_t buffer_size) {
// 39 digits, 12 commas, and sign are good for 128-bit or smaller ints.
char* p = buffer + buffer_size;
// Need to use uint128 instead of int128 to correctly handle
// -170,141,183,460,469,231,731,687,303,715,884,105,728.
__uint128_t n = i;
if (i < 0) n = 0 - n;
*--p = '0' + n % 10; // this case deals with the number "0"
n /= 10;
while (n) {
*--p = '0' + n % 10;
n /= 10;
if (n == 0) break;
*--p = '0' + n % 10;
n /= 10;
if (n == 0) break;
*--p = ',';
*--p = '0' + n % 10;
n /= 10;
// For this unrolling, we check if n == 0 in the main while loop
}
if (i < 0) *--p = '-';
return p;
}