blob: b6b70134f4ada83d642a63cff98723b1b8102491 [file] [log] [blame]
// Copyright 2008 Google Inc. All Rights Reserved.
#ifndef STRINGS_CHARSET_H_
#define STRINGS_CHARSET_H_
#include "gutil/integral_types.h"
namespace strings {
// A CharSet is a simple map from (1-byte) characters to Booleans. It simply
// exposes the mechanism of checking if a given character is in the set, fairly
// efficiently. Useful for string tokenizing routines.
//
// Run on asherah (2 X 2400 MHz CPUs); 2008/11/10-13:18:03
// CPU: Intel Core2 (2 cores) dL1:32KB dL2:4096KB
// ***WARNING*** CPU scaling is enabled, the benchmark timings may be noisy,
// Benchmark Time(ns) CPU(ns) Iterations
// -------------------------------------------------------
// BM_CharSetTesting/1K 21 21 32563138
// BM_CharSetTesting/4K 21 21 31968433
// BM_CharSetTesting/32K 21 21 32114953
// BM_CharSetTesting/256K 22 22 31679082
// BM_CharSetTesting/1M 21 21 32563138
//
// This class is thread-compatible.
//
// This class has an implicit constructor.
// Style guide exception granted:
// http://goto/style-guide-exception-20978288
class CharSet {
public:
// Initialize a CharSet containing no characters or the given set of
// characters, respectively.
CharSet();
// Deliberately an implicit constructor, so anything that takes a CharSet
// can also take an explicit list of characters.
CharSet(const char* characters); // NOLINT(runtime/explicit)
explicit CharSet(const CharSet& other);
// Add or remove a character from the set.
void Add(unsigned char c) { bits_[Word(c)] |= BitMask(c); }
void Remove(unsigned char c) { bits_[Word(c)] &= ~BitMask(c); }
// Return true if this character is in the set
bool Test(unsigned char c) const { return bits_[Word(c)] & BitMask(c); }
private:
// The numbers below are optimized for 64-bit hardware. TODO(user): In the
// future, we should change this to use uword_t and do various bits of magic
// to calculate the numbers at compile time.
// In general,
// static const int kNumWords = max(32 / sizeof(uword_t), 1);
uint64 bits_[4];
// 4 words => the high 2 bits of c are the word number. In general,
// kShiftValue = 8 - log2(kNumWords)
static int Word(unsigned char c) { return c >> 6; }
// And the value we AND with c is ((1 << shift value) - 1)
// static const int kLowBitsMask = (256 / kNumWords) - 1;
static uint64 BitMask(unsigned char c) {
uint64 mask = 1;
return mask << (c & 0x3f);
}
};
} // namespace strings
#endif // STRINGS_CHARSET_H_