blob: 3221a6bc9d16fbf89e3fa4aa7ba9c7053655c86a [file] [log] [blame]
/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz),
// morlovich@google.com (Maksim Orlovich)
#ifndef PAGESPEED_KERNEL_BASE_SYMBOL_TABLE_H_
#define PAGESPEED_KERNEL_BASE_SYMBOL_TABLE_H_
#include <cstddef>
#include <list>
#include <vector>
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/atom.h"
#include "pagespeed/kernel/base/string_hash.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/dense_hash_map.h"
namespace net_instaweb {
// Implements a generic symbol table, allowing for case-sensitive
// and case insensitive versions. The elements of SymbolTables are
// Atoms. Atoms are created by Interning strings.
//
// Atoms are cheap and are passed around by value, not by reference or
// pointer. Atoms can be compared to one another via ==. A const char*
// can be extracted from an Atom via ==.
//
// Atoms are memory-managed by the symbol table from which they came.
// When the symbol table is destroyed, so are all the Atoms that
// were interned in it.
//
// Care should be taken not to attempt to compare Atoms created from
// multiple symbol tables.
//
// TODO(jmarantz): Symbol tables are not currently thread-safe. We
// should consider whether it's worth making them thread-safe, or
// whether it's better to use separate symbol tables in each thread.
template<class CharTransform> class SymbolTable {
public:
SymbolTable();
~SymbolTable() { Clear(); }
// Remove all symbols in the table, invalidating any Atoms that
// were previously Interned.
void Clear();
// Remember a string in the table, returning it as an Atom.
Atom Intern(const StringPiece& src);
// Returns the number of bytes allocated on behalf of the data,
// excluding any overhead added by the symbol table.
size_t string_bytes_allocated() const { return string_bytes_allocated_; }
private:
// StringPiece equality aware of CharTransform
struct Comparator {
bool operator()(const StringPiece& key_a, const StringPiece& key_b) const {
if (key_a.length() == key_b.length()) {
const char* a = key_a.data();
const char* b = key_b.data();
const char* a_end = a + key_a.length();
while (a < a_end) {
if (CharTransform::Normalize(*a) != CharTransform::Normalize(*b)) {
return false;
}
++a;
++b;
}
return true;
} else {
return false;
}
}
};
struct Hash {
size_t operator()(const StringPiece& key) const {
return HashString<CharTransform, size_t>(key.data(), key.length());
}
};
typedef dense_hash_map<StringPiece, StringPiece*,
Hash, Comparator> SymbolMap;
SymbolMap string_map_;
// Since we don't want to have Atom include both base and size, it keeps
// a StringPiece*, meaning that SymbolTable must keep StringPiece's at
// stable location. This manages the location, and string_map_ points at it.
std::list<StringPiece> pieces_;
// Allocates a new chunk of storage.
inline void NewStorage();
// Keep a vector of char* as simple pooled allocator. Since we have no
// mechanism to free an individual string -- only the entire symbol table
// can be cleared -- we can allocate by bumping a pointer pretty cheaply.
//
// Each element of 'storage_' contains a large character buffer, and
// next_ptr is a pointer into that buffer. We implicitly know how
// much is used by subtracting next_ptr-storage_.back(), and we know
// how much is left because we know how big each storage_ element is.
//
// To intern large strings above some threshold, 25% of the string-buffer
// size, we just allocate them directly with malloc and put them into
// the storage_ array in the second-to-last position. The only reason
// to put them in the storage_ array is to ensure the large strings
// are reclaimed along with the aggregated small-string storage buffers.
std::vector<char*> storage_;
char* next_ptr_; // Used for bump-pointer pooled allocation of strings.
size_t string_bytes_allocated_;
DISALLOW_COPY_AND_ASSIGN(SymbolTable);
};
typedef SymbolTable<CaseFold> SymbolTableInsensitive;
typedef SymbolTable<CasePreserve> SymbolTableSensitive;
} // namespace net_instaweb
#endif // PAGESPEED_KERNEL_BASE_SYMBOL_TABLE_H_