/*
 * Copyright 2012 Google Inc.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

// Author: jmaessen@google.com (Jan-Willem Maessen)

#include "pagespeed/kernel/base/fast_wildcard_group.h"

#include <algorithm>
#include <vector>

#include "base/logging.h"
#include "pagespeed/kernel/base/atomic_int32.h"
#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/rolling_hash.h"
#include "pagespeed/kernel/base/stl_util.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
#include "pagespeed/kernel/base/wildcard.h"

namespace net_instaweb {

namespace {

// Maximum rolling hash window size.
const int32 kMaxRollingHashWindow = 256;

// Special index value for an unused hash table entry.
const int kNoEntry = -1;
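
// Returns the longest run of literal (non-wildcard) characters in wildcard's
// spec. The result is empty if the pattern consists entirely of wildcard
// metacharacters.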
StringPiece LongestLiteralStringInWildcard(const Wildcard* wildcard) {
  StringPiece spec = wildcard->spec();
  const char kWildcardChars[] = { Wildcard::kMatchAny, Wildcard::kMatchOne };
  StringPiece wildcard_chars(kWildcardChars, arraysize(kWildcardChars));
  int longest_pos = 0;
  int longest_len = 0;
  int next_wildcard = 0;
  for (int pos = 0;
       pos < static_cast<int>(spec.size()); pos = next_wildcard + 1) {
    next_wildcard = spec.find_first_of(wildcard_chars, pos);
    if (next_wildcard == static_cast<int>(StringPiece::npos)) {
      next_wildcard = spec.size();
    }
    int len = next_wildcard - pos;
    if (len > longest_len) {
      longest_pos = pos;
      longest_len = len;
    }
  }
  return spec.substr(longest_pos, longest_len);
}

}  // namespace

FastWildcardGroup::~FastWildcardGroup() {
  Clear();
}
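
// Discards any compiled matching metadata, returning the group to the
// uncompiled state. Called whenever the set of patterns changes.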
void FastWildcardGroup::Uncompile() {
  if (rolling_hash_length_.value() == kUncompiled) {
    return;
  }
  rolling_hash_length_.set_value(kUncompiled);
  rolling_hashes_.clear();
  effective_indices_.clear();
  wildcard_only_indices_.clear();
  pattern_hash_index_.clear();
}

void FastWildcardGroup::Clear() {
  Uncompile();
  STLDeleteElements(&wildcards_);
  allow_.clear();
}
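
// Returns a mutable reference to the hash table slot for rolling_hash; used
// both when probing and when inserting pattern indices.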
inline int& FastWildcardGroup::pattern_hash_index(uint64 rolling_hash) const {
  // We exploit the fact that pattern_hash_index_.size() is a power of two
  // to do integer modulus by bit masking.
  return pattern_hash_index_[rolling_hash & (pattern_hash_index_.size() - 1)];
}
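
// Builds the Rabin-Karp matching metadata: the per-pattern rolling hashes,
// the effective indices, the list of all-wildcard patterns, and the hash
// table mapping a rolling hash to a pattern index. If fewer than kMinPatterns
// patterns have a non-empty literal substring, it leaves rolling_hash_length_
// at kDontHash so Match() keeps scanning the patterns naively.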
void FastWildcardGroup::CompileNonTrivial() const {
  // First, assemble the longest literal string of each pattern.
  std::vector<StringPiece> longest_literal_strings;
  int num_nontrivial_patterns = 0;
  int32 rolling_hash_length = kMaxRollingHashWindow;
  for (int i = 0; i < static_cast<int>(wildcards_.size()); ++i) {
    longest_literal_strings.push_back(
        LongestLiteralStringInWildcard(wildcards_[i]));
    DCHECK_EQ(i + 1, static_cast<int>(longest_literal_strings.size()));
    int length = longest_literal_strings[i].size();
    if (length > 0) {
      ++num_nontrivial_patterns;
      rolling_hash_length = std::min(rolling_hash_length, length);
    }
  }
  if (num_nontrivial_patterns < kMinPatterns) {
    // Not enough non-trivial patterns; leave rolling_hash_length_ at kDontHash
    // so Match() scans the patterns naively.
    DCHECK_EQ(kDontHash, rolling_hash_length_.value());
    return;
  }

  // Allocate a hash table that is power-of-2 sized and has at least
  // 2 * num_nontrivial_patterns slots, so the table stays at most half full.
  int hash_index_size;
  for (hash_index_size = 8;
       hash_index_size < 2 * num_nontrivial_patterns;
       hash_index_size *= 2) { }
  pattern_hash_index_.resize(hash_index_size, kNoEntry);
  rolling_hashes_.resize(wildcards_.size());
  effective_indices_.resize(allow_.size());
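
  // effective_indices_[i] is the index of the last pattern in the run of
  // consecutive patterns containing i that all share allow_[i]. A match
  // against pattern i therefore yields the same decision as a match against
  // pattern effective_indices_[i], which lets Match() skip every pattern at
  // or below the best effective index it has seen so far. A hypothetical
  // example: for Allow(a), Allow(b), Disallow(c), Allow(d), allow_ is
  // {1, 1, 0, 1} and effective_indices_ is {1, 1, 2, 3}.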
  int current_effective_index = allow_.size() - 1;
  bool current_allow = allow_[current_effective_index];
  // Fill in the hash table with a rolling hash. We do this in
  // reverse order so that collisions will result in the later
  // pattern being matched first (if that succeeds, no further
  // matching will be required).
  for (int i = longest_literal_strings.size() - 1; i >= 0; --i) {
    const StringPiece literal(longest_literal_strings[i]);
    if (allow_[i] != current_allow) {
      // Change from allow to deny or vice versa;
      // change the current effective index and allow state.
      current_effective_index = i;
      current_allow = allow_[i];
    }
    effective_indices_[i] = current_effective_index;
    DCHECK_LE(i, current_effective_index);
    DCHECK_EQ(allow_[i], current_allow);
    DCHECK_EQ(current_allow, allow_[effective_indices_[i]]);
    if (literal.empty()) {
      // All-wildcard pattern.
      wildcard_only_indices_.push_back(i);
      rolling_hashes_[i] = 0;
    } else {
      DCHECK_GE(static_cast<int>(literal.size()), rolling_hash_length);
      // If possible, find a non-colliding rolling hash taken from literal. If
      // the first hash collides, using a different hash is OK; we'll still end
      // up checking both matches in the table for an input that matches both.
      // The goal is to avoid chaining by spreading the entries out across the
      // table.
      // TODO(jmaessen): Consider re-hashing the current entry as well on
      // collision to favor collision-free hash tables.
      int max_start = literal.size() - rolling_hash_length;
      int start = 0;
      uint64 rolling_hash =
          RollingHash(literal.data(), start, rolling_hash_length);
      for (start = 1;
           start <= max_start && pattern_hash_index(rolling_hash) != kNoEntry;
           ++start) {
        rolling_hash = NextRollingHash(literal.data(), start,
                                       rolling_hash_length, rolling_hash);
      }
      // Now insert the entry, resolving any remaining collisions by linear
      // probing.
      rolling_hashes_[i] = rolling_hash;
      while (pattern_hash_index(rolling_hash) != kNoEntry) {
        ++rolling_hash;
      }
      pattern_hash_index(rolling_hash) = i;
    }
  }

  // Finally, after all the metadata is initialized, make rolling_hash_length
  // visible to the world. This has release semantics, meaning that if another
  // thread reads rolling_hash_length_ (with acquire semantics) and gets the
  // value we set here, it is guaranteed to see all the preceding writes we did
  // to the other compilation metadata.
  rolling_hash_length_.set_value(rolling_hash_length);
}
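
// Compiles the matching metadata. Only the caller that wins the
// kUncompiled -> kDontHash transition in Match() invokes this; other Match()
// callers fall back to naive matching until the final value of
// rolling_hash_length_ becomes visible.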
void FastWildcardGroup::Compile() const {
  // Basic invariant.
  CHECK_EQ(wildcards_.size(), allow_.size());
  // Make sure we don't have cruft left around from a previous compile.
  CHECK_EQ(0, static_cast<int>(rolling_hashes_.size()));
  CHECK_EQ(0, static_cast<int>(effective_indices_.size()));
  CHECK_EQ(0, static_cast<int>(wildcard_only_indices_.size()));
  CHECK_EQ(0, static_cast<int>(pattern_hash_index_.size()));
  CHECK_EQ(kDontHash, rolling_hash_length_.value());
  if (static_cast<int>(wildcards_.size()) >= kMinPatterns) {
    // Slow path: compute the metadata and set rolling_hash_length_ to its
    // final value.
    CompileNonTrivial();
  }
  // When we're done, things should be in a sensible state.
  int32 rolling_hash_length = rolling_hash_length_.value();
  DCHECK_NE(kUncompiled, rolling_hash_length);
  if (rolling_hash_length == kDontHash) {
    DCHECK_EQ(0, static_cast<int>(rolling_hashes_.size()));
    DCHECK_EQ(0, static_cast<int>(effective_indices_.size()));
    DCHECK_EQ(0, static_cast<int>(wildcard_only_indices_.size()));
    DCHECK_EQ(0, static_cast<int>(pattern_hash_index_.size()));
  } else {
    DCHECK_LT(0, rolling_hash_length);
    DCHECK_EQ(wildcards_.size(), rolling_hashes_.size());
    DCHECK_EQ(wildcards_.size(), effective_indices_.size());
    int hash_pats = wildcards_.size() - wildcard_only_indices_.size();
    DCHECK_LE(kMinPatterns, hash_pats);
    DCHECK_LE(2 * hash_pats, static_cast<int>(pattern_hash_index_.size()));
  }
}
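
// Appends a pattern whose match allows the string. When several patterns
// match, the most recently added one (via Allow() or Disallow()) determines
// the result.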
void FastWildcardGroup::Allow(const StringPiece& expr) {
  Uncompile();
  Wildcard* wildcard = new Wildcard(expr);
  wildcards_.push_back(wildcard);
  allow_.push_back(true);
}
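
// Appends a pattern whose match disallows the string.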
void FastWildcardGroup::Disallow(const StringPiece& expr) {
  Uncompile();
  Wildcard* wildcard = new Wildcard(expr);
  wildcards_.push_back(wildcard);
  allow_.push_back(false);
}
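
// Returns the allow (true) / disallow (false) state of the most recently
// added pattern that matches str, or the supplied default allow if nothing
// matches. Lazily compiles the group on first use; concurrent Match() calls
// issued while compilation is in progress fall back to naive matching.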
bool FastWildcardGroup::Match(const StringPiece& str, bool allow) const {
  int32 rolling_hash_length = rolling_hash_length_.value();
  // The previous read has acquire semantics, and all writes to
  // rolling_hash_length_ have release semantics. This means we'll see the
  // results of compilation if rolling_hash_length > 0.
  //
  // NOTE: it is unsafe (and expensive) to just CompareAndSwap (CAS) here.
  // AtomicInt32::CAS guarantees release semantics but not acquire semantics.
  // As a result we would potentially miss the results of compilation released
  // by a prior write to rolling_hash_length_. This would cause us to read
  // inconsistent compilation metadata, possibly resulting in a crash.
  if (rolling_hash_length == kUncompiled) {
    if (rolling_hash_length_.CompareAndSwap(kUncompiled, kDontHash) ==
        kUncompiled) {
      // During compilation other Match attempts will see kDontHash
      // and will perform matching naively. Only the caller that
      // does the kUncompiled -> kDontHash transition is permitted
      // to compile.
      Compile();
    }
    // rolling_hash_length_ is no longer kUncompiled, either because of the
    // CAS above or because a call to Compile() finished. Re-acquire it so
    // that we can safely view the results of compilation so far.
    rolling_hash_length = rolling_hash_length_.value();
  }
  if (rolling_hash_length == kDontHash) {
    // The set of wildcards is small, or compilation was ongoing when we last
    // read rolling_hash_length_. Just match against each pattern in reverse
    // order (starting with the most recent, which overrides less recent
    // ones), returning as soon as a match succeeds.
    for (int i = wildcards_.size() - 1; i >= 0; --i) {
      if (wildcards_[i]->Match(str)) {
        return allow_[i];
      }
    }
    return allow;
  }

  int max_effective_index = kNoEntry;
  // Start by matching against all-wildcard patterns in reverse order. Their
  // indices are stored in reverse index order in wildcard_only_indices_, so
  // traverse it forwards. Stop if a match is found (since earlier matches will
  // have a smaller index and be overridden by the already-found match).
  // TODO(jmaessen): These patterns all devolve to a string length check (== or
  // >=). Consider optimizing them.
  for (int i = 0, sz = wildcard_only_indices_.size(); i < sz; ++i) {
    int index = wildcard_only_indices_[i];
    if (wildcards_[index]->Match(str)) {
      max_effective_index = effective_indices_[index];
      break;
    }
  }
  int exit_effective_index = wildcards_.size() - 1;
  int rolling_end = static_cast<int>(str.size()) - rolling_hash_length;
  if (max_effective_index < exit_effective_index && rolling_end >= 0) {
    // Do a Rabin-Karp rolling match through the string.
    uint64 rolling_hash = RollingHash(str.data(), 0, rolling_hash_length);
    // ofs uses signed arithmetic for a correct comparison with rolling_end
    // below.
    for (int ofs = 0;
         max_effective_index < exit_effective_index && ofs <= rolling_end; ) {
      // Look up rolling_hash in the table, stopping if we find either:
      //   1) a smaller index than max_effective_index, or
      //   2) a matching string (in which case update max_effective_index).
      // In either case, all subsequent hash matches will be overridden by
      // max_effective_index, so we need not search the bucket anymore. This is
      // guaranteed by the order of table insertion (largest index first)
      // and the fact that later smaller colliding entries are further along
      // in the linear probe. This is true even if intervening slots are
      // occupied by entries with completely different hash values - those
      // entries will still have larger indices as they were inserted earlier.
      for (uint64 probe = 0; ; ++probe) {
        // The following invariant (and thus loop termination) should be
        // guaranteed by the sparseness of pattern_hash_index_.
        DCHECK_GT(pattern_hash_index_.size(), probe);
        int index = pattern_hash_index(rolling_hash + probe);
        if (index <= max_effective_index) {
          // This also includes the kNoEntry case.
          break;
        }
        if (rolling_hash == rolling_hashes_[index] &&
            wildcards_[index]->Match(str)) {
          max_effective_index = effective_indices_[index];
          break;
        }
      }
      if (++ofs <= rolling_end) {
        rolling_hash = NextRollingHash(str.data(), ofs, rolling_hash_length,
                                       rolling_hash);
      }
    }
  }
  if (max_effective_index == kNoEntry) {
    return allow;
  } else {
    return allow_[max_effective_index];
  }
}
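
// Replaces this group's patterns with copies of src's patterns.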
void FastWildcardGroup::CopyFrom(const FastWildcardGroup& src) {
  Clear();
  AppendFrom(src);
}
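
// Appends duplicates of src's patterns (and their allow states) after this
// group's existing patterns.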
void FastWildcardGroup::AppendFrom(const FastWildcardGroup& src) {
  Uncompile();
  CHECK_EQ(src.wildcards_.size(), src.allow_.size());
  for (int i = 0, n = src.wildcards_.size(); i < n; ++i) {
    wildcards_.push_back(src.wildcards_[i]->Duplicate());
    allow_.push_back(src.allow_[i]);
  }
}
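
// Returns a string encoding the pattern sequence and allow states; for
// example, Allow("a*") followed by Disallow("b?") yields "a*A,b?D,".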
GoogleString FastWildcardGroup::Signature() const {
  GoogleString signature;
  for (int i = 0, n = wildcards_.size(); i < n; ++i) {
    StrAppend(&signature, wildcards_[i]->spec(), (allow_[i] ? "A" : "D"), ",");
  }
  return signature;
}

}  // namespace net_instaweb