| /* |
| * Copyright 2012 Google Inc. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| |
| // Author: jmaessen@google.com (Jan-Willem Maessen) |
| |
| #ifndef PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_ |
| #define PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_ |
| |
| #include <vector> |
| #include "pagespeed/kernel/base/atomic_int32.h" |
| #include "pagespeed/kernel/base/basictypes.h" |
| #include "pagespeed/kernel/base/string.h" |
| #include "pagespeed/kernel/base/string_util.h" |
| |
| namespace net_instaweb { |
| |
| class Wildcard; |
| |
| // This forms the basis of a wildcard selection mechanism, allowing |
| // a user to issue a sequence of commands like: |
| // |
| // 1. allow *.cc |
| // 2. allow *.h |
| // 3. disallow a*.h |
| // 4. allow ab*.h |
| // 5. disallow c*.cc |
| // |
| // This sequence would yield the following results: |
| // Match("x.cc") --> true due to rule #1 |
| // Match("c.cc") --> false due to rule #5 which overrides rule #1 |
| // Match("y.h") --> true due to rule #2 |
| // Match("a.h") --> false due to rule #3 which overrides rule #2 |
| // Match("ab.h") --> true due to rule #4 which overrides rule #3 |
| // So order matters. |
| // |
| // Note that concurrent calls to Match(...) are permitted, but modifications |
| // must not occur concurrently (as you would expect). |
| |
| /* A note on the algorithm used here: |
| |
| Wildcard matching uses an O(nm) string search algorithm, where m is pattern |
| length and n is string length (basically we search forward for first char in the |
| next pattern chunk, then attempt a match at that position). This is not the |
| asymptotically efficient O(n+m) as it ignores the effects of prefixes and |
| repeated substrings, but the wildcards that occur in PageSpeed tend to contain |
| chunks of diverse literals and so it's good enough in practice. |
| |
| WildcardGroup simply iterates through wildcards in the group, attempting to |
| match against each one in turn. |
| |
| In FastWildcardGroup we attempt a Rabin-Karp string match for a fixed-size |
| substring of each of the wildcards. We choose the largest possible substring |
| size for a given group (for a single wildcard pattern, this will be the length |
| of the longest literal in the pattern; for the group, it is the minimum such |
| length). Note that in the worst case this is a single character (we treat |
| all-wildcard patterns specially). We track the insertion index of the |
| latest-inserted matched pattern (so the first pattern in the set has index 0, |
| and initially our insertion index is -1). As in Rabin-Karp we traverse the |
| string using a rolling hash; when we encounter a hash match, we retrieve the |
| corresponding insertion index. If it's larger than our current insertion index |
| (the pattern would override), we retrieve the pattern and attempt to match the |
| whole string against it. If the match succeeds we update the insertion index. |
| Our return value is the corresponding "allow" status. |
| |
| We actually optimize this a little in two ways: rather than remembering the |
| insertion index, we actually remember the insertion index just before the next |
| change in "allow" status (the effective index). So for example, if we insert 10 |
| "allow" patterns in a row and then a single "deny" pattern, matching against the |
| first "allow" pattern means that we will subsequently check only against the |
| "deny" pattern. The second optimization builds on this: if the effective index |
| is the last pattern in the group (always true if the group is nothing but |
| "allow" or "deny" entries) then we can immediately return. |
| |
| We use a simple vector of indexes to store the hash table, dealing with |
| collisions by linear probing. Metadata (eg a cached hash) is stored with the |
| patterns. We make the table size >= 2x the number of patterns so that chains |
| don't get long, and all failed probes terminate in an empty bucket. |
| |
| */ |
| |
| class FastWildcardGroup { |
| public: |
| // Don't generate a hash unless there are this many non-wildcard-only |
| // patterns. Exposed for testing purposes (we can't use FRIEND_TEST here for |
| // open-source dependency reasons). |
| static const int kMinPatterns = 11; |
| |
| FastWildcardGroup() |
| : rolling_hash_length_(kUncompiled) { } |
| FastWildcardGroup(const FastWildcardGroup& src) |
| : rolling_hash_length_(kUncompiled) { |
| CopyFrom(src); |
| } |
| |
| ~FastWildcardGroup(); |
| FastWildcardGroup& operator=(const FastWildcardGroup& other) { |
| CopyFrom(other); |
| return *this; |
| } |
| |
| // Determines whether a string is allowed by the wildcard group. If none of |
| // the wildcards in the group matches, allow_by_default is returned. |
| bool Match(const StringPiece& str, bool allow_by_default) const; |
| |
| // Add an expression to Allow, potentially overriding previous calls to |
| // Disallow. |
| void Allow(const StringPiece& wildcard); |
| |
| // Add an expression to Disallow, potentially overriding previous calls to |
| // Allow. |
| void Disallow(const StringPiece& wildcard); |
| |
| void CopyFrom(const FastWildcardGroup& src); |
| void AppendFrom(const FastWildcardGroup& src); |
| |
| // Alias for use of CopyOnWrite |
| void Merge(const FastWildcardGroup& src) { AppendFrom(src); } |
| |
| GoogleString Signature() const; |
| |
| // Return the number of configured wildcards. |
| int num_wildcards() const { return wildcards_.size(); } |
| bool empty() const { return wildcards_.empty(); } |
| |
| private: |
| // Special values for rolling hash size. |
| static const int32 kUncompiled = -1; |
| static const int32 kDontHash = 0; |
| |
| void Uncompile(); |
| void Clear(); |
| inline int& pattern_hash_index(uint64 rolling_hash) const; |
| void Compile() const; |
| void CompileNonTrivial() const; |
| |
| // To avoid having to new another structure we use parallel |
| // vectors. Note that vector<bool> is special-case implemented |
| // in STL to be bit-packed. |
| std::vector<Wildcard*> wildcards_; |
| std::vector<bool> allow_; // parallel array (actually a bitvector) |
| |
| // Information that is computed during compilation. |
| mutable std::vector<uint64> rolling_hashes_; // One per wildcard |
| mutable std::vector<int> effective_indices_; // One per wildcard |
| mutable std::vector<int> wildcard_only_indices_; // Reverse order |
| mutable std::vector<int> pattern_hash_index_; // hash table |
| mutable AtomicInt32 rolling_hash_length_; |
| |
| // This is copyable, since we want to use this with CopyOnWrite<> |
| }; |
| |
| } // namespace net_instaweb |
| |
| #endif // PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_ |