src/pagespeed/kernel/base/fast_wildcard_group.h - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2012 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmaessen@google.com (Jan-Willem Maessen)

 #ifndef PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_
 #define PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_

 #include <vector>
 #include "pagespeed/kernel/base/atomic_int32.h"
 #include "pagespeed/kernel/base/basictypes.h"
 #include "pagespeed/kernel/base/string.h"
 #include "pagespeed/kernel/base/string_util.h"

 namespace net_instaweb {

 class Wildcard;

 // This forms the basis of a wildcard selection mechanism, allowing
 // a user to issue a sequence of commands like:
 //
 //   1. allow *.cc
 //   2. allow *.h
 //   3. disallow a*.h
 //   4. allow ab*.h
 //   5. disallow c*.cc
 //
 // This sequence would yield the following results:
 //   Match("x.cc") --> true  due to rule #1
 //   Match("c.cc") --> false due to rule #5 which overrides rule #1
 //   Match("y.h")  --> true  due to rule #2
 //   Match("a.h")  --> false due to rule #3 which overrides rule #2
 //   Match("ab.h") --> true  due to rule #4 which overrides rule #3
 // So order matters.
 //
 // Note that concurrent calls to Match(...) are permitted, but modifications
 // must not occur concurrently (as you would expect).

 /* A note on the algorithm used here:

 Wildcard matching uses an O(nm) string search algorithm, where m is pattern
 length and n is string length (basically we search forward for first char in the
 next pattern chunk, then attempt a match at that position).  This is not the
 asymptotically efficient O(n+m) as it ignores the effects of prefixes and
 repeated substrings, but the wildcards that occur in PageSpeed tend to contain
 chunks of diverse literals and so it's good enough in practice.

 WildcardGroup simply iterates through wildcards in the group, attempting to
 match against each one in turn.

 In FastWildcardGroup we attempt a Rabin-Karp string match for a fixed-size
 substring of each of the wildcards.  We choose the largest possible substring
 size for a given group (for a single wildcard pattern, this will be the length
 of the longest literal in the pattern; for the group, it is the minimum such
 length).  Note that in the worst case this is a single character (we treat
 all-wildcard patterns specially).  We track the insertion index of the
 latest-inserted matched pattern (so the first pattern in the set has index 0,
 and initially our insertion index is -1).  As in Rabin-Karp we traverse the
 string using a rolling hash; when we encounter a hash match, we retrieve the
 corresponding insertion index.  If it's larger than our current insertion index
 (the pattern would override), we retrieve the pattern and attempt to match the
 whole string against it.  If the match succeeds we update the insertion index.
 Our return value is the corresponding "allow" status.

 We actually optimize this a little in two ways: rather than remembering the
 insertion index, we actually remember the insertion index just before the next
 change in "allow" status (the effective index).  So for example, if we insert 10
 "allow" patterns in a row and then a single "deny" pattern, matching against the
 first "allow" pattern means that we will subsequently check only against the
 "deny" pattern.  The second optimization builds on this: if the effective index
 is the last pattern in the group (always true if the group is nothing but
 "allow" or "deny" entries) then we can immediately return.

 We use a simple vector of indexes to store the hash table, dealing with
 collisions by linear probing.  Metadata (eg a cached hash) is stored with the
 patterns.  We make the table size >= 2x the number of patterns so that chains
 don't get long, and all failed probes terminate in an empty bucket.

 */

 class FastWildcardGroup {
  public:
   // Don't generate a hash unless there are this many non-wildcard-only
   // patterns.  Exposed for testing purposes (we can't use FRIEND_TEST here for
   // open-source dependency reasons).
   static const int kMinPatterns = 11;

   FastWildcardGroup()
       : rolling_hash_length_(kUncompiled) { }
   FastWildcardGroup(const FastWildcardGroup& src)
       : rolling_hash_length_(kUncompiled) {
     CopyFrom(src);
   }

   ~FastWildcardGroup();
   FastWildcardGroup& operator=(const FastWildcardGroup& other) {
     CopyFrom(other);
     return *this;
   }

   // Determines whether a string is allowed by the wildcard group.  If none of
   // the wildcards in the group matches, allow_by_default is returned.
   bool Match(const StringPiece& str, bool allow_by_default) const;

   // Add an expression to Allow, potentially overriding previous calls to
   // Disallow.
   void Allow(const StringPiece& wildcard);

   // Add an expression to Disallow, potentially overriding previous calls to
   // Allow.
   void Disallow(const StringPiece& wildcard);

   void CopyFrom(const FastWildcardGroup& src);
   void AppendFrom(const FastWildcardGroup& src);

   // Alias for use of CopyOnWrite
   void Merge(const FastWildcardGroup& src) { AppendFrom(src); }

   GoogleString Signature() const;

   // Return the number of configured wildcards.
   int num_wildcards() const { return wildcards_.size(); }
   bool empty() const { return wildcards_.empty(); }

  private:
   // Special values for rolling hash size.
   static const int32 kUncompiled = -1;
   static const int32 kDontHash = 0;

   void Uncompile();
   void Clear();
   inline int& pattern_hash_index(uint64 rolling_hash) const;
   void Compile() const;
   void CompileNonTrivial() const;

   // To avoid having to new another structure we use parallel
   // vectors.  Note that vector<bool> is special-case implemented
   // in STL to be bit-packed.
   std::vector<Wildcard*> wildcards_;
   std::vector<bool> allow_;  // parallel array (actually a bitvector)

   // Information that is computed during compilation.
   mutable std::vector<uint64> rolling_hashes_;  // One per wildcard
   mutable std::vector<int> effective_indices_;  // One per wildcard
   mutable std::vector<int> wildcard_only_indices_;  // Reverse order
   mutable std::vector<int> pattern_hash_index_;  // hash table
   mutable AtomicInt32 rolling_hash_length_;

   // This is copyable, since we want to use this with CopyOnWrite<>
 };

 }  // namespace net_instaweb

 #endif  // PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_
	/*
	* Copyright 2012 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmaessen@google.com (Jan-Willem Maessen)

	#ifndef PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_
	#define PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_

	#include <vector>
	#include "pagespeed/kernel/base/atomic_int32.h"
	#include "pagespeed/kernel/base/basictypes.h"
	#include "pagespeed/kernel/base/string.h"
	#include "pagespeed/kernel/base/string_util.h"

	namespace net_instaweb {

	class Wildcard;

	// This forms the basis of a wildcard selection mechanism, allowing
	// a user to issue a sequence of commands like:
	//
	// 1. allow *.cc
	// 2. allow *.h
	// 3. disallow a*.h
	// 4. allow ab*.h
	// 5. disallow c*.cc
	//
	// This sequence would yield the following results:
	// Match("x.cc") --> true due to rule #1
	// Match("c.cc") --> false due to rule #5 which overrides rule #1
	// Match("y.h") --> true due to rule #2
	// Match("a.h") --> false due to rule #3 which overrides rule #2
	// Match("ab.h") --> true due to rule #4 which overrides rule #3
	// So order matters.
	//
	// Note that concurrent calls to Match(...) are permitted, but modifications
	// must not occur concurrently (as you would expect).

	/* A note on the algorithm used here:

	Wildcard matching uses an O(nm) string search algorithm, where m is pattern
	length and n is string length (basically we search forward for first char in the
	next pattern chunk, then attempt a match at that position). This is not the
	asymptotically efficient O(n+m) as it ignores the effects of prefixes and
	repeated substrings, but the wildcards that occur in PageSpeed tend to contain
	chunks of diverse literals and so it's good enough in practice.

	WildcardGroup simply iterates through wildcards in the group, attempting to
	match against each one in turn.

	In FastWildcardGroup we attempt a Rabin-Karp string match for a fixed-size
	substring of each of the wildcards. We choose the largest possible substring
	size for a given group (for a single wildcard pattern, this will be the length
	of the longest literal in the pattern; for the group, it is the minimum such
	length). Note that in the worst case this is a single character (we treat
	all-wildcard patterns specially). We track the insertion index of the
	latest-inserted matched pattern (so the first pattern in the set has index 0,
	and initially our insertion index is -1). As in Rabin-Karp we traverse the
	string using a rolling hash; when we encounter a hash match, we retrieve the
	corresponding insertion index. If it's larger than our current insertion index
	(the pattern would override), we retrieve the pattern and attempt to match the
	whole string against it. If the match succeeds we update the insertion index.
	Our return value is the corresponding "allow" status.

	We actually optimize this a little in two ways: rather than remembering the
	insertion index, we actually remember the insertion index just before the next
	change in "allow" status (the effective index). So for example, if we insert 10
	"allow" patterns in a row and then a single "deny" pattern, matching against the
	first "allow" pattern means that we will subsequently check only against the
	"deny" pattern. The second optimization builds on this: if the effective index
	is the last pattern in the group (always true if the group is nothing but
	"allow" or "deny" entries) then we can immediately return.

	We use a simple vector of indexes to store the hash table, dealing with
	collisions by linear probing. Metadata (eg a cached hash) is stored with the
	patterns. We make the table size >= 2x the number of patterns so that chains
	don't get long, and all failed probes terminate in an empty bucket.

	*/

	class FastWildcardGroup {
	public:
	// Don't generate a hash unless there are this many non-wildcard-only
	// patterns. Exposed for testing purposes (we can't use FRIEND_TEST here for
	// open-source dependency reasons).
	static const int kMinPatterns = 11;

	FastWildcardGroup()
	: rolling_hash_length_(kUncompiled) { }
	FastWildcardGroup(const FastWildcardGroup& src)
	: rolling_hash_length_(kUncompiled) {
	CopyFrom(src);
	}

	~FastWildcardGroup();
	FastWildcardGroup& operator=(const FastWildcardGroup& other) {
	CopyFrom(other);
	return *this;
	}

	// Determines whether a string is allowed by the wildcard group. If none of
	// the wildcards in the group matches, allow_by_default is returned.
	bool Match(const StringPiece& str, bool allow_by_default) const;

	// Add an expression to Allow, potentially overriding previous calls to
	// Disallow.
	void Allow(const StringPiece& wildcard);

	// Add an expression to Disallow, potentially overriding previous calls to
	// Allow.
	void Disallow(const StringPiece& wildcard);

	void CopyFrom(const FastWildcardGroup& src);
	void AppendFrom(const FastWildcardGroup& src);

	// Alias for use of CopyOnWrite
	void Merge(const FastWildcardGroup& src) { AppendFrom(src); }

	GoogleString Signature() const;

	// Return the number of configured wildcards.
	int num_wildcards() const { return wildcards_.size(); }
	bool empty() const { return wildcards_.empty(); }

	private:
	// Special values for rolling hash size.
	static const int32 kUncompiled = -1;
	static const int32 kDontHash = 0;

	void Uncompile();
	void Clear();
	inline int& pattern_hash_index(uint64 rolling_hash) const;
	void Compile() const;
	void CompileNonTrivial() const;

	// To avoid having to new another structure we use parallel
	// vectors. Note that vector<bool> is special-case implemented
	// in STL to be bit-packed.
	std::vector<Wildcard*> wildcards_;
	std::vector<bool> allow_; // parallel array (actually a bitvector)

	// Information that is computed during compilation.
	mutable std::vector<uint64> rolling_hashes_; // One per wildcard
	mutable std::vector<int> effective_indices_; // One per wildcard
	mutable std::vector<int> wildcard_only_indices_; // Reverse order
	mutable std::vector<int> pattern_hash_index_; // hash table
	mutable AtomicInt32 rolling_hash_length_;

	// This is copyable, since we want to use this with CopyOnWrite<>
	};

	} // namespace net_instaweb

	#endif // PAGESPEED_KERNEL_BASE_FAST_WILDCARD_GROUP_H_