blob: f59ec3cef40d55d1698060e6cbfda46f4434ae8e [file] [log] [blame]
// Copyright 2010 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
// Author: jmarantz@google.com (Joshua Marantz)
// jmaessen@google.com (Jan-Willem Maessen)
#include "pagespeed/kernel/base/wildcard.h"
#include <cstddef>
#include "base/logging.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"
namespace net_instaweb {
const char Wildcard::kMatchAny = '*';
const char Wildcard::kMatchOne = '?';
Wildcard::Wildcard(const StringPiece& wildcard_spec) {
InitFromSpec(wildcard_spec);
}
Wildcard::Wildcard(const GoogleString& storage, int num_blocks,
int last_block_offset, bool is_simple)
: storage_(storage),
num_blocks_(num_blocks),
last_block_offset_(last_block_offset),
is_simple_(is_simple) {
}
// Pre-scan the wildcard spec into storage_, canonicalizing its representation
// as we go. We view the input wildcard_spec as a series of possibly-empty
// blocks each of which contains a mix of literal characters and kMatchOne (?),
// separated by kMatchAny (*). Each block matches a fixed number of characters
// in a candidate string.
//
// We transform this into an internal representation (in storage_) that contains
// a series of blocks each *terminated* by a *. This means that we end up
// adding a sentinel * at the end of the string, and that our interpretation of
// * changes: it now represents a block terminator, rather than a sequence of
// arbitrary characters. This transformation simplifies termination testing in
// the inner match loop (MatchBlock).
//
// Another way to think about this is that we could use a special 257th
// separator character, and rewrite the input into blocks terminated by the
// separator character. Rather than using this nonexistent character, we decide
// to reuse * instead---at the price of a bit of potential confusion.
//
// We also observe that the sub-pattern *? matches exactly the same set of
// strings as ?*, and that ** matches the same set of strings as *. We use this
// to eliminate empty blocks (except at the start and end of string), and to
// make sure that every block except the first begins with a literal character
// and not a ? (by shifting the ? to the end of the previous block). This
// permits a fast search for start of block during matching using memchr.
//
// We also remember the start of the last block in storage_, as the first and
// last blocks must match at an exact position in a string; the middle blocks
// are treated differently, as their position in a matched string can vary.
// After preprocessing, only the first or last block may be empty (corresponding
// to a leading or trailing * respectively).
void Wildcard::InitFromSpec(const StringPiece& wildcard_spec) {
storage_.reserve(wildcard_spec.size() + 1);
num_blocks_ = 1;
last_block_offset_ = 0;
is_simple_ = true;
bool last_was_any = false;
for (size_t i = 0; i < wildcard_spec.size(); ++i) {
char c = wildcard_spec[i];
if (c == kMatchAny) {
// Note that this in effect deletes redundant *s
// (by simply setting last_was_any more than once).
last_was_any = true;
is_simple_ = false;
} else if (c == kMatchOne) {
// Move ? to end of previous block by dint of adding it to pattern
// without inserting * first if last_was_any is set.
// So a? => a? but a*? => a?*. This means that * is always followed by
// a literal char or end of string after preprocessing.
storage_.push_back(c);
is_simple_ = false;
} else {
if (last_was_any) {
++num_blocks_;
storage_.push_back(kMatchAny);
last_block_offset_ = storage_.size();
last_was_any = false;
}
storage_.push_back(c);
}
}
// Clean up after trailing * (leading to empty last block)
if (last_was_any) {
++num_blocks_;
storage_.push_back(kMatchAny);
last_block_offset_ = storage_.size();
}
// Insert sentinel * at end of storage_ to make inner match loop simpler.
storage_.push_back(kMatchAny);
}
namespace {
// Match pat block (terminated by a *) against str, return offset of first
// mismatch or of the * in pat. Requires that str be long enough to
// contain chars in block (not counting final *).
int MatchBlock(const char* pat, const char* str) {
int pos;
for (pos = 0 ;
pat[pos] != Wildcard::kMatchAny &&
(pat[pos] == str[pos] || pat[pos] == Wildcard::kMatchOne); ++pos) { }
return pos;
}
} // namespace
bool Wildcard::Match(const StringPiece& actual) const {
if (is_simple_) {
return (actual == spec());
}
// We match a block at a time, checking incrementally that there are always
// enough characters remaining in actual to match the remaining blocks in
// storage_. We do this by maintaining "chars_to_skip", which counts the
// remaining number of characters that must be skipped over between blocks.
// We start by matching the first and last blocks, as those must be located at
// the beginning and end of the string respectively. We then match the middle
// blocks left to right, using memchr() to identify candidate positions for
// matching. We only need to match a given block once, but that might require
// multiple match attempts. The leftmost match is sufficient because our only
// wildcards are ? and *, which match arbitrary characters.
// Overall length check. Guarantees that the first and last pattern blocks
// will match without length checking, since they're matched at fixed
// positions in actual and we don't skip any chars.
int chars_in_pat = storage_.size() - num_blocks_;
int chars_to_skip = actual.size() - chars_in_pat;
if (chars_to_skip < 0) {
return false;
}
const char* pat = storage_.data();
const char* str = actual.data();
int blocks_left = num_blocks_;
// Match last block. This block can't be shifted wrt actual.
int last_block_size = storage_.size() - last_block_offset_ - 1;
const char* pat_last_block = pat + last_block_offset_;
const char* str_last_block = str + actual.size() - last_block_size;
int ofs = MatchBlock(pat_last_block, str_last_block);
if (pat_last_block[ofs] != kMatchAny) {
return false;
}
if (--blocks_left == 0) {
// There was only one block (the last), and it matched.
// Succeed if entire string was consumed.
return (chars_to_skip == 0);
}
// Match first block. This block can't be shifted wrt actual.
ofs = MatchBlock(pat, str);
if (pat[ofs] != kMatchAny) {
return false;
}
str += ofs;
pat += ofs + 1; // Skip leading *
--blocks_left;
// Match all remaining blocks, left to right. We try candidate
// positions that match the first char in each block.
while (blocks_left > 0) {
// Here are our invariants (the latter two guaranteed by
// initialization).
DCHECK_EQ(kMatchAny, pat[-1]);
DCHECK_NE(kMatchAny, pat[ 0]);
DCHECK_NE(kMatchOne, pat[ 0]);
// The number of characters left to match in the pattern plus the remaining
// chars_to_skip must be equal to the number of characters remaining in the
// string. This invariant is guaranteed by reducing chars_to_skip when we
// skip chars in str.
DCHECK_EQ(chars_to_skip + (pat_last_block - pat) - blocks_left,
str_last_block - str);
// Advance str to first occurrence of pat[0]; that's next
// candidate match position.
const char* new_str =
static_cast<const char*>(memchr(str, pat[0], str_last_block - str));
if (new_str == NULL) {
// First char in block wasn't found, so we can't match.
return false;
}
// memchr skipped over chars in str. Adjust chars_to_skip to match.
chars_to_skip -= (new_str - str);
if (chars_to_skip < 0) {
// More chars left in remaining blocks than in str.
return false;
}
str = new_str;
// Now check for a match here. We already know pat[0] == str[0].
ofs = 1 + MatchBlock(pat + 1, str + 1);
if (pat[ofs] != kMatchAny) {
// We failed to match leftmost occurrence of *pat in str.
// Move further right in str and try to match current block again.
++str;
--chars_to_skip;
if (chars_to_skip < 0) {
// With new shift, once again more chars left in remaining blocks than
// in str.
return false;
}
} else {
// Matched. Advance to next block of pattern.
str += ofs;
pat += ofs + 1; // Skip the *
--blocks_left;
}
}
return true;
}
Wildcard* Wildcard::Duplicate() const {
return new Wildcard(storage_, num_blocks_, last_block_offset_, is_simple_);
}
} // namespace net_instaweb