blob: 2c900ebc10e396ac205455754ad61aaa8ae3ab2e [file] [log] [blame]
/**
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)
#include "net/instaweb/util/public/url_escaper.h"
#include "net/instaweb/util/public/string_util.h"
namespace net_instaweb {
namespace {
// Firefox converts ^ to a % sequence.
// Apache rejects requests with % sequences it does not understand.
// So limit the pass-through characters as follows, and use ',' as
// an escaper.
//
// Unfortunately this makes longer filenames because ',' is also used
// in the filename encoder.
//
// TODO(jmarantz): Pass through '.', and exploit '/' as a legal character
// in URLs. This requires redefining the constraints of a 'segment', which
// currently excludes both '.' and '/' due to rules enforced primarily
// in net/instaweb/rewriter/resource_manager.cc, but are distributed a bit
// more widely.
//
// Non-alphanumeric characters that the encoder copies through unescaped and
// that the decoder accepts outside of a ','-escape.
const char kPassThroughChars[] = "_=+-&?";
// Tests whether 'src' begins with 'search'.  On a match, 'replacement' is
// appended to 'out', the matched prefix is stripped from the front of
// 'src', and true is returned.  Otherwise 'src' and 'out' are left
// untouched and false is returned.
bool ReplaceSubstring(const StringPiece& search, const char* replacement,
                      StringPiece* src, std::string* out) {
  const size_t prefix_len = search.size();
  // Too short to possibly contain the prefix.
  if (src->size() < prefix_len) {
    return false;
  }
  if (memcmp(src->data(), search.data(), prefix_len) != 0) {
    return false;
  }
  out->append(replacement);
  *src = src->substr(prefix_len);
  return true;
}
} // namespace
// Out-of-line definition of the destructor; the body is empty.
UrlEscaper::~UrlEscaper() { }
// Appends an escaped form of 'in' to '*url_segment'.
//
// Alphanumerics and the characters in kPassThroughChars are copied through
// unchanged.  Everything else is emitted as an escape introduced by ',':
// a one-character code for a handful of common substrings ("http://",
// "www.", ".com", ".css", ...) and punctuation, or ",XX" -- two uppercase
// hex digits -- for any other byte.  DecodeFromUrlSegment reverses this
// mapping, so the two must be kept in sync.
void UrlEscaper::EncodeToUrlSegment(const StringPiece& in,
                                    std::string* url_segment) {
  for (StringPiece src = in; src.size() != 0; ) {
    // We need to check for common prefixes that begin with pass-through
    // characters before doing the isalnum check.
    if (ReplaceSubstring("http://", ",h", &src, url_segment) ||
        ReplaceSubstring("www.", ",w", &src, url_segment)) {
      continue;
    }

    const char c = src[0];
    // The <ctype.h> classifiers have undefined behavior for negative
    // arguments other than EOF, so convert through unsigned char first.
    // strchr() also matches the NUL that terminates kPassThroughChars, so
    // exclude NUL explicitly; it then falls through to the ",XX" escape
    // instead of being emitted raw.
    const bool pass_through =
        isalnum(static_cast<unsigned char>(c)) ||
        ((c != '\0') && (strchr(kPassThroughChars, c) != NULL));
    if (pass_through) {
      url_segment->append(1, c);
      src = src.substr(1);
    } else if (
        // TODO(jmarantz): put these in a static table and generate
        // an FSM so we don't have so much lookahead scanning, and we
        // don't have to work hard to keep the encoder and decoder
        // in sync.
        //
        // Each successful ReplaceSubstring consumes its match from 'src'
        // and short-circuits the rest of the chain.  The bare "." entry
        // must stay after every longer ".xxx" entry.
        !ReplaceSubstring(".com", ",c", &src, url_segment) &&
        !ReplaceSubstring(".css", ",s", &src, url_segment) &&
        !ReplaceSubstring(".edu", ",e", &src, url_segment) &&
        !ReplaceSubstring(".gif", ",g", &src, url_segment) &&
        !ReplaceSubstring(".html", ",t", &src, url_segment) &&
        !ReplaceSubstring(".jpeg", ",k", &src, url_segment) &&
        !ReplaceSubstring(".jpg", ",j", &src, url_segment) &&
        !ReplaceSubstring(".js", ",l", &src, url_segment) &&
        !ReplaceSubstring(".net", ",n", &src, url_segment) &&
        !ReplaceSubstring(".png", ",p", &src, url_segment) &&
        !ReplaceSubstring(".", ",o", &src, url_segment) &&
        !ReplaceSubstring("^", ",u", &src, url_segment) &&
        !ReplaceSubstring("%", ",P", &src, url_segment) &&
        !ReplaceSubstring("/", ",_", &src, url_segment) &&
        !ReplaceSubstring("\\", ",-", &src, url_segment) &&
        !ReplaceSubstring(",", ",,", &src, url_segment)) {
      // No short code applies: fall back to a two-digit hex escape of the
      // byte value.
      url_segment->append(StringPrintf(",%02X",
                                       static_cast<unsigned char>(c)));
      src = src.substr(1);
    }
  }
}
// Decodes 'url_segment' (the ','-escaped form written by
// EncodeToUrlSegment) and appends the result to '*out'.
//
// Returns false if the input contains a character that is neither
// alphanumeric, in kPassThroughChars, nor part of a well-formed ','-escape;
// this includes a trailing ',' and a truncated or non-hex ",XX" sequence.
// On failure, whatever was decoded before the offending character has
// already been appended to '*out'.
bool UrlEscaper::DecodeFromUrlSegment(const StringPiece& url_segment,
                                      std::string* out) {
  // Number of bytes not yet consumed, counting the one at 'p'.  Kept as
  // size_t so the segment length is not narrowed to int; it is only ever
  // decremented after being checked, so it cannot wrap.
  size_t remaining = url_segment.size();
  for (const char* p = url_segment.data(); remaining != 0; ++p, --remaining) {
    const char c = *p;
    // The <ctype.h> classifiers have undefined behavior for negative
    // arguments other than EOF, so convert through unsigned char first.
    // Note that strchr() also matches the NUL terminating
    // kPassThroughChars, so a raw NUL byte is accepted and copied through
    // here; that is kept so previously encoded segments containing one
    // still decode.
    if (isalnum(static_cast<unsigned char>(c)) ||
        (strchr(kPassThroughChars, c) != NULL)) {
      out->append(1, c);
    } else if ((c != ',') || (remaining < 2)) {
      return false;  // unknown char or trailing ,; this is an invalid encoding.
    } else {
      // Step past the ',' to the escape code itself.
      ++p;
      --remaining;
      switch (*p) {
        case '_': *out += "/"; break;
        case '-': *out += "\\"; break;
        case ',': *out += ","; break;
        case 'c': *out += ".com"; break;
        case 's': *out += ".css"; break;
        case 'e': *out += ".edu"; break;
        case 'g': *out += ".gif"; break;
        case 'h': *out += "http://"; break;
        case 'k': *out += ".jpeg"; break;
        case 'j': *out += ".jpg"; break;
        case 'l': *out += ".js"; break;
        case 'n': *out += ".net"; break;
        case 'o': *out += "."; break;
        case 'p': *out += ".png"; break;
        case 'P': *out += "%"; break;
        case 't': *out += ".html"; break;
        case 'u': *out += "^"; break;
        case 'w': *out += "www."; break;
        default: {
          // Anything else must be a two-digit hex escape, ",XX"; both
          // digits have to be present.
          if (remaining < 2) {
            return false;
          }
          int char_val = 0;
          if (!AccumulateHexValue(p[0], &char_val) ||
              !AccumulateHexValue(p[1], &char_val)) {
            return false;
          }
          out->append(1, static_cast<char>(char_val));
          // Consume the first hex digit here; the loop increment consumes
          // the second.
          ++p;
          --remaining;
          break;
        }
      }
    }
  }
  return true;
}
} // namespace net_instaweb