src/net/instaweb/util/url_escaper.cc - incubator-pagespeed-mod - Git at Google

 /**
  * Copyright 2010 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: jmarantz@google.com (Joshua Marantz)

 #include "net/instaweb/util/public/url_escaper.h"
 #include "net/instaweb/util/public/string_util.h"

 namespace net_instaweb {

 namespace {

 // Punctuation characters that, in addition to alphanumerics, are copied
 // through unchanged by both the encoder and the decoder in this file.  Any
 // other character is represented by an escape sequence introduced by ','.
 //
 // Firefox converts ^ to a % sequence.
 // Apache rejects requests with % sequences it does not understand.
 // So limit the pass-through characters as follows, and use ',' as
 // an escaper.
 //
 // Unfortunately this makes longer filenames because ',' is also used
 // in the filename encoder.
 //
 // TODO(jmarantz): Pass through '.', and exploit '/' as a legal character
 // in URLs.  This requires redefining the constraints of a 'segment', which
 // currently excludes both '.' and '/' due to rules enforced primarily
 // in net/instaweb/rewriter/resource_manager.cc, but are distributed a bit
 // more widely.
 const char kPassThroughChars[] = "_=+-&?";

 // Prefix matcher used by the encoder.  If 'src' begins with 'search', the
 // text 'replacement' is appended to 'out', 'src' is advanced so that it
 // starts just past the matched prefix, and true is returned.  When there is
 // no match, neither 'src' nor 'out' is touched and the result is false.
 bool ReplaceSubstring(const StringPiece& search, const char* replacement,
                       StringPiece* src, std::string* out) {
   // Too little input left to contain 'search' at all.
   if (src->size() < search.size()) {
     return false;
   }
   if (memcmp(src->data(), search.data(), search.size()) != 0) {
     return false;
   }
   out->append(replacement);
   *src = src->substr(search.size());
   return true;
 }

 }  // namespace

 // Out-of-line destructor definition; the body is empty.
 UrlEscaper::~UrlEscaper() {
 }

 // Encodes 'in' and appends the result to 'url_segment'.
 //
 // Alphanumerics (as classified by isalnum) and the characters listed in
 // kPassThroughChars are copied verbatim.  A handful of common substrings
 // ("http://", "www.", ".com", ".css", ...) and special characters are
 // replaced by a two-character ",x" code, and any remaining byte is written
 // as ",XX", where XX is the byte's value in upper-case hex.
 void UrlEscaper::EncodeToUrlSegment(const StringPiece& in,
                                     std::string* url_segment) {
   for (StringPiece src = in; src.size() != 0; ) {
     // These prefixes begin with an alphanumeric, so they have to be tried
     // before the pass-through test would consume their first character.
     if (ReplaceSubstring("http://", ",h", &src, url_segment) ||
         ReplaceSubstring("www.", ",w", &src, url_segment)) {
       continue;
     }

     const char c = src[0];
     // isalnum() is only defined for values representable as unsigned char
     // (or EOF), so a plain char, which may be negative, is converted first.
     // strchr() also matches the terminating NUL of kPassThroughChars;
     // exclude '\0' explicitly so that a NUL byte is escaped (",00") rather
     // than copied through raw.
     const bool pass_through =
         isalnum(static_cast<unsigned char>(c)) ||
         ((c != '\0') && (strchr(kPassThroughChars, c) != NULL));
     if (pass_through) {
       url_segment->append(1, c);
       src = src.substr(1);
     } else if (
         // TODO(jmarantz): put these in a static table and generate
         // an FSM so we don't have so much lookahead scanning, and we
         // don't have to work hard to keep the encoder and decoder
         // in sync.
         //
         // The bare "." entry must stay after every ".xyz" entry, since it
         // is a prefix of all of them.
         !ReplaceSubstring(".com", ",c", &src, url_segment) &&
         !ReplaceSubstring(".css", ",s", &src, url_segment) &&
         !ReplaceSubstring(".edu", ",e", &src, url_segment) &&
         !ReplaceSubstring(".gif", ",g", &src, url_segment) &&
         !ReplaceSubstring(".html", ",t", &src, url_segment) &&
         !ReplaceSubstring(".jpeg", ",k", &src, url_segment) &&
         !ReplaceSubstring(".jpg", ",j", &src, url_segment) &&
         !ReplaceSubstring(".js", ",l", &src, url_segment) &&
         !ReplaceSubstring(".net", ",n", &src, url_segment) &&
         !ReplaceSubstring(".png", ",p", &src, url_segment) &&
         !ReplaceSubstring(".", ",o", &src, url_segment) &&
         !ReplaceSubstring("^", ",u", &src, url_segment) &&
         !ReplaceSubstring("%", ",P", &src, url_segment) &&
         !ReplaceSubstring("/", ",_", &src, url_segment) &&
         !ReplaceSubstring("\\", ",-", &src, url_segment) &&
         !ReplaceSubstring(",", ",,", &src, url_segment)) {
       // No named escape applies: fall back to a two-digit hex escape.
       url_segment->append(StringPrintf(",%02X",
                                        static_cast<unsigned char>(c)));
       src = src.substr(1);
     }
   }
 }

 // Decodes 'url_segment', the inverse of EncodeToUrlSegment, appending the
 // decoded text to 'out'.
 //
 // Returns true if the whole segment was decoded.  Returns false as soon as
 // a character is found that is neither pass-through nor part of a
 // well-formed ','-escape (including a trailing ','); in that case 'out' may
 // already contain the decoded prefix.
 bool UrlEscaper::DecodeFromUrlSegment(const StringPiece& url_segment,
                                       std::string* out) {
   const char* p = url_segment.data();
   const char* const end = p + url_segment.size();
   while (p != end) {
     const char c = *p++;
     // isalnum() is only defined for values representable as unsigned char
     // (or EOF); convert first so that bytes >= 0x80 in the incoming segment
     // do not trigger undefined behavior when char is signed.
     // NOTE(review): strchr() also matches the terminating NUL of
     // kPassThroughChars, so a raw '\0' byte is accepted and copied through.
     if (isalnum(static_cast<unsigned char>(c)) ||
         (strchr(kPassThroughChars, c) != NULL)) {
       out->append(1, c);
       continue;
     }
     if ((c != ',') || (p == end)) {
       return false;  // unknown char or trailing ','; invalid encoding.
     }

     // 'code' is the character following the ',' escaper.
     const char code = *p++;
     switch (code) {
       case '_': *out += "/"; break;
       case '-': *out += "\\"; break;
       case ',': *out += ","; break;
       case 'c': *out += ".com"; break;
       case 's': *out += ".css"; break;
       case 'e': *out += ".edu"; break;
       case 'g': *out += ".gif"; break;
       case 'h': *out += "http://"; break;
       case 'k': *out += ".jpeg"; break;
       case 'j': *out += ".jpg"; break;
       case 'l': *out += ".js"; break;
       case 'n': *out += ".net"; break;
       case 'o': *out += "."; break;
       case 'p': *out += ".png"; break;
       case 'P': *out += "%"; break;
       case 't': *out += ".html"; break;
       case 'u': *out += "^"; break;
       case 'w': *out += "www."; break;
       default: {
         // Not a named escape: treat 'code' and the character after it as a
         // two-digit hex value, matching the encoder's ",%02X" fallback.
         // AccumulateHexValue is declared elsewhere; it is presumed to
         // return false for a non-hex character.
         if (p == end) {
           return false;  // Second hex digit is missing.
         }
         int char_val = 0;
         if (!AccumulateHexValue(code, &char_val) ||
             !AccumulateHexValue(*p++, &char_val)) {
           return false;
         }
         out->append(1, static_cast<char>(char_val));
         break;
       }
     }
   }
   return true;
 }

 }  // namespace net_instaweb
	/**
	* Copyright 2010 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: jmarantz@google.com (Joshua Marantz)

	#include "net/instaweb/util/public/url_escaper.h"
	#include "net/instaweb/util/public/string_util.h"

	namespace net_instaweb {

	namespace {

	// Punctuation characters that, in addition to alphanumerics, are copied
	// through unchanged by both the encoder and the decoder in this file. Any
	// other character is represented by an escape sequence introduced by ','.
	//
	// Firefox converts ^ to a % sequence.
	// Apache rejects requests with % sequences it does not understand.
	// So limit the pass-through characters as follows, and use ',' as
	// an escaper.
	//
	// Unfortunately this makes longer filenames because ',' is also used
	// in the filename encoder.
	//
	// TODO(jmarantz): Pass through '.', and exploit '/' as a legal character
	// in URLs. This requires redefining the constraints of a 'segment', which
	// currently excludes both '.' and '/' due to rules enforced primarily
	// in net/instaweb/rewriter/resource_manager.cc, but are distributed a bit
	// more widely.
	const char kPassThroughChars[] = "_=+-&?";

	// Prefix matcher used by the encoder.  If 'src' begins with 'search', the
	// text 'replacement' is appended to 'out', 'src' is advanced so that it
	// starts just past the matched prefix, and true is returned.  When there is
	// no match, neither 'src' nor 'out' is touched and the result is false.
	bool ReplaceSubstring(const StringPiece& search, const char* replacement,
	                      StringPiece* src, std::string* out) {
	  // Too little input left to contain 'search' at all.
	  if (src->size() < search.size()) {
	    return false;
	  }
	  if (memcmp(src->data(), search.data(), search.size()) != 0) {
	    return false;
	  }
	  out->append(replacement);
	  *src = src->substr(search.size());
	  return true;
	}

	} // namespace

	// Out-of-line destructor definition; the body is empty.
	UrlEscaper::~UrlEscaper() {
	}

	// Encodes 'in' and appends the result to 'url_segment'.
	//
	// Alphanumerics (as classified by isalnum) and the characters listed in
	// kPassThroughChars are copied verbatim.  A handful of common substrings
	// ("http://", "www.", ".com", ".css", ...) and special characters are
	// replaced by a two-character ",x" code, and any remaining byte is written
	// as ",XX", where XX is the byte's value in upper-case hex.
	void UrlEscaper::EncodeToUrlSegment(const StringPiece& in,
	                                    std::string* url_segment) {
	  for (StringPiece src = in; src.size() != 0; ) {
	    // These prefixes begin with an alphanumeric, so they have to be tried
	    // before the pass-through test would consume their first character.
	    if (ReplaceSubstring("http://", ",h", &src, url_segment) ||
	        ReplaceSubstring("www.", ",w", &src, url_segment)) {
	      continue;
	    }

	    const char c = src[0];
	    // isalnum() is only defined for values representable as unsigned char
	    // (or EOF), so a plain char, which may be negative, is converted first.
	    // strchr() also matches the terminating NUL of kPassThroughChars;
	    // exclude '\0' explicitly so that a NUL byte is escaped (",00") rather
	    // than copied through raw.
	    const bool pass_through =
	        isalnum(static_cast<unsigned char>(c)) ||
	        ((c != '\0') && (strchr(kPassThroughChars, c) != NULL));
	    if (pass_through) {
	      url_segment->append(1, c);
	      src = src.substr(1);
	    } else if (
	        // TODO(jmarantz): put these in a static table and generate
	        // an FSM so we don't have so much lookahead scanning, and we
	        // don't have to work hard to keep the encoder and decoder
	        // in sync.
	        //
	        // The bare "." entry must stay after every ".xyz" entry, since it
	        // is a prefix of all of them.
	        !ReplaceSubstring(".com", ",c", &src, url_segment) &&
	        !ReplaceSubstring(".css", ",s", &src, url_segment) &&
	        !ReplaceSubstring(".edu", ",e", &src, url_segment) &&
	        !ReplaceSubstring(".gif", ",g", &src, url_segment) &&
	        !ReplaceSubstring(".html", ",t", &src, url_segment) &&
	        !ReplaceSubstring(".jpeg", ",k", &src, url_segment) &&
	        !ReplaceSubstring(".jpg", ",j", &src, url_segment) &&
	        !ReplaceSubstring(".js", ",l", &src, url_segment) &&
	        !ReplaceSubstring(".net", ",n", &src, url_segment) &&
	        !ReplaceSubstring(".png", ",p", &src, url_segment) &&
	        !ReplaceSubstring(".", ",o", &src, url_segment) &&
	        !ReplaceSubstring("^", ",u", &src, url_segment) &&
	        !ReplaceSubstring("%", ",P", &src, url_segment) &&
	        !ReplaceSubstring("/", ",_", &src, url_segment) &&
	        !ReplaceSubstring("\\", ",-", &src, url_segment) &&
	        !ReplaceSubstring(",", ",,", &src, url_segment)) {
	      // No named escape applies: fall back to a two-digit hex escape.
	      url_segment->append(StringPrintf(",%02X",
	                                       static_cast<unsigned char>(c)));
	      src = src.substr(1);
	    }
	  }
	}

	// Decodes 'url_segment', the inverse of EncodeToUrlSegment, appending the
	// decoded text to 'out'.
	//
	// Returns true if the whole segment was decoded.  Returns false as soon as
	// a character is found that is neither pass-through nor part of a
	// well-formed ','-escape (including a trailing ','); in that case 'out' may
	// already contain the decoded prefix.
	bool UrlEscaper::DecodeFromUrlSegment(const StringPiece& url_segment,
	                                      std::string* out) {
	  const char* p = url_segment.data();
	  const char* const end = p + url_segment.size();
	  while (p != end) {
	    const char c = *p++;
	    // isalnum() is only defined for values representable as unsigned char
	    // (or EOF); convert first so that bytes >= 0x80 in the incoming segment
	    // do not trigger undefined behavior when char is signed.
	    // NOTE(review): strchr() also matches the terminating NUL of
	    // kPassThroughChars, so a raw '\0' byte is accepted and copied through.
	    if (isalnum(static_cast<unsigned char>(c)) ||
	        (strchr(kPassThroughChars, c) != NULL)) {
	      out->append(1, c);
	      continue;
	    }
	    if ((c != ',') || (p == end)) {
	      return false;  // unknown char or trailing ','; invalid encoding.
	    }

	    // 'code' is the character following the ',' escaper.
	    const char code = *p++;
	    switch (code) {
	      case '_': *out += "/"; break;
	      case '-': *out += "\\"; break;
	      case ',': *out += ","; break;
	      case 'c': *out += ".com"; break;
	      case 's': *out += ".css"; break;
	      case 'e': *out += ".edu"; break;
	      case 'g': *out += ".gif"; break;
	      case 'h': *out += "http://"; break;
	      case 'k': *out += ".jpeg"; break;
	      case 'j': *out += ".jpg"; break;
	      case 'l': *out += ".js"; break;
	      case 'n': *out += ".net"; break;
	      case 'o': *out += "."; break;
	      case 'p': *out += ".png"; break;
	      case 'P': *out += "%"; break;
	      case 't': *out += ".html"; break;
	      case 'u': *out += "^"; break;
	      case 'w': *out += "www."; break;
	      default: {
	        // Not a named escape: treat 'code' and the character after it as a
	        // two-digit hex value, matching the encoder's ",%02X" fallback.
	        // AccumulateHexValue is declared elsewhere; it is presumed to
	        // return false for a non-hex character.
	        if (p == end) {
	          return false;  // Second hex digit is missing.
	        }
	        int char_val = 0;
	        if (!AccumulateHexValue(code, &char_val) ||
	            !AccumulateHexValue(*p++, &char_val)) {
	          return false;
	        }
	        out->append(1, static_cast<char>(char_val));
	        break;
	      }
	    }
	  }
	  return true;
	}

	} // namespace net_instaweb