/*
* Copyright 2010 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: jmarantz@google.com (Joshua Marantz)

#include "pagespeed/kernel/util/url_escaper.h"

#include <cctype>
#include <cstddef>
#include <cstring>

#include "pagespeed/kernel/base/basictypes.h"
#include "pagespeed/kernel/base/string.h"
#include "pagespeed/kernel/base/string_util.h"

namespace net_instaweb {

namespace {

// Firefox converts ^ to a % sequence.
// Apache rejects requests with % sequences it does not understand.
// So limit the pass-through characters as follows, and use ',' as
// an escaper.
//
// Unfortunately this makes longer filenames because ',' is also used
// in the filename encoder.
const char kPassThroughChars[] = "._=+-";
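
// For example, "http://a.com/b^c.gif?x=1" encodes to ",ha.com,_b,uc.gif,qx=1":
// "http://" collapses to ",h", '/' becomes ",_", '^' becomes ",u", '?' becomes
// ",q", and the alphanumerics, '.', and '=' pass through unchanged.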
// Checks for 'search' at the start of 'src'.  If found, appends
// 'replacement' to 'out' and advances the start point in 'src'
// past the search string, returning true.
bool ReplaceSubstring(const StringPiece& search, const char* replacement,
                      StringPiece* src, GoogleString* out) {
  bool ret = false;
  if (src->starts_with(search)) {
    out->append(replacement);
    src->remove_prefix(search.size());
    ret = true;
  }
  return ret;
}
} // namespace
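
// Encodes 'in' so it can be embedded in a single URL path segment:
// alphanumerics and the kPassThroughChars pass through unchanged, a handful
// of common characters get short ",x" escapes (with "http://" and
// ".pagespeed." collapsed to ",h" and ",M"), and any other byte is emitted
// as ',' followed by two hex digits.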
void UrlEscaper::EncodeToUrlSegment(const StringPiece& in,
                                    GoogleString* url_segment) {
  for (StringPiece src = in; src.size() != 0; ) {
    char c = src[0];
    src.remove_prefix(1);
    // TODO(jmarantz): put these in a static table, to make it
    // faster and so we don't have to work hard to keep the encoder
    // and decoder in sync.
    switch (c) {
      case '^':
        url_segment->append(",u");
        break;
      case '%':
        url_segment->append(",P");
        break;
      case '/':
        url_segment->append(",_");
        break;
      case '\\':
        url_segment->append(",-");
        break;
      case ',':
        url_segment->append(",,");
        break;
      case '?':
        url_segment->append(",q");
        break;
      case '&':
        url_segment->append(",a");
        break;
      case 'h':
        if (!ReplaceSubstring("ttp://", ",h", &src, url_segment)) {
          // Just pass-through 'h'.
          url_segment->push_back('h');
        }
        break;
      case '.':
        // '.' is a passthrough char, but ".pagespeed." is special.
        if (!ReplaceSubstring("pagespeed.", ",M", &src, url_segment)) {
          url_segment->push_back('.');
        }
        break;
      default:
        // Cast to unsigned char so bytes >= 0x80 don't trigger undefined
        // behavior in isalnum().
        if (isalnum(static_cast<unsigned char>(c)) ||
            (strchr(kPassThroughChars, c) != NULL)) {
          url_segment->push_back(c);
        } else {
          StringAppendF(url_segment, ",%02X", static_cast<unsigned char>(c));
        }
    }
  }
}
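
// A minimal round-trip sketch (a hypothetical caller; this assumes
// url_escaper.h declares EncodeToUrlSegment and DecodeFromUrlSegment as
// static methods, as the call syntax below presumes):
//
//   GoogleString encoded, decoded;
//   UrlEscaper::EncodeToUrlSegment("a/b,c", &encoded);
//   // encoded == "a,_b,,c"
//   UrlEscaper::DecodeFromUrlSegment(encoded, &decoded);
//   // decoded == "a/b,c"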

namespace {

// DecodeHexEncoding expects buffer[i] and buffer[i+1] to be hexadecimal
// digits.  It constructs a char from those two digits, storing it in *result
// and returning true, or returns false to indicate an encoding failure.
bool DecodeHexEncoding(const StringPiece& buffer, size_t i, char* result) {
  uint32 char_val = 0;
  if ((i + 1 < buffer.size()) &&
      AccumulateHexValue(buffer[i], &char_val) &&
      AccumulateHexValue(buffer[i + 1], &char_val)) {
    *result = static_cast<char>(char_val);
    return true;
  }
  return false;
}

}  // namespace
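
// Decodes a segment produced by EncodeToUrlSegment above, returning false on
// a malformed encoding.  Both ",XX" (emitted by the encoder for arbitrary
// bytes) and "%XX" (which browsers may substitute) carry a hex-encoded byte;
// for example ",0A" and "%0A" both decode to a newline (0x0A).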
bool UrlEscaper::DecodeFromUrlSegment(const StringPiece& url_segment,
                                      GoogleString* out) {
  size_t size = url_segment.size();
  for (size_t i = 0; i < size; ++i) {
    char c = url_segment[i];
    // Cast to unsigned char so bytes >= 0x80 don't trigger undefined
    // behavior in isalnum().
    if (isalnum(static_cast<unsigned char>(c)) ||
        (strchr(kPassThroughChars, c) != NULL)) {
      out->push_back(c);
      continue;
    }

    // We ought to have a ',' or a '%' to decode (or a bad encoding).
    ++i;  // i points to first char of encoding.
    if (i >= size) {
      // No space for encoded data.
      return false;
    }
    if (c != ',') {
      if ((c == '%') && DecodeHexEncoding(url_segment, i, &c)) {
        ++i;  // i points to last char of encoding.
        // Rare corner case: there exist browsers that percent-encode '+' as
        // %20 (space), even though that substitution is only supposed to be
        // legal after '?' (in query params), so map a decoded space back to
        // '+'.
        if (c == ' ') {
          c = '+';
        }
        if (c != ',') {
          out->push_back(c);
          continue;
        }
        // We found a %-encoded ','.
        ++i;  // Make i point to first char of the ',' encoding.
        if (i >= size) {
          // Trailing %-encoded ','.
          return false;
        }
        // Fall through and decode the ','.
      } else {
        return false;  // Unknown char; this is an invalid encoding.
      }
    }
    // At this point we know we're decoding a ',' encoding.
    // TODO(jmaessen): Worry about %-encoding here, if that ever comes up.
    // To our knowledge it never has.
    switch (url_segment[i]) {
      case '_': *out += "/"; break;
      case '-': *out += "\\"; break;
      case ',': *out += ","; break;
      case 'a': *out += "&"; break;
      case 'M': *out += ".pagespeed."; break;
      case 'P': *out += "%"; break;
      case 'q': *out += "?"; break;
      case 'u': *out += "^"; break;

      // The following legacy encodings are no longer made.  However we should
      // continue to decode what we previously encoded in November 2010 to
      // avoid (for example) breaking image search.
      case 'c': *out += ".com"; break;
      case 'e': *out += ".edu"; break;
      case 'g': *out += ".gif"; break;
      case 'h': *out += "http://"; break;
      case 'j': *out += ".jpg"; break;
      case 'k': *out += ".jpeg"; break;
      case 'l': *out += ".js"; break;
      case 'n': *out += ".net"; break;
      case 'o': *out += "."; break;
      case 'p': *out += ".png"; break;
      case 's': *out += ".css"; break;
      case 't': *out += ".html"; break;
      case 'w': *out += "www."; break;

      default:
        if (DecodeHexEncoding(url_segment, i, &c)) {
          ++i;
          out->push_back(c);
        } else {
          return false;
        }
        break;
    }
    // At this point i points to last char of just-decoded encoding.
  }
  return true;
}
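
// Example of the legacy decodings above: ",wexample,c,_img,p" decodes to
// "www.example.com/img.png", even though the encoder no longer emits the
// ',w', ',c', and ',p' forms.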
} // namespace net_instaweb