src/tscore/Encoding.cc - trafficserver - Git at Google

 /** @file

   @section license License

   Licensed to the Apache Software Foundation (ASF) under one
   or more contributor license agreements.  See the NOTICE file
   distributed with this work for additional information
   regarding copyright ownership.  The ASF licenses this file
   to you under the Apache License, Version 2.0 (the
   "License"); you may not use this file except in compliance
   with the License.  You may obtain a copy of the License at

       http://www.apache.org/licenses/LICENSE-2.0

   Unless required by applicable law or agreed to in writing, software
   distributed under the License is distributed on an "AS IS" BASIS,
   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
   See the License for the specific language governing permissions and
   limitations under the License.
  */

 #include "swoc/bwf_ip.h"
 #include "tscore/Encoding.h"

 /*-------------------------------------------------------------------------
   Encoding::escapify_url_common

   This routine will escapify a URL to remove spaces (and perhaps other ugly
   characters) from a URL and replace them with a hex escape sequence.
   Since the escapes are larger (multi-byte) than the characters being
   replaced, the string returned will be longer than the string passed.

   This is a worker function called by escapify_url and pure_escapify_url.  These
   functions differ on whether the function tries to detect and avoid
   double URL encoding (escapify_url) or not (pure_escapify_url)
   -------------------------------------------------------------------------*/

 namespace
 {
 /** Worker shared by Encoding::escapify_url and Encoding::pure_escapify_url.

     Replaces every byte of @a url that is flagged in @a map with a three
     character "%XX" (upper case hex) sequence.

     @param arena       Used to allocate the result (via str_alloc) when @a dst is null
                        and at least one byte has to be escaped.
     @param url         Input bytes. Only the first @a len_in bytes are examined; the
                        input does not have to be NUL terminated.
     @param len_in      Number of bytes of @a url to process.
     @param len_out     [out] Length of the result, not counting the terminating NUL.
                        Set to 0 on failure. Must not be null.
     @param dst         Optional caller supplied output buffer, may be null.
     @param dst_size    Capacity of @a dst in bytes.
     @param map         256 bit bitmap, most significant bit first inside each byte: the
                        code point c is escaped when bit (7 - c % 8) of map[c / 8] is set.
                        When null the built-in table below is used.
     @param pure_escape When false, a '%' followed by two bytes that do not themselves
                        need escaping is treated as an already encoded triplet and is
                        copied unchanged. When true every flagged byte is escaped.

     @return nullptr on failure (null @a url, @a dst too small, or neither @a dst nor
             @a arena available for the output). @a url itself when no byte needs
             escaping (a copy is still placed in @a dst if one was supplied).
             Otherwise the NUL terminated escaped string, which is @a dst when supplied
             and an arena allocation when not.
  */
 char *
 escapify_url_common(Arena *arena, char *url, size_t len_in, int *len_out, char *dst, size_t dst_size, const unsigned char *map,
                     bool pure_escape)
 {
   // codes_to_escape is a bitmap encoding the codes that should be escaped.
   // These are all the codes defined in section 2.4.3 of RFC 2396
   // (control, space, delims, and unwise) plus the tilde. In RFC 2396
   // the tilde is an "unreserved" character, but we escape it because
   // historically this is what the traffic_server has done.
   // Note that we leave codes beyond 127 unmodified.
   //
   // NOTE: any updates to this table should result in an update to:
   // tools/escape_mapper/escape_mapper.cc.
   static const unsigned char codes_to_escape[32] = {
     0xFF, 0xFF, 0xFF,
     0xFF,             // control
     0xB4,             // space " # %
     0x00, 0x00,       //
     0x0A,             // < >
     0x00, 0x00, 0x00, //
     0x1E, 0x80,       // [ \ ] ^ `
     0x00, 0x00,       //
     0x1F,             // { | } ~ DEL
     0x00, 0x00, 0x00,
     0x00, // all non-ascii characters unmodified
     0x00, 0x00, 0x00,
     0x00, //               .
     0x00, 0x00, 0x00,
     0x00, //               .
     0x00, 0x00, 0x00,
     0x00 //               .
   };

   // Lookup table only, never written to.
   static const char hex_digit[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

   if (!url || (dst && dst_size < len_in)) {
     *len_out = 0;
     return nullptr;
   }

   if (!map) {
     map = codes_to_escape;
   }

   // Single place for the bitmap test (bits are stored most significant first).
   auto must_escape = [map](unsigned char c) -> bool { return (map[c / 8] & (1 << (7 - c % 8))) != 0; };

   // First pass: count the bytes flagged in the map. This is an upper bound on
   // the number of escapes actually written, because in the non-pure mode a
   // '%' that starts an already encoded triplet is counted here but copied
   // verbatim below.
   //
   char *const in_url_end = url + len_in;
   size_t count           = 0;

   for (const char *p = url; p < in_url_end; ++p) {
     if (must_escape(static_cast<unsigned char>(*p))) {
       ++count;
     }
   }

   if (count == 0) {
     // The common case, no escapes, so just return the source string.
     //
     *len_out = static_cast<int>(len_in);
     if (dst && dst_size > 0) {
       // The input is length delimited, not necessarily NUL terminated, so copy
       // at most len_in bytes (and never more than fits together with the NUL)
       // instead of scanning for a terminator.
       const size_t n = (len_in < dst_size) ? len_in : dst_size - 1;
       for (size_t i = 0; i < n; ++i) {
         dst[i] = url[i];
       }
       dst[n] = '\0';
     }
     return url;
   }

   // Every escaped byte grows from one character to three, i.e. adds two:
   // out_len = len_in + 3*count - count
   //
   size_t out_len = len_in + 2 * count;

   // Conservative check: uses the upper bound computed above, plus one byte
   // for the terminating NUL.
   if (dst && (out_len + 1) > dst_size) {
     *len_out = 0;
     return nullptr;
   }

   // Without a caller buffer the output has to come from the arena.
   if (!dst && !arena) {
     *len_out = 0;
     return nullptr;
   }

   // To play it safe, we null terminate the string we return in case
   // a module that expects null-terminated strings calls escapify_url,
   // so we allocate an extra byte for the EOS
   //
   char *new_url = dst ? dst : arena->str_alloc(out_len + 1);

   char *from = url;
   char *to   = new_url;

   while (from < in_url_end) {
     unsigned char c = *from;
     if (must_escape(c)) {
       /*
        * If two characters following a '%' don't need to be encoded, then it must
        * mean that the three character sequence is already encoded.  Just copy it over.
        * Only the '%' is copied here; the two following bytes are copied by the
        * next iterations because they are not flagged in the map.
        */
       if (!pure_escape && (*from == '%') && ((from + 2) < in_url_end)) {
         unsigned char c1   = *(from + 1);
         unsigned char c2   = *(from + 2);
         bool needsEncoding = must_escape(c1) || must_escape(c2);
         if (!needsEncoding) {
           out_len -= 2; // this '%' was counted as an escape but is not expanded
           Debug("log-utils", "character already encoded..skipping %c, %c, %c", *from, *(from + 1), *(from + 2));
           *to++ = *from++;
           continue;
         }
       }

       *to++ = '%';
       *to++ = hex_digit[c / 16];
       *to++ = hex_digit[c % 16];
     } else {
       *to++ = *from;
     }
     from++;
   }
   *to = '\0'; // null terminate string

   *len_out = static_cast<int>(out_len);
   return new_url;
 }
 } // namespace

 namespace Encoding
 {
 /** Escape @a url, trying to avoid double encoding.

     Forwards every argument to escapify_url_common with the pure-escape flag
     cleared, so a '%' followed by two bytes that need no escaping is copied
     unchanged.
  */
 char *
 escapify_url(Arena *arena, char *url, size_t len_in, int *len_out, char *dst, size_t dst_size, const unsigned char *map)
 {
   constexpr bool pure_escape = false;
   return escapify_url_common(arena, url, len_in, len_out, dst, dst_size, map, pure_escape);
 }

 /** Escape @a url unconditionally.

     Forwards every argument to escapify_url_common with the pure-escape flag
     set, so every byte flagged in the map is escaped, including any '%'.
  */
 char *
 pure_escapify_url(Arena *arena, char *url, size_t len_in, int *len_out, char *dst, size_t dst_size, const unsigned char *map)
 {
   constexpr bool pure_escape = true;
   return escapify_url_common(arena, url, len_in, len_out, dst, dst_size, map, pure_escape);
 }
 } // namespace Encoding
	/** @file

	@section license License

	Licensed to the Apache Software Foundation (ASF) under one
	or more contributor license agreements. See the NOTICE file
	distributed with this work for additional information
	regarding copyright ownership. The ASF licenses this file
	to you under the Apache License, Version 2.0 (the
	"License"); you may not use this file except in compliance
	with the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	*/

	#include "swoc/bwf_ip.h"
	#include "tscore/Encoding.h"

	/*-------------------------------------------------------------------------
	Encoding::escapify_url_common

	This routine will escapify a URL to remove spaces (and perhaps other ugly
	characters) from a URL and replace them with a hex escape sequence.
	Since the escapes are larger (multi-byte) than the characters being
	replaced, the string returned will be longer than the string passed.

	This is a worker function called by escapify_url and pure_escapify_url. These
	functions differ on whether the function tries to detect and avoid
	double URL encoding (escapify_url) or not (pure_escapify_url)
	-------------------------------------------------------------------------*/

	namespace
	{
	char *
	escapify_url_common(Arena arena, char url, size_t len_in, int len_out, char dst, size_t dst_size, const unsigned char *map,
	bool pure_escape)
	{
	// codes_to_escape is a bitmap encoding the codes that should be escaped.
	// These are all the codes defined in section 2.4.3 of RFC 2396
	// (control, space, delims, and unwise) plus the tilde. In RFC 2396
	// the tilde is an "unreserved" character, but we escape it because
	// historically this is what the traffic_server has done.
	// Note that we leave codes beyond 127 unmodified.
	//
	// NOTE: any updates to this table should result in an update to:
	// tools/escape_mapper/escape_mapper.cc.
	static const unsigned char codes_to_escape[32] = {
	0xFF, 0xFF, 0xFF,
	0xFF, // control
	0xB4, // space " # %
	0x00, 0x00, //
	0x0A, // < >
	0x00, 0x00, 0x00, //
	0x1E, 0x80, // [ \ ] ^ `
	0x00, 0x00, //
	0x1F, // { \| } ~ DEL
	0x00, 0x00, 0x00,
	0x00, // all non-ascii characters unmodified
	0x00, 0x00, 0x00,
	0x00, // .
	0x00, 0x00, 0x00,
	0x00, // .
	0x00, 0x00, 0x00,
	0x00 // .
	};

	static char hex_digit[16] = {'0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'B', 'C', 'D', 'E', 'F'};

	if (!url \|\| (dst && dst_size < len_in)) {
	*len_out = 0;
	return nullptr;
	}

	if (!map) {
	map = codes_to_escape;
	}

	// Count specials in the url, assuming that there won't be any.
	//
	int count = 0;
	char *p = url;
	char *in_url_end = url + len_in;

	while (p < in_url_end) {
	unsigned char c = *p;
	if (map[c / 8] & (1 << (7 - c % 8))) {
	++count;
	}
	++p;
	}

	if (!count) {
	// The common case, no escapes, so just return the source string.
	//
	*len_out = len_in;
	if (dst) {
	ink_strlcpy(dst, url, dst_size);
	}
	return url;
	}

	// For each special char found, we'll need an escape string, which is
	// three characters long. Count this and allocate the string required.
	//
	// make sure we take into account the characters we are substituting
	// for when we calculate out_len !!! in other words,
	// out_len = len_in + 3*count - count
	//
	size_t out_len = len_in + 2 * count;

	if (dst && (out_len + 1) > dst_size) {
	*len_out = 0;
	return nullptr;
	}

	// To play it safe, we null terminate the string we return in case
	// a module that expects null-terminated strings calls escapify_url,
	// so we allocate an extra byte for the EOS
	//
	char *new_url;

	if (dst) {
	new_url = dst;
	} else {
	new_url = arena->str_alloc(out_len + 1);
	}

	char *from = url;
	char *to = new_url;

	while (from < in_url_end) {
	unsigned char c = *from;
	if (map[c / 8] & (1 << (7 - c % 8))) {
	/*
	* If two characters following a '%' don't need to be encoded, then it must
	* mean that the three character sequence is already encoded. Just copy it over.
	*/
	if (!pure_escape && (*from == '%') && ((from + 2) < in_url_end)) {
	unsigned char c1 = *(from + 1);
	unsigned char c2 = *(from + 2);
	bool needsEncoding = ((map[c1 / 8] & (1 << (7 - c1 % 8))) \|\| (map[c2 / 8] & (1 << (7 - c2 % 8))));
	if (!needsEncoding) {
	out_len -= 2;
	Debug("log-utils", "character already encoded..skipping %c, %c, %c", from, (from + 1), *(from + 2));
	to++ = from++;
	continue;
	}
	}

	*to++ = '%';
	*to++ = hex_digit[c / 16];
	*to++ = hex_digit[c % 16];
	} else {
	to++ = from;
	}
	from++;
	}
	*to = '\0'; // null terminate string

	*len_out = out_len;
	return new_url;
	}
	} // namespace

	namespace Encoding
	{
	char *
	escapify_url(Arena arena, char url, size_t len_in, int len_out, char dst, size_t dst_size, const unsigned char *map)
	{
	return escapify_url_common(arena, url, len_in, len_out, dst, dst_size, map, false);
	}

	char *
	pure_escapify_url(Arena arena, char url, size_t len_in, int len_out, char dst, size_t dst_size, const unsigned char *map)
	{
	return escapify_url_common(arena, url, len_in, len_out, dst, dst_size, map, true);
	}
	}; // namespace Encoding