src/pagespeed/kernel/base/charset_util.h - incubator-pagespeed-debian - Git at Google

 /*
  * Copyright 2012 Google Inc.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
  * You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 // Author: matterbury@google.com (Matt Atterbury)

 // A set of utility functions for handling character sets/encodings and
 // related concepts like byte-order-marks (BOM). Currently the only methods
 // relate to BOMs.

 #ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
 #define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_

 #include "pagespeed/kernel/base/string_util.h"

 namespace net_instaweb {

 // The charsets we understand. Currently only those that have BOMs below.
 const char kUtf8Charset[]                  = "utf-8";
 const char kUtf16BigEndianCharset[]        = "utf-16be";
 const char kUtf16LittleEndianCharset[]     = "utf-16le";
 const char kUtf32BigEndianCharset[]        = "utf-32be";
 const char kUtf32LittleEndianCharset[]     = "utf-32le";

 // The Byte-Order-Mark (BOM) for the various UTF encodings.
 const char kUtf8Bom[]                  = "\xEF\xBB\xBF";
 const char kUtf16BigEndianBom[]        = "\xFE\xFF";
 const char kUtf16LittleEndianBom[]     = "\xFF\xFE";
 const char kUtf32BigEndianBom[]        = "\x00\x00\xFE\xFF";
 const char kUtf32LittleEndianBom[]     = "\xFF\xFE\x00\x00";

 // Strips any initial UTF-8 BOM (Byte Order Mark) from the given contents.
 // Returns true if a BOM was stripped, false if not.
 //
 // In addition to specifying the encoding in the ContentType header, one
 // can also specify it at the beginning of the file using a Byte Order Mark.
 //
 // Bytes        Encoding Form
 // 00 00 FE FF  UTF-32, big-endian
 // FF FE 00 00  UTF-32, little-endian
 // FE FF        UTF-16, big-endian
 // FF FE        UTF-16, little-endian
 // EF BB BF     UTF-8
 // See: http://www.unicode.org/faq/utf_bom.html
 //
 // TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files.
 // We currently handle only utf-8 BOM because we assume the resources
 // we get are not in utf-16 or utf-32 when we read and parse them, anyway.
 bool StripUtf8Bom(StringPiece* contents);

 // Return the charset string for the given contents' BOM if any. If the
 // contents start with one of the BOMs defined above then the corresponding
 // charset is returned, otherwise an empty StringPiece.
 const StringPiece GetCharsetForBom(const StringPiece contents);

 }  // namespace net_instaweb

 #endif  // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
	/*
	* Copyright 2012 Google Inc.
	*
	* Licensed under the Apache License, Version 2.0 (the "License");
	* you may not use this file except in compliance with the License.
	* You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/

	// Author: matterbury@google.com (Matt Atterbury)

	// A set of utility functions for handling character sets/encodings and
	// related concepts like byte-order-marks (BOM). Currently the only methods
	// relate to BOMs.

	#ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
	#define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_

	#include "pagespeed/kernel/base/string_util.h"

	namespace net_instaweb {

	// The charsets we understand. Currently only those that have BOMs below.
	const char kUtf8Charset[] = "utf-8";
	const char kUtf16BigEndianCharset[] = "utf-16be";
	const char kUtf16LittleEndianCharset[] = "utf-16le";
	const char kUtf32BigEndianCharset[] = "utf-32be";
	const char kUtf32LittleEndianCharset[] = "utf-32le";

	// The Byte-Order-Mark (BOM) for the various UTF encodings.
	const char kUtf8Bom[] = "\xEF\xBB\xBF";
	const char kUtf16BigEndianBom[] = "\xFE\xFF";
	const char kUtf16LittleEndianBom[] = "\xFF\xFE";
	const char kUtf32BigEndianBom[] = "\x00\x00\xFE\xFF";
	const char kUtf32LittleEndianBom[] = "\xFF\xFE\x00\x00";

	// Strips any initial UTF-8 BOM (Byte Order Mark) from the given contents.
	// Returns true if a BOM was stripped, false if not.
	//
	// In addition to specifying the encoding in the ContentType header, one
	// can also specify it at the beginning of the file using a Byte Order Mark.
	//
	// Bytes Encoding Form
	// 00 00 FE FF UTF-32, big-endian
	// FF FE 00 00 UTF-32, little-endian
	// FE FF UTF-16, big-endian
	// FF FE UTF-16, little-endian
	// EF BB BF UTF-8
	// See: http://www.unicode.org/faq/utf_bom.html
	//
	// TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files.
	// We currently handle only utf-8 BOM because we assume the resources
	// we get are not in utf-16 or utf-32 when we read and parse them, anyway.
	bool StripUtf8Bom(StringPiece* contents);

	// Return the charset string for the given contents' BOM if any. If the
	// contents start with one of the BOMs defined above then the corresponding
	// charset is returned, otherwise an empty StringPiece.
	const StringPiece GetCharsetForBom(const StringPiece contents);

	} // namespace net_instaweb

	#endif // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_