pagespeed/kernel/base/charset_util.h - incubator-pagespeed-mod - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  */

 // A set of utility functions for handling character sets/encodings and
 // related concepts like byte-order-marks (BOM). Currently the only methods
 // relate to BOMs.

 #ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
 #define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_

 #include "pagespeed/kernel/base/string_util.h"

 namespace net_instaweb {

 // Names (all lowercase) of the charsets we understand. Currently these are
 // exactly the charsets that have a BOM constant defined below.
 //
 // These are namespace-scope const arrays defined in a header, so in C++ they
 // have internal linkage: each translation unit that includes this file gets
 // its own copy.
 const char kUtf8Charset[] = "utf-8";
 const char kUtf16BigEndianCharset[] = "utf-16be";
 const char kUtf16LittleEndianCharset[] = "utf-16le";
 const char kUtf32BigEndianCharset[] = "utf-32be";
 const char kUtf32LittleEndianCharset[] = "utf-32le";

 // The Byte-Order-Mark (BOM) for the various UTF encodings.
 //
 // Each array holds the BOM bytes followed by the string literal's implicit
 // terminating NUL, so sizeof() is one larger than the BOM length. The UTF-32
 // BOMs contain 0x00 bytes, so C-string functions such as strlen() do not
 // report their real length (0 for the big-endian BOM, 2 for the
 // little-endian one); use an explicit length, e.g.
 // sizeof(kUtf32BigEndianBom) - 1, when comparing against them.
 //
 // Note that the UTF-16 little-endian BOM (FF FE) is a byte prefix of the
 // UTF-32 little-endian BOM (FF FE 00 00), so a prefix match on the shorter
 // one also matches contents that start with the longer one.
 const char kUtf8Bom[] = "\xEF\xBB\xBF";
 const char kUtf16BigEndianBom[] = "\xFE\xFF";
 const char kUtf16LittleEndianBom[] = "\xFF\xFE";
 const char kUtf32BigEndianBom[] = "\x00\x00\xFE\xFF";
 const char kUtf32LittleEndianBom[] = "\xFF\xFE\x00\x00";

 // Strips any initial UTF-8 BOM (Byte Order Mark) from the given contents.
 // Returns true if a BOM was stripped, false if not.
 //
 // Parameters:
 //   contents - in/out view of the data to inspect, passed by pointer so the
 //              function can update it.
 //              NOTE(review): presumably only the view is advanced past the
 //              BOM and the underlying bytes are left untouched - confirm
 //              against the definition, which is not in this header.
 //
 // Background: in addition to specifying the encoding in the ContentType
 // header, one can also specify it at the beginning of the file using a Byte
 // Order Mark.
 //
 // Bytes        Encoding Form
 // 00 00 FE FF  UTF-32, big-endian
 // FF FE 00 00  UTF-32, little-endian
 // FE FF        UTF-16, big-endian
 // FF FE        UTF-16, little-endian
 // EF BB BF     UTF-8
 // See: http://www.unicode.org/faq/utf_bom.html
 //
 // TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files.
 // We currently handle only utf-8 BOM because we assume the resources
 // we get are not in utf-16 or utf-32 when we read and parse them, anyway.
 bool StripUtf8Bom(StringPiece* contents);

 // Returns the charset string for the given contents' BOM, if any.
 //
 // Parameters:
 //   contents - the data whose leading bytes are examined; taken by value.
 //
 // Returns: if the contents start with one of the BOMs defined above, the
 // corresponding charset (see the k*Charset constants); otherwise an empty
 // StringPiece.
 const StringPiece GetCharsetForBom(const StringPiece contents);

 }  // namespace net_instaweb

 #endif  // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*/

	// A set of utility functions for handling character sets/encodings and
	// related concepts like byte-order-marks (BOM). Currently the only methods
	// relate to BOMs.

	#ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
	#define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_

	#include "pagespeed/kernel/base/string_util.h"

	namespace net_instaweb {

	// Names (all lowercase) of the charsets we understand. Currently these are
	// exactly the charsets that have a BOM constant defined below.
	//
	// These are namespace-scope const arrays defined in a header, so in C++ they
	// have internal linkage: each translation unit that includes this file gets
	// its own copy.
	const char kUtf8Charset[] = "utf-8";
	const char kUtf16BigEndianCharset[] = "utf-16be";
	const char kUtf16LittleEndianCharset[] = "utf-16le";
	const char kUtf32BigEndianCharset[] = "utf-32be";
	const char kUtf32LittleEndianCharset[] = "utf-32le";

	// The Byte-Order-Mark (BOM) for the various UTF encodings.
	//
	// Each array holds the BOM bytes followed by the string literal's implicit
	// terminating NUL, so sizeof() is one larger than the BOM length. The UTF-32
	// BOMs contain 0x00 bytes, so C-string functions such as strlen() do not
	// report their real length (0 for the big-endian BOM, 2 for the
	// little-endian one); use an explicit length, e.g.
	// sizeof(kUtf32BigEndianBom) - 1, when comparing against them.
	//
	// Note that the UTF-16 little-endian BOM (FF FE) is a byte prefix of the
	// UTF-32 little-endian BOM (FF FE 00 00), so a prefix match on the shorter
	// one also matches contents that start with the longer one.
	const char kUtf8Bom[] = "\xEF\xBB\xBF";
	const char kUtf16BigEndianBom[] = "\xFE\xFF";
	const char kUtf16LittleEndianBom[] = "\xFF\xFE";
	const char kUtf32BigEndianBom[] = "\x00\x00\xFE\xFF";
	const char kUtf32LittleEndianBom[] = "\xFF\xFE\x00\x00";

	// Strips any initial UTF-8 BOM (Byte Order Mark) from the given contents.
	// Returns true if a BOM was stripped, false if not.
	//
	// Parameters:
	//   contents - in/out view of the data to inspect, passed by pointer so the
	//              function can update it.
	//              NOTE(review): presumably only the view is advanced past the
	//              BOM and the underlying bytes are left untouched - confirm
	//              against the definition, which is not in this header.
	//
	// Background: in addition to specifying the encoding in the ContentType
	// header, one can also specify it at the beginning of the file using a Byte
	// Order Mark.
	//
	// Bytes        Encoding Form
	// 00 00 FE FF  UTF-32, big-endian
	// FF FE 00 00  UTF-32, little-endian
	// FE FF        UTF-16, big-endian
	// FF FE        UTF-16, little-endian
	// EF BB BF     UTF-8
	// See: http://www.unicode.org/faq/utf_bom.html
	//
	// TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files.
	// We currently handle only utf-8 BOM because we assume the resources
	// we get are not in utf-16 or utf-32 when we read and parse them, anyway.
	bool StripUtf8Bom(StringPiece* contents);

	// Returns the charset string for the given contents' BOM, if any.
	//
	// Parameters:
	//   contents - the data whose leading bytes are examined; taken by value.
	//
	// Returns: if the contents start with one of the BOMs defined above, the
	// corresponding charset (see the k*Charset constants); otherwise an empty
	// StringPiece.
	const StringPiece GetCharsetForBom(const StringPiece contents);

	} // namespace net_instaweb

	#endif // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_