blob: 574f881bec6a9efb6c9ac268fcdd052103123505 [file] [log] [blame]
/*
* Copyright 2012 Google Inc.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
// Author: matterbury@google.com (Matt Atterbury)
// A set of utility functions for handling character sets/encodings and
// related concepts like byte-order-marks (BOM). Currently the only methods
// relate to BOMs.
#ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
#define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
#include "pagespeed/kernel/base/string_util.h"
namespace net_instaweb {
// The charsets we understand. Currently only those that have BOMs below.
const char kUtf8Charset[] = "utf-8";
const char kUtf16BigEndianCharset[] = "utf-16be";
const char kUtf16LittleEndianCharset[] = "utf-16le";
const char kUtf32BigEndianCharset[] = "utf-32be";
const char kUtf32LittleEndianCharset[] = "utf-32le";
// The Byte-Order-Mark (BOM) for the various UTF encodings.
const char kUtf8Bom[] = "\xEF\xBB\xBF";
const char kUtf16BigEndianBom[] = "\xFE\xFF";
const char kUtf16LittleEndianBom[] = "\xFF\xFE";
const char kUtf32BigEndianBom[] = "\x00\x00\xFE\xFF";
const char kUtf32LittleEndianBom[] = "\xFF\xFE\x00\x00";
// Strips any initial UTF-8 BOM (Byte Order Mark) from the given contents.
// Returns true if a BOM was stripped, false if not.
//
// In addition to specifying the encoding in the ContentType header, one
// can also specify it at the beginning of the file using a Byte Order Mark.
//
// Bytes Encoding Form
// 00 00 FE FF UTF-32, big-endian
// FF FE 00 00 UTF-32, little-endian
// FE FF UTF-16, big-endian
// FF FE UTF-16, little-endian
// EF BB BF UTF-8
// See: http://www.unicode.org/faq/utf_bom.html
//
// TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files.
// We currently handle only utf-8 BOM because we assume the resources
// we get are not in utf-16 or utf-32 when we read and parse them, anyway.
bool StripUtf8Bom(StringPiece* contents);
// Return the charset string for the given contents' BOM if any. If the
// contents start with one of the BOMs defined above then the corresponding
// charset is returned, otherwise an empty StringPiece.
const StringPiece GetCharsetForBom(const StringPiece contents);
} // namespace net_instaweb
#endif // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_