blob: 8fb8011e2bd10118e43c562663b8d8c42be75151 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
// A set of utility functions for handling character sets/encodings and
// related concepts like byte-order-marks (BOM). Currently the only methods
// relate to BOMs.
#ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
#define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_
#include "pagespeed/kernel/base/string_util.h"
namespace net_instaweb {
// The charsets we understand. Currently only those that have BOMs below.
//
// Each constant is a NUL-terminated, lowercase charset name. Because they are
// namespace-scope `const` arrays defined in a header, every translation unit
// that includes this file gets its own copy (internal linkage).
const char kUtf8Charset[] = "utf-8";
const char kUtf16BigEndianCharset[] = "utf-16be";
const char kUtf16LittleEndianCharset[] = "utf-16le";
const char kUtf32BigEndianCharset[] = "utf-32be";
const char kUtf32LittleEndianCharset[] = "utf-32le";
// The Byte-Order-Mark (BOM) for the various UTF encodings.
//
// CAUTION: the UTF-32 BOMs contain embedded NUL bytes, so these arrays must
// not be measured with strlen() or converted through a `const char*`-only
// constructor: kUtf32BigEndianBom would appear empty, and
// kUtf32LittleEndianBom would be truncated to "\xFF\xFE", which is
// indistinguishable from kUtf16LittleEndianBom. Use `sizeof(array) - 1`
// (the array size minus the implicit terminating NUL) as the BOM length.
//
// Note also that the UTF-32 little-endian BOM begins with the same two bytes
// as the UTF-16 little-endian BOM, so prefix matching has to test the longer
// UTF-32 BOM first to tell them apart.
const char kUtf8Bom[] = "\xEF\xBB\xBF";
const char kUtf16BigEndianBom[] = "\xFE\xFF";
const char kUtf16LittleEndianBom[] = "\xFF\xFE";
const char kUtf32BigEndianBom[] = "\x00\x00\xFE\xFF";
const char kUtf32LittleEndianBom[] = "\xFF\xFE\x00\x00";
// Strips any initial UTF-8 BOM (Byte Order Mark) from the given contents.
// Returns true if a BOM was stripped, false if not.
//
// `contents` is an in/out parameter: it is passed by pointer so that the
// caller's StringPiece can be updated when a BOM is removed.
// NOTE(review): presumably `contents` must be non-null and only the view is
// adjusted (the underlying bytes are left alone) -- confirm against the
// definition, which is not part of this header.
//
// In addition to specifying the encoding in the Content-Type header, one
// can also specify it at the beginning of the file using a Byte Order Mark.
//
//   Bytes         Encoding Form
//   00 00 FE FF   UTF-32, big-endian
//   FF FE 00 00   UTF-32, little-endian
//   FE FF         UTF-16, big-endian
//   FF FE         UTF-16, little-endian
//   EF BB BF      UTF-8
//
// See: http://www.unicode.org/faq/utf_bom.html
//
// TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files.
// We currently handle only the utf-8 BOM because we assume the resources
// we get are not in utf-16 or utf-32 when we read and parse them, anyway.
bool StripUtf8Bom(StringPiece* contents);
// Returns the charset string for the given contents' BOM, if any. If the
// contents start with one of the BOMs defined above then the corresponding
// charset is returned, otherwise an empty StringPiece.
//
// `contents` is taken by value and is only inspected, not modified for the
// caller.
// NOTE(review): the returned charset is presumably one of the k*Charset
// constants declared in this header -- confirm against the definition.
// NOTE(review): the `const` on the by-value return type is part of the
// function type and must stay in sync with the out-of-line definition; do not
// drop it here without changing the definition as well.
const StringPiece GetCharsetForBom(const StringPiece contents);
} // namespace net_instaweb
#endif // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_