| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, |
| * software distributed under the License is distributed on an |
| * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| * KIND, either express or implied. See the License for the |
| * specific language governing permissions and limitations |
| * under the License. |
| */ |
| |
| // A set of utility functions for handling character sets/encodings and |
| // related concepts like byte-order-marks (BOM). Currently the only methods |
| // relate to BOMs. |
| |
| #ifndef PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_ |
| #define PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_ |
| |
| #include "pagespeed/kernel/base/string_util.h" |
| |
| namespace net_instaweb { |
| |
| // Lowercase names of the charsets we understand. Currently only those that have a BOM defined below. |
| const char kUtf8Charset[] = "utf-8"; |
| const char kUtf16BigEndianCharset[] = "utf-16be"; |
| const char kUtf16LittleEndianCharset[] = "utf-16le"; |
| const char kUtf32BigEndianCharset[] = "utf-32be"; |
| const char kUtf32LittleEndianCharset[] = "utf-32le"; |
| |
| // The Byte-Order-Mark (BOM) for the various UTF encodings. Both UTF-32 BOMs contain NUL bytes, so measure them with sizeof() - 1, never strlen(). |
| const char kUtf8Bom[] = "\xEF\xBB\xBF"; |
| const char kUtf16BigEndianBom[] = "\xFE\xFF"; |
| const char kUtf16LittleEndianBom[] = "\xFF\xFE"; |
| const char kUtf32BigEndianBom[] = "\x00\x00\xFE\xFF"; |
| const char kUtf32LittleEndianBom[] = "\xFF\xFE\x00\x00"; |
| |
| // Strips a leading UTF-8 BOM (Byte Order Mark), if present, from *contents. |
| // Returns true if a BOM was stripped, false if not. BOMs of the other |
| // encodings listed below are left in place (see the TODO). |
| // In addition to specifying the encoding in the Content-Type header, one |
| // can also specify it at the beginning of the file using a Byte Order Mark: |
| // |
| //   Bytes         Encoding Form |
| //   00 00 FE FF   UTF-32, big-endian |
| //   FF FE 00 00   UTF-32, little-endian |
| //   FE FF         UTF-16, big-endian |
| //   FF FE         UTF-16, little-endian |
| //   EF BB BF      UTF-8 |
| // See: http://www.unicode.org/faq/utf_bom.html |
| // |
| // TODO(nforman): Possibly handle stripping BOMs from non-utf-8 files. |
| // We currently handle only the utf-8 BOM because we assume the resources |
| // we get are not in utf-16 or utf-32 when we read and parse them, anyway. |
| bool StripUtf8Bom(StringPiece* contents); |
| |
| // Returns the charset string for the BOM at the start of the given contents, if any: |
| // the charset corresponding to whichever BOM defined above the contents start with, else an empty |
| // StringPiece. NOTE(review): FF FE 00 00 starts with both little-endian BOMs; confirm the implementation checks UTF-32 before UTF-16. |
| const StringPiece GetCharsetForBom(const StringPiece contents); |
| |
| } // namespace net_instaweb |
| |
| #endif // PAGESPEED_KERNEL_BASE_CHARSET_UTIL_H_ |