| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.jasper.compiler; |
| |
| import java.io.BufferedInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| |
| import javax.xml.stream.XMLInputFactory; |
| import javax.xml.stream.XMLStreamException; |
| import javax.xml.stream.XMLStreamReader; |
| |
| /* |
| * The BoM detection is derived from: |
| * https://svn.us.apache.org/viewvc/tomcat/trunk/java/org/apache/jasper/xmlparser/XMLEncodingDetector.java?annotate=1742248 |
| * |
| * The prolog is always at least as specific as the BOM therefore any encoding |
| * specified in the prolog should take priority over the BOM. |
| */ |
| class EncodingDetector { |
| |
| private static final XMLInputFactory XML_INPUT_FACTORY; |
| static { |
| XML_INPUT_FACTORY = XMLInputFactory.newInstance(); |
| } |
| |
| private final String encoding; |
| private final int skip; |
| private final boolean encodingSpecifiedInProlog; |
| |
| |
| EncodingDetector(BufferedInputStream bis) throws IOException { |
| // Buffer is 1k. BOM is only 4 bytes. |
| bis.mark(4); |
| |
| BomResult bomResult = processBom(bis); |
| |
| // Reset the stream back to the start to allow the XML prolog detection |
| // to work. Skip any BoM we discovered. |
| bis.reset(); |
| for (int i = 0; i < bomResult.skip; i++) { |
| bis.read(); |
| } |
| |
| String prologEncoding = getPrologEncoding(bis); |
| if (prologEncoding == null) { |
| encodingSpecifiedInProlog = false; |
| encoding = bomResult.encoding; |
| } else { |
| encodingSpecifiedInProlog = true; |
| encoding = prologEncoding; |
| } |
| skip = bomResult.skip; |
| } |
| |
| |
| String getEncoding() { |
| return encoding; |
| } |
| |
| |
| int getSkip() { |
| return skip; |
| } |
| |
| |
| boolean isEncodingSpecifiedInProlog() { |
| return encodingSpecifiedInProlog; |
| } |
| |
| |
| private String getPrologEncoding(InputStream stream) { |
| String encoding = null; |
| try { |
| XMLStreamReader xmlStreamReader = XML_INPUT_FACTORY.createXMLStreamReader(stream); |
| encoding = xmlStreamReader.getCharacterEncodingScheme(); |
| } catch (XMLStreamException e) { |
| // Ignore |
| } |
| return encoding; |
| } |
| |
| |
| private BomResult processBom(InputStream stream) { |
| // Read first four bytes (or as many are available) and determine |
| // encoding |
| try { |
| final byte[] b4 = new byte[4]; |
| int count = 0; |
| int singleByteRead; |
| while (count < 4) { |
| singleByteRead = stream.read(); |
| if (singleByteRead == -1) { |
| break; |
| } |
| b4[count] = (byte) singleByteRead; |
| count++; |
| } |
| |
| return parseBom(b4, count); |
| } catch (IOException ioe) { |
| // Failed. |
| return new BomResult("UTF-8", 0); |
| } |
| } |
| |
| |
| private BomResult parseBom(byte[] b4, int count) { |
| |
| if (count < 2) { |
| return new BomResult("UTF-8", 0); |
| } |
| |
| // UTF-16, with BOM |
| int b0 = b4[0] & 0xFF; |
| int b1 = b4[1] & 0xFF; |
| if (b0 == 0xFE && b1 == 0xFF) { |
| // UTF-16, big-endian |
| return new BomResult("UTF-16BE", 2); |
| } |
| if (b0 == 0xFF && b1 == 0xFE) { |
| // UTF-16, little-endian |
| return new BomResult("UTF-16LE", 2); |
| } |
| |
| // default to UTF-8 if we don't have enough bytes to make a |
| // good determination of the encoding |
| if (count < 3) { |
| return new BomResult("UTF-8", 0); |
| } |
| |
| // UTF-8 with a BOM |
| int b2 = b4[2] & 0xFF; |
| if (b0 == 0xEF && b1 == 0xBB && b2 == 0xBF) { |
| return new BomResult("UTF-8", 3); |
| } |
| |
| // default to UTF-8 if we don't have enough bytes to make a |
| // good determination of the encoding |
| if (count < 4) { |
| return new BomResult("UTF-8", 0); |
| } |
| |
| // Other encodings. No BOM. Try and ID encoding. |
| int b3 = b4[3] & 0xFF; |
| if (b0 == 0x00 && b1 == 0x00 && b2 == 0x00 && b3 == 0x3C) { |
| // UCS-4, big endian (1234) |
| return new BomResult("ISO-10646-UCS-4", 0); |
| } |
| if (b0 == 0x3C && b1 == 0x00 && b2 == 0x00 && b3 == 0x00) { |
| // UCS-4, little endian (4321) |
| return new BomResult("ISO-10646-UCS-4", 0); |
| } |
| if (b0 == 0x00 && b1 == 0x00 && b2 == 0x3C && b3 == 0x00) { |
| // UCS-4, unusual octet order (2143) |
| // REVISIT: What should this be? |
| return new BomResult("ISO-10646-UCS-4", 0); |
| } |
| if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x00) { |
| // UCS-4, unusual octet order (3412) |
| // REVISIT: What should this be? |
| return new BomResult("ISO-10646-UCS-4", 0); |
| } |
| if (b0 == 0x00 && b1 == 0x3C && b2 == 0x00 && b3 == 0x3F) { |
| // UTF-16, big-endian, no BOM |
| // (or could turn out to be UCS-2... |
| // REVISIT: What should this be? |
| return new BomResult("UTF-16BE", 0); |
| } |
| if (b0 == 0x3C && b1 == 0x00 && b2 == 0x3F && b3 == 0x00) { |
| // UTF-16, little-endian, no BOM |
| // (or could turn out to be UCS-2... |
| return new BomResult("UTF-16LE", 0); |
| } |
| if (b0 == 0x4C && b1 == 0x6F && b2 == 0xA7 && b3 == 0x94) { |
| // EBCDIC |
| // a la xerces1, return CP037 instead of EBCDIC here |
| return new BomResult("CP037", 0); |
| } |
| |
| // default encoding |
| return new BomResult("UTF-8", 0); |
| } |
| |
| |
| private static class BomResult { |
| |
| public final String encoding; |
| public final int skip; |
| |
| public BomResult(String encoding, int skip) { |
| this.encoding = encoding; |
| this.skip = skip; |
| } |
| } |
| } |