blob: c1b69ab2b51e0262b399845f82899f7729bbb02a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html.charsetdetector;
import java.io.BufferedInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import java.util.AbstractMap;
import java.util.BitSet;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * A scanner meant to detect charset meta tags in a byte stream.
 * <p>
 * It follows the "prescan a byte stream to determine its encoding" algorithm of the HTML
 * specification. See:
 * https://html.spec.whatwg.org/multipage/parsing.html#prescan-a-byte-stream-to-determine-its-encoding
 * <p>
 * Look-ahead is implemented with {@code mark}/{@code reset} on an internal
 * {@link BufferedInputStream}. Reaching the end of the stream in the middle of a construct is
 * signalled internally with an {@link IOException}, which terminates the scan.
 */
class PreScanner {

    // The spec asks for an ASCII case-insensitive match of the word "charset"
    private static final Pattern CHARSET_PATTERN =
            Pattern.compile("charset\\s*=\\s*([\"']?)([^\"'\\s;]+)\\1", Pattern.CASE_INSENSITIVE);
    private static final byte[] COMMENT_START = {(byte) '<', (byte) '!', (byte) '-', (byte) '-'};
    private static final byte[] COMMENT_END = {(byte) '-', (byte) '-', (byte) '>'};
    // Closes a comment when its first dash is the last dash of "<!--" (i.e. "<!--->")
    private static final byte[] COMMENT_SHORT_END = {(byte) '-', (byte) '>'};
    private static final byte[] META_TAG_START =
            {(byte) '<', (byte) 'm', (byte) 'e', (byte) 't', (byte) 'a'};
    private static final byte SLASH = (byte) '/';
    private static final byte EQUAL = (byte) '=';
    private static final byte TAG_START = (byte) '<';
    private static final byte TAG_END = (byte) '>';
    private static final BitSet QUOTE = bitSet('"', '\'');
    // TAB, LF, FF, CR and SPACE
    private static final BitSet WHITESPACE = bitSet(0x09, 0x0A, 0x0C, 0x0D, 0x20);
    private static final BitSet SPACE_OR_TAG_END = bitSet(WHITESPACE, TAG_END);
    private static final BitSet SPACE_OR_SLASH = bitSet(WHITESPACE, SLASH);
    private static final BitSet SPECIAL_TAGS = bitSet('!', '/', '?');
    private static final byte[] UTF8_BOM = {(byte) 0xEF, (byte) 0xBB, (byte) 0xBF};
    private static final byte[] UTF16_BE_BOM = {(byte) 0xFE, (byte) 0xFF};
    private static final byte[] UTF16_LE_BOM = {(byte) 0xFF, (byte) 0xFE};
    private static final byte LOWER_A = (byte) 'a';
    private static final byte LOWER_Z = (byte) 'z';
    private static final byte UPPER_A = (byte) 'A';
    private static final byte UPPER_Z = (byte) 'Z';

    private final BufferedInputStream stream;
    private final CharsetDetectionResult detectedCharset = CharsetDetectionResult.notFound();

    /**
     * Creates a scanner reading from the given stream. The stream is wrapped in a
     * {@link BufferedInputStream} so that mark/reset is always available.
     *
     * @param inputStream the bytes to scan
     */
    PreScanner(InputStream inputStream) {
        this.stream = new BufferedInputStream(inputStream);
    }

    /**
     * Builds a set of byte values (each given as an int in the range 0-255).
     */
    private static BitSet bitSet(int... bs) {
        BitSet bitSet = new BitSet(0x100);
        for (int b : bs) {
            bitSet.set(b);
        }
        return bitSet;
    }

    /**
     * Builds a set of byte values containing everything in {@code base} plus the given values.
     * {@code base} itself is not modified.
     */
    private static BitSet bitSet(BitSet base, int... bs) {
        BitSet bitSet = (BitSet) base.clone();
        for (int b : bs) {
            bitSet.set(b);
        }
        return bitSet;
    }

    /**
     * Extracts the charset name from a string of the form {@code ...charset=name...}, where the
     * name may optionally be wrapped in matching single or double quotes.
     *
     * @param attributeValue the text to search, may be null
     * @return the charset name as it appears in the input, or null if the input is null or
     * contains no charset declaration
     */
    static String getEncodingFromMeta(String attributeValue) {
        if (attributeValue == null) {
            return null;
        }
        Matcher matcher = CHARSET_PATTERN.matcher(attributeValue);
        if (!matcher.find()) {
            return null;
        }
        return matcher.group(2);
    }

    /**
     * Tests whether the set contains the given byte, treating the byte as unsigned.
     */
    private static boolean contains(BitSet bitSet, byte b) {
        return bitSet.get(b & 0xFF);
    }

    /**
     * Widens a byte to a char without sign extension, so that bytes above 0x7F map to
     * U+0080..U+00FF rather than U+FF80..U+FFFF.
     */
    private static char toChar(byte b) {
        return (char) (b & 0xFF);
    }

    /**
     * Consumes the stream until a charset has been detected from a meta tag, or until the
     * stream cannot be processed any further (end of stream or read error).
     *
     * @return the detected charset, or null if none was found
     */
    Charset scan() {
        while (processAtLeastOneByte()) {
            if (detectedCharset.isFound()) {
                return detectedCharset.getCharset();
            }
        }
        return null;
    }

    /**
     * Checks whether the stream starts, at its current position, with a UTF-8 or UTF-16 byte
     * order mark. A matching BOM is consumed; when none matches, the bytes that were inspected
     * are pushed back.
     *
     * @return the charset announced by the BOM, or null if there is no BOM or the stream could
     * not be read
     */
    Charset detectBOM() {
        try {
            if (expect(UTF8_BOM)) {
                return StandardCharsets.UTF_8;
            } else if (expect(UTF16_BE_BOM)) {
                return StandardCharsets.UTF_16BE;
            } else if (expect(UTF16_LE_BOM)) {
                return StandardCharsets.UTF_16LE;
            }
        } catch (IOException e) { /* stream could not be read, also return null */ }
        return null;
    }

    /**
     * Tries each construct in turn (comment, meta tag, regular tag, special tag, any byte) and
     * consumes the first one that matches at the current position.
     *
     * @return true if something was consumed, false on end of stream or read error
     */
    private boolean processAtLeastOneByte() {
        try {
            return processComment() || processMeta() || processTag() || processSpecialTag() ||
                    processAny();
        } catch (IOException e) {
            // Deliberate: running out of bytes (or failing to read) simply ends the scan
            return false;
        }
    }

    /**
     * Consumes a single byte.
     *
     * @return false if the end of the stream was reached
     */
    private boolean processAny() throws IOException {
        int read = stream.read();
        return read != -1;
    }

    /**
     * Consumes a start or end tag ({@code <x...>} or {@code </x...>}, where x is an ASCII
     * letter) together with all of its attributes.
     *
     * @return true if a tag was consumed; otherwise the stream position is left unchanged
     */
    private boolean processTag() throws IOException {
        stream.mark(3); // at most "</x" is read before deciding
        if (read() == TAG_START) {
            int read = stream.read();
            if (read == SLASH) {
                read = stream.read();
            }
            if ((LOWER_A <= read && read <= LOWER_Z) || (UPPER_A <= read && read <= UPPER_Z)) {
                // Skip the rest of the tag name, stopping right before the whitespace or '>'
                do {
                    stream.mark(1);
                } while (!contains(SPACE_OR_TAG_END, read()));
                stream.reset();
                while (getAttribute() != null) { /* ignore the attribute */ }
                return true;
            }
        }
        stream.reset();
        return false;
    }

    /**
     * Consumes a construct starting with {@code <!}, {@code </} or {@code <?}, up to and
     * including the next {@code >}.
     *
     * @return true if such a construct was consumed; otherwise the stream position is left
     * unchanged
     */
    private boolean processSpecialTag() throws IOException {
        stream.mark(2);
        if (read() == TAG_START && contains(SPECIAL_TAGS, read())) {
            skipUntil(TAG_END);
            return true;
        }
        stream.reset();
        return false;
    }

    /**
     * Consumes a meta tag and feeds its attributes to a {@code MetaProcessor}, which then
     * updates the detection result.
     *
     * @return true if a meta tag was consumed; otherwise the stream position is left unchanged
     */
    private boolean processMeta() throws IOException {
        stream.mark(6); // len("<meta ") == 6
        if (readCaseInsensitive(META_TAG_START) && contains(SPACE_OR_SLASH, read())) {
            MetaProcessor metaProcessor = new MetaProcessor();
            for (Map.Entry<String, String> attribute = getAttribute(); attribute != null;
                    attribute = getAttribute()) {
                metaProcessor.processAttribute(attribute);
            }
            metaProcessor.updateDetectedCharset(detectedCharset);
            return true;
        }
        stream.reset();
        return false;
    }

    /**
     * Read an attribute from the stream
     *
     * @return the attribute as a Map.Entry, where the key is the attribute's name and
     * the value is the attribute's value. If there is no attribute, return null
     */
    private Map.Entry<String, String> getAttribute() throws IOException {
        String name = getAttributeName();
        if (name == null) {
            return null;
        }
        if (!expect(EQUAL)) {
            // Attribute without a value
            return new AbstractMap.SimpleEntry<>(name, "");
        }
        skipAll(WHITESPACE);
        String value = getAttributeValue();
        return new AbstractMap.SimpleEntry<>(name, value);
    }

    /**
     * Reads a lower-cased attribute name. Leading whitespace and slashes are skipped, as is
     * any whitespace following the name.
     *
     * @return the attribute name, or null if the end of the tag ({@code >}) was reached, in
     * which case the {@code >} has been consumed
     */
    private String getAttributeName() throws IOException {
        skipAll(SPACE_OR_SLASH);
        if (expect(TAG_END)) {
            return null;
        }
        StringBuilder name = new StringBuilder();
        while (true) {
            byte next = peek();
            // An '=' only ends the name if the name is not empty
            if (next == EQUAL && name.length() > 0) {
                break;
            }
            if (next == TAG_END || next == SLASH) {
                break;
            }
            if (skipAll(WHITESPACE)) {
                break;
            }
            name.append(toChar(getLowerCaseChar()));
        }
        return name.toString();
    }

    /**
     * Reads a lower-cased attribute value. A quoted value is read up to the matching quote,
     * which is consumed; an unquoted value is read up to, but not including, the next
     * whitespace or {@code >}.
     *
     * @return the attribute value, without surrounding quotes
     */
    private String getAttributeValue() throws IOException {
        StringBuilder value = new StringBuilder();
        stream.mark(1);
        byte quote = read();
        if (contains(QUOTE, quote)) {
            for (byte b = getLowerCaseChar(); b != quote; b = getLowerCaseChar()) {
                value.append(toChar(b));
            }
        } else {
            stream.reset(); // the first byte is part of the value
            for (byte b = getLowerCaseChar(); !contains(SPACE_OR_TAG_END, b);
                    b = getLowerCaseChar()) {
                value.append(toChar(b));
                stream.mark(1);
            }
            stream.reset(); // unread the space or tag end
        }
        return value.toString();
    }

    /**
     * Skips every consecutive byte that belongs to the given set.
     *
     * @return true if at least one byte was skipped
     */
    private boolean skipAll(BitSet bitSet) throws IOException {
        boolean skipped = false;
        stream.mark(1);
        for (byte read = read(); contains(bitSet, read); read = read()) {
            skipped = true;
            stream.mark(1);
        }
        stream.reset(); // unread the first byte that is not in the set
        return skipped;
    }

    /**
     * Reads one byte, converting ASCII upper-case letters to lower case.
     */
    private byte getLowerCaseChar() throws IOException {
        byte nextPoint = read();
        if (nextPoint >= 'A' && nextPoint <= 'Z') {
            nextPoint += 0x20; // lowercase
        }
        return nextPoint;
    }

    /**
     * Consumes an HTML comment. The dashes that close the comment may overlap with those of
     * the opening {@code <!--}, so both {@code <!-->} and {@code <!--->} are complete comments.
     *
     * @return true if a comment was consumed; otherwise the stream position is left unchanged
     */
    private boolean processComment() throws IOException {
        if (!expect(COMMENT_START)) {
            return false;
        }
        if (!expect(TAG_END) && !expect(COMMENT_SHORT_END)) {
            skipUntil(COMMENT_END);
        }
        return true;
    }

    /**
     * Consumes the given byte sequence if the stream continues with it.
     *
     * @return true if the sequence was found and consumed; false if the upcoming bytes differ
     * or the stream ends first, in which case the stream position is left unchanged
     */
    private boolean expect(byte... expected) throws IOException {
        stream.mark(expected.length);
        for (byte b : expected) {
            int r = stream.read();
            // On end of stream, restore the position instead of leaving bytes consumed
            if (r == -1 || (byte) r != b) {
                stream.reset();
                return false;
            }
        }
        return true;
    }

    /**
     * Consumes bytes up to and including the first occurrence of the given sequence, or up to
     * the end of the stream if the sequence never occurs.
     */
    private void skipUntil(byte... expected) throws IOException {
        while (!expect(expected)) {
            if (stream.read() == -1) {
                return;
            }
        }
    }

    /**
     * Reads bytes while they match the given lower-case sequence, ignoring ASCII case.
     * On a mismatch the bytes read so far stay consumed; callers are expected to have set a
     * mark beforehand if they need to rewind.
     *
     * @return true if the whole sequence was matched
     */
    private boolean readCaseInsensitive(byte... bs) throws IOException {
        for (byte b : bs) {
            if (getLowerCaseChar() != b) {
                return false;
            }
        }
        return true;
    }

    /**
     * Reads one byte.
     *
     * @throws IOException if the end of the stream has been reached or the read fails
     */
    private byte read() throws IOException {
        int r = stream.read();
        if (r == -1) {
            throw new IOException("Unexpected end of stream");
        }
        return (byte) r;
    }

    /**
     * Returns the next byte without consuming it.
     *
     * @throws IOException if the end of the stream has been reached or the read fails
     */
    private byte peek() throws IOException {
        stream.mark(1);
        byte b = read();
        stream.reset();
        return b;
    }
}