blob: 671743fe62a14d8def30a7dd8bf82ef2cec9c57e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.html;
import static org.junit.Assert.assertArrayEquals;
import static org.junit.Assert.assertEquals;
import java.io.BufferedInputStream;
import java.io.ByteArrayInputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.SequenceInputStream;
import java.nio.charset.Charset;
import java.nio.charset.StandardCharsets;
import org.junit.Before;
import org.junit.Test;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.html.charsetdetector.StandardHtmlEncodingDetector;
import org.apache.tika.parser.html.charsetdetector.charsets.ReplacementCharset;
public class StandardHtmlEncodingDetectorTest {
private Metadata metadata = new Metadata();
@Before
public void setUp() {
this.metadata = new Metadata();
}
@Test
public void basic() throws IOException {
assertWindows1252("<meta charset=WINDOWS-1252>");
}
@Test
public void quoted() throws IOException {
assertWindows1252("<meta charset='WINDOWS-1252'>");
}
@Test
public void duplicateMeta() throws IOException {
assertWindows1252("<meta charset='WINDOWS-1252'>" + "<meta charset='UTF-8'>");
}
@Test
public void duplicateAttribute() throws IOException {
assertWindows1252("<meta charset='WINDOWS-1252' charset='UTF-8'>");
}
@Test
public void invalidThenValid() throws IOException {
assertCharset("<meta charset=blah>" + "<meta charset=WINDOWS-1252>", null);
}
@Test
public void spacesInAttributes() throws IOException {
assertWindows1252("<meta charset\f= \t WINDOWS-1252>");
}
@Test
public void httpEquiv() throws IOException {
assertWindows1252("<meta " + "http-equiv='content-type' " +
"content='text/html; charset=\"WINDOWS-1252\"'>"); // quotes around the
// charset are allowed
assertWindows1252("<meta " + "content=' charset = WINDOWS-1252' " +
// The charset may be anywhere in the content attribute
"http-equiv='content-type' >");
}
@Test
public void emptyAttributeEnd() throws IOException {
assertWindows1252("<meta charset=WINDOWS-1252 a>");
}
@Test
public void httpEquivDuplicateCharset() throws IOException {
assertWindows1252(
"<meta " + "http-equiv='content-type' " + "content='charset=WINDOWS-1252;" +
// The detection should stop after the semicolon
"charset=UTF-8'>");
}
@Test
public void htmlFragment() throws IOException {
assertWindows1252("<!doctype html><html class=nojs><head><meta charset='WINDOWS-1252'>");
}
@Test
public void veryBadHtml() throws IOException {
// check that the parser is not confused by garbage before the declaration
assertWindows1252("<< l \" == / '=x\n >" + "<!--> " + "< <x'/ <=> " + "<meta/>" + "<meta>" +
"<a x/>" + "<meta charset='WINDOWS-1252'>");
}
@Test
public void specialTag() throws IOException {
// special tags cannot have arguments, any '>' ends them
assertWindows1252("<? x='><meta charset='WINDOWS-1252'>");
}
@Test
public void longHtml() throws IOException {
StringBuilder sb = new StringBuilder(
"<!doctype html>\n" + "<html>\n" + "<head>\n" + "<title>Hello world</title>\n");
String repeated = "<meta x='y' />\n";
String charsetMeta = "<meta charset='windows-1252'>";
while (sb.length() + repeated.length() + charsetMeta.length() < 1024) sb.append(repeated);
sb.append(charsetMeta);
assertWindows1252(sb.toString());
}
@Test
public void tooLong() throws IOException {
// Create a string with 1Mb of '\0' followed by a meta
String padded = new String(new byte[1000000], StandardCharsets.ISO_8859_1) +
"<meta charset='windows-1252'>";
// Only the first bytes should be prescanned, so the algorithm should stop before
// the meta tag
assertCharset(padded, null);
}
@Test
public void incompleteMeta() throws IOException {
assertCharset("<meta charset='WINDOWS-1252'", null); // missing '>' at the end
}
@Test
public void charsetWithWhiteSpaces() throws IOException {
assertWindows1252("<meta charset=' \t\n WINDOWS-1252 \t\n'>");
}
@Test
public void mixedCase() throws IOException {
assertWindows1252("<mEtA chArsEt='WInDOWs-1252'>");
}
@Test
public void utf16() throws IOException {
// According to the specification 'If charset is a UTF-16 encoding, then set
// charset to UTF-8.'
assertCharset("<meta charset='UTF-16BE'>", StandardCharsets.UTF_8);
}
@Test
public void xUserDefined() throws IOException {
// According to the specification 'If charset is x-user-defined, then set charset
// to windows-1252.'
assertWindows1252("<meta charset='x-user-defined'>");
}
@Test
public void replacement() throws IOException {
// Several dangerous charsets should are aliases of 'replacement' in the spec
String inString = "<meta charset='iso-2022-cn'>";
assertCharset(new ByteArrayInputStream(inString.getBytes(StandardCharsets.ISO_8859_1)),
new ReplacementCharset());
}
@Test
public void iso88591() throws IOException {
// In the spec, iso-8859-1 is an alias for WINDOWS-1252
assertWindows1252("<meta charset='iso-8859-1'>");
}
@Test
public void macintoshEncoding() throws IOException {
// The mac roman encoding exists in java, but under the name x-MacRoman
assertCharset("<meta charset='macintosh'>", Charset.forName("x-MacRoman"));
}
@Test
public void bom() throws IOException {
// A BOM should have precedence over the meta
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
}
@Test
public void withSlash() throws IOException {
assertWindows1252("<meta/charset='WINDOWS-1252'>");
}
@Test
public void insideDescription() throws IOException {
assertWindows1252("<meta name='description'" +
"content='If I write charset=UTF-8 here, it doesnt mean the page is in UTF-8'/>" +
"<meta charset='WINDOWS-1252'>");
}
@Test
public void insideTag() throws IOException {
assertWindows1252("<tag " + "attribute=\"<meta charset='UTF-8'>\" " + // inside attribute
"<meta charset='UTF-8' " + // still inside tag
"/>" + // tag end
"<meta charset='WINDOWS-1252'>");
}
@Test
public void missingAttribute() throws IOException {
assertWindows1252("<meta content='charset=UTF-8'>" + // missing http-equiv attribute
"<meta charset='WINDOWS-1252'>" // valid declaration
);
}
@Test
public void insideSpecialTag() throws IOException {
// Content inside <?, <!, and </ should be ignored
for (byte b : "?!/".getBytes(StandardCharsets.US_ASCII))
assertWindows1252("<" + (char) b + // start comment
"<meta charset='UTF-8'>" + // inside special tag
"<meta charset='WINDOWS-1252'>" // real charset declaration
);
}
@Test
public void spaceBeforeTag() throws IOException {
assertWindows1252("< meta charset='UTF-8'>" + // invalid charset declaration
"<meta charset='WINDOWS-1252'>" // real charset declaration
);
}
@Test
public void invalidAttribute() throws IOException {
assertWindows1252("<meta " + "badcharset='UTF-8' " + // invalid charset declaration
"charset='WINDOWS-1252'>" // real charset declaration
);
}
@Test
public void unmatchedQuote() throws IOException {
assertWindows1252("<meta http-equiv='content-type' content='charset=\"UTF-8'>" +
// invalid charset declaration
"<meta charset='WINDOWS-1252'>" // real charset declaration
);
}
@Test
public void realWorld() throws IOException {
assertWindows1252("<!DOCTYPE html>\n" + "<html lang=\"fr\">\n" + "<head>\n" +
"<script>(function(w,d,s,l,i){w[l]=w[l]||[];w[l].push({'gtm.start':\n" +
"\t\t\tnew Date().getTime(),event:'gtm.js'});var " +
"f=d.getElementsByTagName(s)[0],\n" +
"\t\t\tj=d.createElement(s),dl=l!='dataLayer'?'&l='+l:'';j.async=true;j.src=\n" +
"\t\t\t'https://www.googletagmanager.com/gtm.js?id='+i+dl;" +
"f.parentNode.insertBefore(j,f);\n" +
"\t\t\t})(window,document,'script','dataLayer','GTM-PNX8H8X');</script>\n" +
"<title>Horaires Transilien 2018 - Lignes A B C D E H J K L N P R U</title>\n" +
"<meta name=\"description\" content=\"Consultez les horaires du Transilien en " +
"temps réel. Lignes A et B du RER. Lignes C " +
"D E H J K L N P R U du Transilien.\">\n" +
"<meta name=\"keywords\" content=\"horaires transilien\">\n" +
"<meta charset=\"windows-1252\">\n" +
"<meta name=\"viewport\" content=\"width=device-width, initial-scale=1.0\">\n" +
"<meta name=\"robots\" content=\"follow, index\">\n" + "<base hr");
}
@Test
public void withCompactComment() throws IOException {
// <!--> is a valid comment
assertWindows1252("<!--" + // start comment
"<meta charset='UTF-8'>" + // inside comment
"-->" + // end comment
"<!-->" + // compact comment
"<meta charset='WINDOWS-1252'>" // outside comment, charset declaration
);
}
@Test
public void withCharsetInContentType() throws IOException {
metadata.set(Metadata.CONTENT_TYPE, "text/html; Charset=ISO-8859-1");
// ISO-8859-1 is an alias for WINDOWS-1252, even if it's set at the transport layer level
assertWindows1252("");
assertWindows1252("<meta charset='UTF-8'>");
assertWindows1252("<meta http-equiv='content-type' content='charset=utf-8'>");
// if a BOM is present, it has precedence over transport layer information
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_8);
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16LE);
assertCharset("\ufeff<meta charset='WINDOWS-1252'>", StandardCharsets.UTF_16BE);
}
@Test
public void throwResistance() throws IOException {
// The preprocessing should return right after having found the charset
// So if an error is thrown in the stream AFTER the declaration,
// it shouldn't see it
assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'>"));
assertWindows1252(throwAfter("<meta charset='WINDOWS-1252'><some other tag"));
// But if an error is thrown before the end of the meta tag, it should see it
// and return unsuccessfully
assertCharset(throwAfter("<meta charset='WINDOWS-1252'"), null);
// If there is no meta, but an error is thrown, the detector simply returns
// unsuccessfully (it should not throw runtime errors)
assertCharset(throwAfter("<"), null);
assertCharset(throwAfter("<!"), null);
assertCharset(throwAfter("<!doctype"), null);
assertCharset(throwAfter("<!doctype html><html"), null);
assertCharset(throwAfter("<!doctype html><html attr"), null);
assertCharset(throwAfter("<!doctype html><html attr="), null);
assertCharset(throwAfter("<!doctype html><html attr=x"), null);
assertCharset(throwAfter("<!doctype html><html attr='x"), null);
}
@Test
public void streamReset() throws IOException {
// The stream should be reset after detection
byte[] inBytes = {0, 1, 2, 3, 4};
byte[] outBytes = new byte[5];
InputStream inStream = new ByteArrayInputStream(inBytes);
detectCharset(inStream);
// The stream should still be readable from the beginning after detection
inStream.read(outBytes);
assertArrayEquals(inBytes, outBytes);
}
private void assertWindows1252(String html) throws IOException {
assertCharset(html, Charset.forName("WINDOWS-1252"));
}
private void assertWindows1252(InputStream inStream) throws IOException {
assertCharset(inStream, Charset.forName("WINDOWS-1252"));
}
private void assertCharset(String html, Charset charset) throws IOException {
final Charset contentsCharset = (charset == null) ? StandardCharsets.UTF_8 : charset;
InputStream inStream = new ByteArrayInputStream(html.getBytes(contentsCharset));
final Charset detected = detectCharset(inStream);
assertEquals(html + " should be detected as " + charset, charset, detected);
}
private void assertCharset(InputStream inStream, Charset charset) throws IOException {
final Charset detected = detectCharset(inStream);
assertEquals(charset, detected);
}
private Charset detectCharset(InputStream inStream) throws IOException {
return new StandardHtmlEncodingDetector().detect(inStream, metadata);
}
private InputStream throwAfter(String html) {
byte[] contents = html.getBytes(StandardCharsets.UTF_8);
InputStream contentsInStream = new ByteArrayInputStream(contents);
InputStream errorThrowing = new InputStream() {
@Override
public int read() throws IOException {
throw new IOException("test exception");
}
};
return new BufferedInputStream(new SequenceInputStream(contentsInStream, errorThrowing));
}
}