blob: fd2b0092d2ea9596edeb58010842e991a635b871 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser;
import static org.junit.Assert.fail;
import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.io.FileNotFoundException;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.nio.file.Files;
import java.nio.file.Path;
import javax.xml.parsers.SAXParser;
import javax.xml.parsers.SAXParserFactory;
import org.apache.commons.io.IOUtils;
import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
import org.junit.Ignore;
import org.junit.Test;
import org.xml.sax.ContentHandler;
import org.apache.tika.config.TikaConfig;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.microsoft.OfficeParserConfig;
import org.apache.tika.sax.ToHTMLContentHandler;
import org.apache.tika.utils.XMLReaderUtils;
/**
 * Tests XXE (XML external entity) prevention in XML-based files: straight XML
 * and zipped XMLs, e.g. ebook and ooxml.
 * <p>
 * It does not test for XXE prevention in files that may merely contain XML
 * files, such as PDFs and other XMP-containing files.
 */
public class TestXXEInXML extends XMLTestBase {

    //TODO: figure out how to test XFA and xmp in PDFs

    /** DOCTYPE declaration whose system identifier is a {@code file:} URL to a bogus path. */
    private static final byte[] XXE =
            "<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\" \"file:///couldnt_possibly_exist\">"
                    .getBytes(StandardCharsets.UTF_8);

    /** Minimal well-formed XML document used as the injection target in several tests. */
    private static final byte[] SIMPLE_XML =
            "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
                    .getBytes(StandardCharsets.UTF_8);

    /**
     * Confirms that a deliberately vulnerable parser trips over the XXE test file.
     * Ignored by default.
     */
    @Test
    @Ignore("ignore vulnerable tests")
    public void testConfirmVulnerable() throws Exception {
        try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
            parse("testXXE.xml", is, new VulnerableSAXParser(), new ParseContext());
            fail("should have failed!!!");
        } catch (FileNotFoundException e) {
            //expected
        }
    }

    /**
     * Parses the XXE test file with the auto-detect parser; passes if nothing is thrown.
     */
    @Test
    public void testXML() throws Exception {
        try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
            parse("testXXE.xml", is, AUTO_DETECT_PARSER, new ParseContext());
        }
    }

    /**
     * Sanity check of the injection itself: the vulnerable parser is expected to
     * throw a {@link FileNotFoundException} on the injected document.
     */
    @Test
    public void testInjectedXML() throws Exception {
        byte[] injected = injectXML(SIMPLE_XML, XXE);
        try {
            parse("injected", new ByteArrayInputStream(injected), new VulnerableSAXParser(),
                    new ParseContext());
            fail("injected should have triggered xxe");
        } catch (FileNotFoundException e) {
            //expected
        }
    }

    /**
     * Injects the XXE payload into the Word 2003 and 2006 XML test files and parses
     * each with the auto-detect parser; passes if nothing is thrown.
     */
    @Test
    public void test2003_2006xml() throws Exception {
        for (String fileName : new String[]{"testWORD_2003ml.xml", "testWORD_2006ml.xml"}) {
            byte[] injected = injectXML(readTestResource("/test-documents/" + fileName), XXE);
            parse(fileName, new ByteArrayInputStream(injected), AUTO_DETECT_PARSER,
                    new ParseContext());
        }
    }

    /**
     * Runs {@link #_testPOIOOXMLs(String)} over a set of docx/docm, xlsx/xlsm and
     * pptx/pptm test files.
     */
    @Test
    public void testPOIOOXMLs() throws Exception {
        for (String fileName : new String[]{"testWORD.docx", "testWORD_1img.docx",
                "testWORD_2006ml.docx", "testWORD_embedded_pics.docx", "testWORD_macros.docm",
                "testEXCEL_textbox.xlsx", "testEXCEL_macro.xlsm", "testEXCEL_phonetic.xlsx",
                "testEXCEL_embeddedPDF_windows.xlsx", "testPPT_2imgs.pptx",
                "testPPT_comment.pptx", "testPPT_EmbeddedPDF.pptx", "testPPT_macros.pptm"}) {
            _testPOIOOXMLs(fileName);
        }
    }

    /**
     * Injects the XXE payload into the XML parts of one OOXML test file and parses
     * the result twice: first with the default parse context, then with the SAX
     * docx/pptx extractors enabled.
     *
     * @param fileName name of a file under {@code /test-documents/}
     * @throws Exception on any unexpected parse or I/O problem
     */
    private void _testPOIOOXMLs(String fileName) throws Exception {
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/" + fileName))) {
            Path originalOOXML = tis.getPath();
            ParseContext parseContext = new ParseContext();

            //first pass: default configuration
            //if the SafeContentHandler is turned off, this will throw an FNFE
            Path injected = injectZippedXMLs(originalOOXML, XXE, false);
            try (InputStream is = Files.newInputStream(injected)) {
                AUTO_DETECT_PARSER
                        .parse(is, new ToHTMLContentHandler(), new Metadata(), parseContext);
            } catch (TikaException e) {
                Throwable cause = e.getCause();
                if (!(cause instanceof InvalidFormatException)) {
                    //as of POI 4.1.x
                    fail("POI should have thrown an IFE complaining about " +
                            "not being able to read content types part !");
                }
            } finally {
                //the stream above is already closed here, so the file can be removed
                Files.delete(injected);
            }

            //second pass: SAX-based extractors
            OfficeParserConfig officeParserConfig = new OfficeParserConfig();
            parseContext.set(OfficeParserConfig.class, officeParserConfig);
            officeParserConfig.setUseSAXDocxExtractor(true);
            officeParserConfig.setUseSAXPptxExtractor(true);
            //inject before entering the try so that a failure here cannot lead to a
            //second delete of the first pass's (already removed) file
            injected = injectZippedXMLs(originalOOXML, XXE, true);
            try (InputStream is = Files.newInputStream(injected)) {
                AUTO_DETECT_PARSER
                        .parse(is, new ToHTMLContentHandler(), new Metadata(), parseContext);
            } catch (FileNotFoundException e) {
                //keep the FNFE as the cause so its stack trace shows up in the report
                throw new AssertionError(
                        "problem with SAX-based: " + fileName + ": " + e.getMessage(), e);
            } finally {
                Files.delete(injected);
            }
        }
    }

    /**
     * Runs {@link #_testXMLInZips(String)} over zip-based XML containers (currently EPUB).
     */
    @Test
    public void testXMLInZips() throws Exception {
        for (String fileName : new String[]{"testEPUB.epub"}) {
            _testXMLInZips(fileName);
        }
    }

    /**
     * Injects the XXE payload into the XML entries of one zipped test file and
     * parses the result with the auto-detect parser; passes if nothing is thrown.
     *
     * @param fileName name of a file under {@code /test-documents/}
     * @throws Exception on any parse or I/O problem
     */
    private void _testXMLInZips(String fileName) throws Exception {
        final Path injected;
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/" + fileName))) {
            injected = injectZippedXMLs(tis.getPath(), XXE, false);
        }
        //if the SafeContentHandler is turned off, this will throw an FNFE
        try (InputStream is = Files.newInputStream(injected)) {
            AUTO_DETECT_PARSER
                    .parse(is, new ToHTMLContentHandler(), new Metadata(), new ParseContext());
        } finally {
            Files.delete(injected);
        }
    }

    /**
     * Builds a DOM from the injected document more often than the reader pool
     * size reported by {@link XMLReaderUtils#getPoolSize()}.
     */
    @Test
    public void testDOM() throws Exception {
        byte[] injected = injectXML(SIMPLE_XML, XXE);
        for (int i = 0; i < XMLReaderUtils.getPoolSize() * 2; i++) {
            //this shouldn't throw an exception
            XMLReaderUtils.buildDOM(new ByteArrayInputStream(injected), new ParseContext());
        }
    }

    /**
     * Manual check, intentionally not annotated with {@code @Test}.
     * Use this to confirm that this works by manually turning off the
     * SafeContentHandler in SXWPFWordExtractorDecorator's handlePart.
     */
    public void testDocxWithIncorrectSAXConfiguration() throws Exception {
        final Path injected;
        try (TikaInputStream tis = TikaInputStream
                .get(getResourceAsStream("/test-documents/testWORD_macros.docm"))) {
            injected = injectZippedXMLs(tis.getPath(), XXE, true);
        }
        ParseContext parseContext = new ParseContext();
        OfficeParserConfig officeParserConfig = new OfficeParserConfig();
        officeParserConfig.setUseSAXDocxExtractor(true);
        parseContext.set(OfficeParserConfig.class, officeParserConfig);
        //hand the parser a SAXParser from a default, unconfigured factory
        parseContext.set(SAXParser.class, SAXParserFactory.newInstance().newSAXParser());
        //if the SafeContentHandler is turned off, this will throw an FNFE
        try (InputStream is = Files.newInputStream(injected)) {
            AUTO_DETECT_PARSER
                    .parse(is, new ToHTMLContentHandler(), new Metadata(), parseContext);
        } finally {
            Files.delete(injected);
        }
    }

    /**
     * Tests the DOM reader in TikaConfig.
     * If the safeguards aren't in place, this throws a FNFE.
     */
    @Test
    public void testDOMTikaConfig() throws Exception {
        byte[] injected = injectXML(
                readTestResource("/org/apache/tika/config/TIKA-1558-exclude.xml"), XXE);
        new TikaConfig(new ByteArrayInputStream(injected));
    }

    /**
     * Reads a test resource fully into memory, closing the underlying stream.
     *
     * @param resourcePath resource path as accepted by {@code getResourceAsStream}
     * @return the complete content of the resource
     * @throws Exception if the resource cannot be opened or read
     */
    private byte[] readTestResource(String resourcePath) throws Exception {
        try (InputStream is = getResourceAsStream(resourcePath)) {
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            IOUtils.copy(is, bos);
            return bos.toByteArray();
        }
    }
}