tika-parsers/tika-parsers-standard/tika-parsers-standard-package/src/test/java/org/apache/tika/parser/TestXXEInXML.java - tika - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.tika.parser;

 import static org.junit.Assert.fail;

 import java.io.ByteArrayInputStream;
 import java.io.ByteArrayOutputStream;
 import java.io.FileNotFoundException;
 import java.io.InputStream;
 import java.nio.charset.StandardCharsets;
 import java.nio.file.Files;
 import java.nio.file.Path;
 import javax.xml.parsers.SAXParser;
 import javax.xml.parsers.SAXParserFactory;

 import org.apache.commons.io.IOUtils;
 import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
 import org.junit.Ignore;
 import org.junit.Test;
 import org.xml.sax.ContentHandler;

 import org.apache.tika.config.TikaConfig;
 import org.apache.tika.exception.TikaException;
 import org.apache.tika.io.TikaInputStream;
 import org.apache.tika.metadata.Metadata;
 import org.apache.tika.parser.microsoft.OfficeParserConfig;
 import org.apache.tika.sax.ToHTMLContentHandler;
 import org.apache.tika.utils.XMLReaderUtils;

 /**
  * This tests for XXE (XML external entity) handling in basically xml type files,
  * straight xml and zipped xmls, e.g. ebook and ooxml.
  * It does not test for XXE prevention in files that may contain xml
  * files, such as PDFs and other XMP-containing files.
  * <p>
  * Each test injects a DOCTYPE that points at a file that should not exist. The
  * tests against the deliberately vulnerable parser expect a
  * {@link FileNotFoundException}; the tests against the regular parsers expect
  * that no such exception escapes.
  */
 public class TestXXEInXML extends XMLTestBase {
     //TODO: figure out how to test XFA and xmp in PDFs

     /** DOCTYPE declaration whose system identifier is a file that should not exist. */
     private static final byte[] XXE =
             "<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\" \"file:///couldnt_possibly_exist\">"
                     .getBytes(StandardCharsets.UTF_8);

     /**
      * Sanity check (ignored by default): the vulnerable parser is expected to
      * fail with a {@link FileNotFoundException} on the XXE test document.
      */
     @Test
     @Ignore("ignore vulnerable tests")
     public void testConfirmVulnerable() throws Exception {
         try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
             parse("testXXE.xml", is, new VulnerableSAXParser(), new ParseContext());
             fail("should have failed!!!");
         } catch (FileNotFoundException e) {
             //expected
         }
     }

     /** Parses the XXE test document with the auto-detect parser; no exception is expected. */
     @Test
     public void testXML() throws Exception {
         try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
             parse("testXXE.xml", is, AUTO_DETECT_PARSER, new ParseContext());
         }
     }

     /**
      * Confirms that the injection helper really produces a document that
      * triggers XXE resolution when handed to the vulnerable parser.
      */
     @Test
     public void testInjectedXML() throws Exception {
         byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
                 .getBytes(StandardCharsets.UTF_8);
         byte[] injected = injectXML(bytes, XXE);
         try {
             parse("injected", new ByteArrayInputStream(injected), new VulnerableSAXParser(),
                     new ParseContext());
             fail("injected should have triggered xxe");
         } catch (FileNotFoundException e) {
             //expected
         }
     }

     /** Injects the XXE DOCTYPE into the Word 2003/2006 xml test files and parses them. */
     @Test
     public void test2003_2006xml() throws Exception {
         for (String fileName : new String[]{"testWORD_2003ml.xml", "testWORD_2006ml.xml"}) {
             byte[] injected = readAndInject("/test-documents/" + fileName);
             parse(fileName, new ByteArrayInputStream(injected), AUTO_DETECT_PARSER,
                     new ParseContext());
         }
     }


     /** Runs {@link #_testPOIOOXMLs(String)} over a set of docx/xlsx/pptx test files. */
     @Test
     public void testPOIOOXMLs() throws Exception {
         for (String fileName : new String[]{"testWORD.docx", "testWORD_1img.docx",
                 "testWORD_2006ml.docx", "testWORD_embedded_pics.docx", "testWORD_macros.docm",
                 "testEXCEL_textbox.xlsx", "testEXCEL_macro.xlsm", "testEXCEL_phonetic.xlsx",
                 "testEXCEL_embeddedPDF_windows.xlsx", "testPPT_2imgs.pptx", "testPPT_comment.pptx",
                 "testPPT_EmbeddedPDF.pptx", "testPPT_macros.pptm"}) {
             _testPOIOOXMLs(fileName);
         }
     }

     /**
      * Parses two injected copies of the given OOXML test file: first with the
      * default extractors, then with the SAX-based docx/pptx extractors enabled.
      *
      * @param fileName name of a file under {@code /test-documents/}
      * @throws Exception on any unexpected parse or I/O failure
      */
     private void _testPOIOOXMLs(String fileName) throws Exception {
         try (TikaInputStream tis = TikaInputStream
                 .get(getResourceAsStream("/test-documents/" + fileName))) {
             Path originalOOXML = tis.getPath();
             //the same context is deliberately shared by both parses
             ParseContext parseContext = new ParseContext();

             //if the SafeContentHandler is turned off, this will throw an FNFE
             Path injected = injectZippedXMLs(originalOOXML, XXE, false);
             try {
                 parseAndDelete(injected, parseContext);
             } catch (TikaException e) {
                 if (!(e.getCause() instanceof InvalidFormatException)) {
                     //as of POI 4.1.x
                     fail("POI should have thrown an IFE complaining about " +
                             "not being able to read content types part !");
                 }
             }

             OfficeParserConfig officeParserConfig = new OfficeParserConfig();
             officeParserConfig.setUseSAXDocxExtractor(true);
             officeParserConfig.setUseSAXPptxExtractor(true);
             parseContext.set(OfficeParserConfig.class, officeParserConfig);

             //create the second temp file outside of the try so that a failure here
             //is reported as-is instead of being masked by a failing cleanup
             Path saxInjected = injectZippedXMLs(originalOOXML, XXE, true);
             try {
                 parseAndDelete(saxInjected, parseContext);
             } catch (FileNotFoundException e) {
                 //keep the FNFE as the cause so the stack trace shows up in the test report
                 throw new AssertionError(
                         "problem with SAX-based: " + fileName + ": " + e.getMessage(), e);
             }
         }
     }

     /** Runs {@link #_testXMLInZips(String)} over zipped-xml formats other than OOXML. */
     @Test
     public void testXMLInZips() throws Exception {
         for (String fileName : new String[]{"testEPUB.epub"}) {
             _testXMLInZips(fileName);
         }
     }

     /**
      * Injects the XXE DOCTYPE into the given zip-based test file and parses the result.
      *
      * @param fileName name of a file under {@code /test-documents/}
      * @throws Exception on any parse or I/O failure
      */
     private void _testXMLInZips(String fileName) throws Exception {
         Path injected;
         try (TikaInputStream tis = TikaInputStream
                 .get(getResourceAsStream("/test-documents/" + fileName))) {
             injected = injectZippedXMLs(tis.getPath(), XXE, false);
         }
         //if the SafeContentHandler is turned off, this will throw an FNFE
         parseAndDelete(injected, new ParseContext());
     }


     /** Builds a DOM from the injected document more often than the reader pool is large. */
     @Test
     public void testDOM() throws Exception {
         byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
                 .getBytes(StandardCharsets.UTF_8);
         byte[] injected = injectXML(bytes, XXE);
         for (int i = 0; i < XMLReaderUtils.getPoolSize() * 2; i++) {
             //this shouldn't throw an exception
             XMLReaderUtils.buildDOM(new ByteArrayInputStream(injected), new ParseContext());
         }
     }

     //use this to confirm that this works
     //by manually turning off the SafeContentHandler in SXWPFWordExtractorDecorator's
     //handlePart
     //NOTE: intentionally not annotated with @Test; the injected temp file is
     //removed once parsing has finished
     public void testDocxWithIncorrectSAXConfiguration() throws Exception {
         Path injected;
         try (TikaInputStream tis = TikaInputStream
                 .get(getResourceAsStream("/test-documents/testWORD_macros.docm"))) {
             injected = injectZippedXMLs(tis.getPath(), XXE, true);
         }

         ParseContext parseContext = new ParseContext();
         OfficeParserConfig officeParserConfig = new OfficeParserConfig();
         officeParserConfig.setUseSAXDocxExtractor(true);
         parseContext.set(OfficeParserConfig.class, officeParserConfig);
         parseContext.set(SAXParser.class, SAXParserFactory.newInstance().newSAXParser());
         //if the SafeContentHandler is turned off, this will throw an FNFE
         parseAndDelete(injected, parseContext);
     }

     /** Loads an injected Tika config file through {@link TikaConfig}'s DOM reader. */
     @Test
     public void testDOMTikaConfig() throws Exception {
         //tests the DOM reader in TikaConfig
         //if the safeguards aren't in place, this throws a FNFE
         byte[] injected = readAndInject("/org/apache/tika/config/TIKA-1558-exclude.xml");
         new TikaConfig(new ByteArrayInputStream(injected));
     }

     /**
      * Reads a classpath resource fully and injects the {@link #XXE} DOCTYPE into it.
      *
      * @param resourcePath path handed to {@code getResourceAsStream}
      * @return the resource bytes after injection
      * @throws Exception if the resource cannot be read or injection fails
      */
     private byte[] readAndInject(String resourcePath) throws Exception {
         try (InputStream is = getResourceAsStream(resourcePath)) {
             ByteArrayOutputStream bos = new ByteArrayOutputStream();
             IOUtils.copy(is, bos);
             return injectXML(bos.toByteArray(), XXE);
         }
     }

     /**
      * Parses {@code file} with the auto-detect parser and then deletes it.
      * The stream is closed before the delete so that the delete also works on
      * platforms that refuse to remove open files.
      *
      * @param file         temp file to parse; always deleted, even if parsing fails
      * @param parseContext context handed to the parser
      * @throws Exception whatever the parser throws, or an I/O failure
      */
     private void parseAndDelete(Path file, ParseContext parseContext) throws Exception {
         ContentHandler xhtml = new ToHTMLContentHandler();
         try (InputStream is = Files.newInputStream(file)) {
             AUTO_DETECT_PARSER.parse(is, xhtml, new Metadata(), parseContext);
         } finally {
             Files.delete(file);
         }
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.tika.parser;

	import static org.junit.Assert.fail;

	import java.io.ByteArrayInputStream;
	import java.io.ByteArrayOutputStream;
	import java.io.FileNotFoundException;
	import java.io.InputStream;
	import java.nio.charset.StandardCharsets;
	import java.nio.file.Files;
	import java.nio.file.Path;
	import javax.xml.parsers.SAXParser;
	import javax.xml.parsers.SAXParserFactory;

	import org.apache.commons.io.IOUtils;
	import org.apache.poi.openxml4j.exceptions.InvalidFormatException;
	import org.junit.Ignore;
	import org.junit.Test;
	import org.xml.sax.ContentHandler;

	import org.apache.tika.config.TikaConfig;
	import org.apache.tika.exception.TikaException;
	import org.apache.tika.io.TikaInputStream;
	import org.apache.tika.metadata.Metadata;
	import org.apache.tika.parser.microsoft.OfficeParserConfig;
	import org.apache.tika.sax.ToHTMLContentHandler;
	import org.apache.tika.utils.XMLReaderUtils;

	/**
	 * This tests for XXE (XML external entity) handling in basically xml type files,
	 * straight xml and zipped xmls, e.g. ebook and ooxml.
	 * It does not test for XXE prevention in files that may contain xml
	 * files, such as PDFs and other XMP-containing files.
	 * <p>
	 * Each test injects a DOCTYPE that points at a file that should not exist. The
	 * tests against the deliberately vulnerable parser expect a
	 * {@link FileNotFoundException}; the tests against the regular parsers expect
	 * that no such exception escapes.
	 */
	public class TestXXEInXML extends XMLTestBase {
		//TODO: figure out how to test XFA and xmp in PDFs

		/** DOCTYPE declaration whose system identifier is a file that should not exist. */
		private static final byte[] XXE =
				"<!DOCTYPE roottag PUBLIC \"-//OXML/XXE/EN\" \"file:///couldnt_possibly_exist\">"
						.getBytes(StandardCharsets.UTF_8);

		/**
		 * Sanity check (ignored by default): the vulnerable parser is expected to
		 * fail with a {@link FileNotFoundException} on the XXE test document.
		 */
		@Test
		@Ignore("ignore vulnerable tests")
		public void testConfirmVulnerable() throws Exception {
			try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
				parse("testXXE.xml", is, new VulnerableSAXParser(), new ParseContext());
				fail("should have failed!!!");
			} catch (FileNotFoundException e) {
				//expected
			}
		}

		/** Parses the XXE test document with the auto-detect parser; no exception is expected. */
		@Test
		public void testXML() throws Exception {
			try (InputStream is = getResourceAsStream("/test-documents/testXXE.xml")) {
				parse("testXXE.xml", is, AUTO_DETECT_PARSER, new ParseContext());
			}
		}

		/**
		 * Confirms that the injection helper really produces a document that
		 * triggers XXE resolution when handed to the vulnerable parser.
		 */
		@Test
		public void testInjectedXML() throws Exception {
			byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
					.getBytes(StandardCharsets.UTF_8);
			byte[] injected = injectXML(bytes, XXE);
			try {
				parse("injected", new ByteArrayInputStream(injected), new VulnerableSAXParser(),
						new ParseContext());
				fail("injected should have triggered xxe");
			} catch (FileNotFoundException e) {
				//expected
			}
		}

		/** Injects the XXE DOCTYPE into the Word 2003/2006 xml test files and parses them. */
		@Test
		public void test2003_2006xml() throws Exception {
			for (String fileName : new String[]{"testWORD_2003ml.xml", "testWORD_2006ml.xml"}) {
				byte[] injected = readAndInject("/test-documents/" + fileName);
				parse(fileName, new ByteArrayInputStream(injected), AUTO_DETECT_PARSER,
						new ParseContext());
			}
		}


		/** Runs {@link #_testPOIOOXMLs(String)} over a set of docx/xlsx/pptx test files. */
		@Test
		public void testPOIOOXMLs() throws Exception {
			for (String fileName : new String[]{"testWORD.docx", "testWORD_1img.docx",
					"testWORD_2006ml.docx", "testWORD_embedded_pics.docx", "testWORD_macros.docm",
					"testEXCEL_textbox.xlsx", "testEXCEL_macro.xlsm", "testEXCEL_phonetic.xlsx",
					"testEXCEL_embeddedPDF_windows.xlsx", "testPPT_2imgs.pptx", "testPPT_comment.pptx",
					"testPPT_EmbeddedPDF.pptx", "testPPT_macros.pptm"}) {
				_testPOIOOXMLs(fileName);
			}
		}

		/**
		 * Parses two injected copies of the given OOXML test file: first with the
		 * default extractors, then with the SAX-based docx/pptx extractors enabled.
		 *
		 * @param fileName name of a file under {@code /test-documents/}
		 * @throws Exception on any unexpected parse or I/O failure
		 */
		private void _testPOIOOXMLs(String fileName) throws Exception {
			try (TikaInputStream tis = TikaInputStream
					.get(getResourceAsStream("/test-documents/" + fileName))) {
				Path originalOOXML = tis.getPath();
				//the same context is deliberately shared by both parses
				ParseContext parseContext = new ParseContext();

				//if the SafeContentHandler is turned off, this will throw an FNFE
				Path injected = injectZippedXMLs(originalOOXML, XXE, false);
				try {
					parseAndDelete(injected, parseContext);
				} catch (TikaException e) {
					if (!(e.getCause() instanceof InvalidFormatException)) {
						//as of POI 4.1.x
						fail("POI should have thrown an IFE complaining about " +
								"not being able to read content types part !");
					}
				}

				OfficeParserConfig officeParserConfig = new OfficeParserConfig();
				officeParserConfig.setUseSAXDocxExtractor(true);
				officeParserConfig.setUseSAXPptxExtractor(true);
				parseContext.set(OfficeParserConfig.class, officeParserConfig);

				//create the second temp file outside of the try so that a failure here
				//is reported as-is instead of being masked by a failing cleanup
				Path saxInjected = injectZippedXMLs(originalOOXML, XXE, true);
				try {
					parseAndDelete(saxInjected, parseContext);
				} catch (FileNotFoundException e) {
					//keep the FNFE as the cause so the stack trace shows up in the test report
					throw new AssertionError(
							"problem with SAX-based: " + fileName + ": " + e.getMessage(), e);
				}
			}
		}

		/** Runs {@link #_testXMLInZips(String)} over zipped-xml formats other than OOXML. */
		@Test
		public void testXMLInZips() throws Exception {
			for (String fileName : new String[]{"testEPUB.epub"}) {
				_testXMLInZips(fileName);
			}
		}

		/**
		 * Injects the XXE DOCTYPE into the given zip-based test file and parses the result.
		 *
		 * @param fileName name of a file under {@code /test-documents/}
		 * @throws Exception on any parse or I/O failure
		 */
		private void _testXMLInZips(String fileName) throws Exception {
			Path injected;
			try (TikaInputStream tis = TikaInputStream
					.get(getResourceAsStream("/test-documents/" + fileName))) {
				injected = injectZippedXMLs(tis.getPath(), XXE, false);
			}
			//if the SafeContentHandler is turned off, this will throw an FNFE
			parseAndDelete(injected, new ParseContext());
		}


		/** Builds a DOM from the injected document more often than the reader pool is large. */
		@Test
		public void testDOM() throws Exception {
			byte[] bytes = "<?xml version=\"1.0\" encoding=\"UTF-8\"?><document>blah</document>"
					.getBytes(StandardCharsets.UTF_8);
			byte[] injected = injectXML(bytes, XXE);
			for (int i = 0; i < XMLReaderUtils.getPoolSize() * 2; i++) {
				//this shouldn't throw an exception
				XMLReaderUtils.buildDOM(new ByteArrayInputStream(injected), new ParseContext());
			}
		}

		//use this to confirm that this works
		//by manually turning off the SafeContentHandler in SXWPFWordExtractorDecorator's
		//handlePart
		//NOTE: intentionally not annotated with @Test; the injected temp file is
		//removed once parsing has finished
		public void testDocxWithIncorrectSAXConfiguration() throws Exception {
			Path injected;
			try (TikaInputStream tis = TikaInputStream
					.get(getResourceAsStream("/test-documents/testWORD_macros.docm"))) {
				injected = injectZippedXMLs(tis.getPath(), XXE, true);
			}

			ParseContext parseContext = new ParseContext();
			OfficeParserConfig officeParserConfig = new OfficeParserConfig();
			officeParserConfig.setUseSAXDocxExtractor(true);
			parseContext.set(OfficeParserConfig.class, officeParserConfig);
			parseContext.set(SAXParser.class, SAXParserFactory.newInstance().newSAXParser());
			//if the SafeContentHandler is turned off, this will throw an FNFE
			parseAndDelete(injected, parseContext);
		}

		/** Loads an injected Tika config file through {@link TikaConfig}'s DOM reader. */
		@Test
		public void testDOMTikaConfig() throws Exception {
			//tests the DOM reader in TikaConfig
			//if the safeguards aren't in place, this throws a FNFE
			byte[] injected = readAndInject("/org/apache/tika/config/TIKA-1558-exclude.xml");
			new TikaConfig(new ByteArrayInputStream(injected));
		}

		/**
		 * Reads a classpath resource fully and injects the {@link #XXE} DOCTYPE into it.
		 *
		 * @param resourcePath path handed to {@code getResourceAsStream}
		 * @return the resource bytes after injection
		 * @throws Exception if the resource cannot be read or injection fails
		 */
		private byte[] readAndInject(String resourcePath) throws Exception {
			try (InputStream is = getResourceAsStream(resourcePath)) {
				ByteArrayOutputStream bos = new ByteArrayOutputStream();
				IOUtils.copy(is, bos);
				return injectXML(bos.toByteArray(), XXE);
			}
		}

		/**
		 * Parses {@code file} with the auto-detect parser and then deletes it.
		 * The stream is closed before the delete so that the delete also works on
		 * platforms that refuse to remove open files.
		 *
		 * @param file         temp file to parse; always deleted, even if parsing fails
		 * @param parseContext context handed to the parser
		 * @throws Exception whatever the parser throws, or an I/O failure
		 */
		private void parseAndDelete(Path file, ParseContext parseContext) throws Exception {
			ContentHandler xhtml = new ToHTMLContentHandler();
			try (InputStream is = Files.newInputStream(file)) {
				AUTO_DETECT_PARSER.parse(is, xhtml, new Metadata(), parseContext);
			} finally {
				Files.delete(file);
			}
		}
	}