blob: 833baeb913a6fe8f830de1069c013f59a73b1378 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.ooxml;
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.fail;
import java.io.ByteArrayOutputStream;
import java.io.EOFException;
import java.io.File;
import java.io.FileInputStream;
import java.io.InputStream;
import java.nio.charset.StandardCharsets;
import java.util.List;
import java.util.Random;
import org.apache.commons.compress.archivers.zip.ZipArchiveEntry;
import org.apache.commons.compress.archivers.zip.ZipArchiveInputStream;
import org.apache.commons.io.IOUtils;
import org.junit.Ignore;
import org.junit.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.TikaCoreProperties;
/**
 * Tests around truncated OOXML input: parsing a slightly truncated .docx,
 * a sanity check of the {@code truncate} test helper, and a disabled
 * developer utility that lists the leading zip entries of test documents.
 */
public class TruncatedOOXMLTest extends TikaTest {

    /** Name of the test document (under {@code /test-documents}) used by the truncation tests. */
    private static final String WORD_DOC = "testWORD_various.docx";

    /**
     * Parses the test document truncated to 14435 bytes and checks that a single
     * metadata object is returned with the docx content type and that header,
     * footer and body text are present in the extracted content.
     *
     * @throws Exception if truncation or parsing fails
     */
    @Test
    public void testWordTrunc14435() throws Exception {
        //this is only very slightly truncated
        List<Metadata> metadataList;
        //close the truncated stream once parsing is done
        try (InputStream is = truncate(WORD_DOC, 14435)) {
            metadataList = getRecursiveMetadata(is, true);
        }
        assertEquals(1, metadataList.size());
        Metadata metadata = metadataList.get(0);
        String content = metadata.get(TikaCoreProperties.TIKA_CONTENT);
        assertEquals("application/vnd.openxmlformats-officedocument.wordprocessingml.document",
                metadata.get(Metadata.CONTENT_TYPE));
        assertContains("This is the header", content);
        assertContains("This is the footer text", content);
        assertContains("Suddenly some Japanese", content);
    }

    /**
     * Checks the {@code truncate} helper: for 50 random target lengths below the
     * file length the returned stream must yield exactly that many bytes, and
     * asking for one byte more than the file contains must raise an
     * {@link EOFException}.
     *
     * @throws Exception if the test document cannot be read
     */
    @Test
    public void testTruncation() throws Exception {
        int length = (int) getResourceAsFile("/test-documents/" + WORD_DOC).length();
        //keep the lengths random, but record the seed so that a failure can be reproduced
        long seed = System.nanoTime();
        Random r = new Random(seed);
        for (int i = 0; i < 50; i++) {
            int targetLength = r.nextInt(length);
            ByteArrayOutputStream bos = new ByteArrayOutputStream();
            try (InputStream is = truncate(WORD_DOC, targetLength)) {
                IOUtils.copy(is, bos);
            }
            assertEquals("seed=" + seed + " iteration=" + i, targetLength, bos.size());
        }
        try (InputStream is = truncate(WORD_DOC, length + 1)) {
            fail("should have thrown EOF");
        } catch (EOFException expected) {
            //expected: more bytes were requested than the file contains
        }
    }

    /**
     * Developer aid (disabled via {@link Ignore}): for every .xlsx file directly
     * under {@code /test-documents}, prints the names of the leading zip entries
     * and dumps the contents of {@code _rels/.rels} when it is among them.
     * Errors on an individual file are printed and do not stop the run.
     *
     * @throws Exception if the test-documents directory cannot be resolved
     */
    @Test
    @Ignore("for dev/debugging only")
    public void listStreams() throws Exception {
        File tstDir = new File(getResourceAsUri("/test-documents"));
        File[] files = tstDir.listFiles();
        if (files == null) {
            //listFiles() returns null when the path is not a directory or cannot be read
            System.out.println(tstDir + " : could not be listed");
            return;
        }
        for (File f : files) {
            //add ".pptx" / ".docx" to this check to inspect other OOXML formats
            if (f.isDirectory() || !f.getName().endsWith(".xlsx")) {
                continue;
            }
            try (InputStream is = new FileInputStream(f);
                    ZipArchiveInputStream zipArchiveInputStream = new ZipArchiveInputStream(is)) {
                ZipArchiveEntry zae = zipArchiveInputStream.getNextZipEntry();
                int cnt = 0;
                //stops at the first directory entry or after 10 entries, whichever comes first
                while (zae != null && !zae.isDirectory() && ++cnt <= 10) {
                    System.out.println(f.getName() + " : " + zae.getName());
                    if (zae.getName().equals("_rels/.rels")) {
                        ByteArrayOutputStream bos = new ByteArrayOutputStream();
                        IOUtils.copy(zipArchiveInputStream, bos);
                        System.out.println(new String(bos.toByteArray(), StandardCharsets.UTF_8));
                    }
                    zae = zipArchiveInputStream.getNextZipEntry();
                }
            } catch (Exception e) {
                //best effort: report the problem file and carry on with the next one
                System.out.println(f.getName() + " : " + e.getMessage());
            }
        }
    }
}