blob: e01e9ee134564159087396643770316f29d7017c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.onenote;
import java.time.Instant;
import java.util.Arrays;
import java.util.List;
import org.junit.Assert;
import org.junit.Test;
import org.apache.tika.TikaTest;
import org.apache.tika.metadata.Metadata;
/**
 * Tests for the OneNote parser, run against sample {@code .one} files that are loaded
 * through the {@link TikaTest} helpers ({@code getText} and {@code getRecursiveMetadata}).
 *
 * <p>The expected author values in these tests end with a NUL character; the assertions
 * match that character exactly.
 * NOTE(review): the trailing NUL looks like an unstripped string terminator in the parser
 * output - confirm whether that is intended before changing the expectations.
 */
public class OneNoteParserTest extends TikaTest {

    /** Metadata key holding every author found in the document. */
    private static final String AUTHORS = "authors";

    /** Metadata key holding the most recent authors. */
    private static final String MOST_RECENT_AUTHORS = "mostRecentAuthors";

    /** Metadata key holding the original authors. */
    private static final String ORIGINAL_AUTHORS = "originalAuthors";

    /** Metadata key whose value is asserted as epoch seconds. */
    private static final String CREATION_TIMESTAMP = "creationTimestamp";

    /** Metadata key whose value is asserted as epoch milliseconds. */
    private static final String LAST_MODIFIED_TIMESTAMP = "lastModifiedTimestamp";

    /** Metadata key whose value is asserted as epoch seconds. */
    private static final String LAST_MODIFIED = "lastModified";

    /** Content type expected for the Word document embedded in the OneNote sample. */
    private static final String DOCX_CONTENT_TYPE =
            "application/vnd.openxmlformats-officedocument.wordprocessingml.document";

    /**
     * This is the sample document that is automatically created from onenote 2013.
     */
    @Test
    public void testOneNote2013Doc1() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote1.one", metadata);
        assertNoJunk(txt);

        List<String> authors = valuesOf(metadata, AUTHORS);
        assertContains("Olya Veselova\u0000", authors);
        assertContains("Microsoft\u0000", authors);
        assertContains("Scott\u0000", authors);
        assertContains("Scott H. W. Snyder\u0000", authors);

        assertContains("Microsoft\u0000", valuesOf(metadata, MOST_RECENT_AUTHORS));
        assertContains("Microsoft\u0000", valuesOf(metadata, ORIGINAL_AUTHORS));

        assertTimestamps(metadata, 1336059427L, 1383613114000L, 1446572147L);
    }

    /**
     * Checks text, authors and timestamps extracted from {@code testOneNote2.one}.
     */
    @Test
    public void testOneNote2013Doc2() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote2.one", metadata);
        assertContains("wow this is neat", txt);
        assertContains("neat info about totally killin it bro", txt);
        assertContains("Section1TextArea1", txt);
        assertContains("Section1HeaderTitle", txt);
        assertContains("Section1TextArea2", txt);
        assertNoJunk(txt);

        List<String> authors = valuesOf(metadata, AUTHORS);
        assertContains("Olya Veselova\u0000", authors);
        assertContains("Microsoft\u0000", authors);
        assertContains("Scott\u0000", authors);
        assertContains("Scott H. W. Snyder\u0000", authors);
        assertContains("ndipiazza\u0000", authors);

        List<String> mostRecentAuthors = valuesOf(metadata, MOST_RECENT_AUTHORS);
        assertContains("ndipiazza\u0000", mostRecentAuthors);
        assertContains("Microsoft\u0000", mostRecentAuthors);

        // NOTE(review): whether originalAuthors also lists ndipiazza is not asserted here;
        // confirm against the fixture before adding such a check.
        assertContains("Microsoft\u0000", valuesOf(metadata, ORIGINAL_AUTHORS));

        assertTimestamps(metadata, 1336059427L, 1574426629000L, 1574426628L);
    }

    /**
     * Checks text, authors and timestamps extracted from {@code testOneNote3.one}.
     */
    @Test
    public void testOneNote2013Doc3() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote3.one", metadata);
        assertContains("awesome information about sports or some crap like that.", txt);
        assertContains("Quit doing horrible things to me. Dang you. ", txt);
        assertContains("Section2TextArea1", txt);
        assertContains("Section2HeaderTitle", txt);
        assertContains("Section2TextArea2", txt);
        assertNoJunk(txt);

        List<String> authors = valuesOf(metadata, AUTHORS);
        assertNotContained("Olya Veselova\u0000", authors);
        assertNotContained("Microsoft\u0000", authors);
        assertNotContained("Scott\u0000", authors);
        assertNotContained("Scott H. W. Snyder\u0000", authors);
        assertContains("ndipiazza\u0000", authors);

        List<String> mostRecentAuthors = valuesOf(metadata, MOST_RECENT_AUTHORS);
        assertContains("ndipiazza\u0000", mostRecentAuthors);
        assertNotContained("Microsoft\u0000", mostRecentAuthors);

        // NOTE(review): whether originalAuthors lists ndipiazza is not asserted here;
        // confirm against the fixture before adding such a check.
        assertNotContained("Microsoft\u0000", valuesOf(metadata, ORIGINAL_AUTHORS));

        assertTimestamps(metadata, 1574426349L, 1574426623000L, 1574426624L);
    }

    /**
     * Checks text, authors and timestamps extracted from {@code testOneNote4.one}.
     */
    @Test
    public void testOneNote2013Doc4() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote4.one", metadata);
        assertContains("way too much information about poptarts to handle.", txt);
        assertContains("Section3TextArea1", txt);
        assertContains("Section3HeaderTitle", txt);
        assertContains("Section3TextArea2", txt);
        assertNoJunk(txt);

        List<String> authors = valuesOf(metadata, AUTHORS);
        assertNotContained("Olya Veselova\u0000", authors);
        assertNotContained("Microsoft\u0000", authors);
        assertNotContained("Scott\u0000", authors);
        assertNotContained("Scott H. W. Snyder\u0000", authors);
        assertContains("ndipiazza\u0000", authors);

        List<String> mostRecentAuthors = valuesOf(metadata, MOST_RECENT_AUTHORS);
        assertContains("ndipiazza\u0000", mostRecentAuthors);
        assertNotContained("Microsoft\u0000", mostRecentAuthors);

        // NOTE(review): whether originalAuthors lists ndipiazza is not asserted here;
        // confirm against the fixture before adding such a check.
        assertNotContained("Microsoft\u0000", valuesOf(metadata, ORIGINAL_AUTHORS));

        assertTimestamps(metadata, 1574426385L, 1574426548000L, 1574426547L);
    }

    /**
     * Checks text, authors and timestamps extracted from {@code testOneNote2016.one}.
     */
    @Test
    public void testOneNote2016() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote2016.one", metadata);
        assertContains("So good", txt);
        assertContains("This is one note 2016", txt);
        assertNoJunk(txt);

        assertContains("nicholas dipiazza\u0000", valuesOf(metadata, AUTHORS));
        assertContains("nicholas dipiazza\u0000", valuesOf(metadata, MOST_RECENT_AUTHORS));
        assertContains("nicholas dipiazza\u0000", valuesOf(metadata, ORIGINAL_AUTHORS));

        assertTimestamps(metadata, 1576107472L, 1576107481000L, 1576107480L);
    }

    /**
     * Checks that both the UTF-16 LE and the ASCII text of
     * {@code testOneNote2007OrEarlier1.one} show up in the extracted text.
     */
    @Test
    public void testOneNote2007OrEarlier1() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote2007OrEarlier1.one", metadata);

        // utf-16 LE text
        assertContains(
                "One note is the application. " +
                        "The notebooks are the files within the application. " +
                        "Each notebook can have an unlimited amount of sections and pages. " +
                        "To create a new notebook, go to file, new, computer, and name it. " +
                        "It will go to my documents, oneNote Notebooks folder. " +
                        "The notebook doesn't close and you don't have to save. " +
                        "If it closes, you can go back to it " +
                        "and it will open at the same place you left off. " +
                        "If you are offline and the notebook is being stored on a " +
                        "sharepoint site, " +
                        "you can work on it and it will sync when you go back online.",
                txt);

        // ascii text
        assertContains("Correlation between Outlook and OneNote", txt);
    }

    /**
     * Checks that the ASCII text of {@code testOneNote2007OrEarlier2.one} shows up in the
     * extracted text.
     */
    @Test
    public void testOneNote2007OrEarlier2() throws Exception {
        Metadata metadata = new Metadata();
        String txt = getText("testOneNote2007OrEarlier2.one", metadata);

        // ascii text
        assertContains(
                "In Outlook meeting notice, select One Note Meeting Notes and then select the " +
                        "\"page\" you want to link/share",
                txt);
    }

    /**
     * Checks that at least one metadata object returned by {@code getRecursiveMetadata} for
     * {@code testOneNoteEmbeddedWordDoc.one} reports the DOCX content type.
     */
    @Test
    public void testOneNoteEmbeddedWordDoc() throws Exception {
        List<Metadata> metadataList = getRecursiveMetadata("testOneNoteEmbeddedWordDoc.one");
        boolean hasWordDocument = metadataList.stream()
                .anyMatch(ml -> DOCX_CONTENT_TYPE.equals(ml.get("Content-Type")));
        Assert.assertTrue(hasWordDocument);
    }

    /**
     * Returns all values stored under {@code key} as a fixed-size list view.
     *
     * @param metadata metadata populated by the parse under test
     * @param key metadata key to read
     * @return the values returned by {@code metadata.getValues(key)}, wrapped as a list
     */
    private static List<String> valuesOf(Metadata metadata, String key) {
        return Arrays.asList(metadata.getValues(key));
    }

    /**
     * Asserts the three timestamp fields of a parsed document, in the order creation
     * timestamp, last-modified timestamp, last modified.
     *
     * <p>Each stored value is parsed with {@link Long#parseLong(String)}, so a missing or
     * non-numeric value fails the test with a {@link NumberFormatException}. Expected and
     * actual values are both wrapped in {@link Instant} before comparison, presumably so
     * that a failure message shows readable dates.
     *
     * @param metadata metadata populated by the parse under test
     * @param creationEpochSeconds expected {@code creationTimestamp}, in epoch seconds
     * @param lastModifiedTimestampEpochMillis expected {@code lastModifiedTimestamp}, in
     *        epoch milliseconds
     * @param lastModifiedEpochSeconds expected {@code lastModified}, in epoch seconds
     */
    private static void assertTimestamps(Metadata metadata, long creationEpochSeconds,
                                         long lastModifiedTimestampEpochMillis,
                                         long lastModifiedEpochSeconds) {
        Assert.assertEquals(Instant.ofEpochSecond(creationEpochSeconds),
                Instant.ofEpochSecond(Long.parseLong(metadata.get(CREATION_TIMESTAMP))));
        Assert.assertEquals(Instant.ofEpochMilli(lastModifiedTimestampEpochMillis),
                Instant.ofEpochMilli(Long.parseLong(metadata.get(LAST_MODIFIED_TIMESTAMP))));
        Assert.assertEquals(Instant.ofEpochSecond(lastModifiedEpochSeconds),
                Instant.ofEpochSecond(Long.parseLong(metadata.get(LAST_MODIFIED))));
    }

    /**
     * Asserts that the extracted text is free of known junk: a font name and a handful of
     * code points that come from garbage UTF-16 property values.
     *
     * @param txt text extracted from a OneNote sample
     */
    private void assertNoJunk(String txt) {
        //Should not include font names in the text
        assertNotContained("Calibri", txt);
        //Should not include UTF-16 property values that are garbage
        assertNotContained("\u5902", txt);
        assertNotContained("\u83F2", txt);
        assertNotContained("\u432F", txt);
        assertNotContained("\u01E1", txt);
    }
}