blob: 41d342c9921b3fa709736ad4c61989e223fc8f25 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.onenote;
import java.io.IOException;
import java.io.InputStream;
import java.time.Instant;
import java.util.Collections;
import java.util.HashMap;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.lang3.tuple.Pair;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.TemporaryResources;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * OneNote tika parser capable of parsing Microsoft OneNote files.
 * <p>
 * Based on the Microsoft specs MS-ONE and MS-ONESTORE.
 * <p>
 * Files whose header reports itself as legacy are handled by dumping their strings through
 * {@link OneNoteLegacyDumpStrings}; all other files have their header fields copied into the
 * metadata and their content emitted by a {@link OneNoteTreeWalker}.
 */
public class OneNoteParser extends AbstractParser {

    private static final Map<MediaType, List<String>> typesMap = new HashMap<>();
    /**
     * Serial version UID
     */
    private static final long serialVersionUID = -5504243905998074168L;
    /**
     * Read-only view over the keys of {@link #typesMap}. Because it is a view and not a copy,
     * the entries added by the static initializer below are visible through it even though
     * this field is assigned first.
     */
    private static final Set<MediaType> SUPPORTED_TYPES =
            Collections.unmodifiableSet(typesMap.keySet());
    /**
     * Prefix put in front of every hexadecimal header value written to the metadata.
     */
    private static final String HEX_PREFIX = "0x";

    static {
        // All types should be 4 bytes long, space padded as needed
        typesMap.put(MediaType.application("onenote; format=one"),
                Collections.singletonList("ONE "));
        // TODO - add onetoc and other onenote mime types
    }

    /**
     * Returns the media types handled by this parser.
     *
     * @param context the parse context; not consulted by this implementation.
     * @return an unmodifiable set of the supported media types.
     */
    @Override
    public Set<MediaType> getSupportedTypes(ParseContext context) {
        return SUPPORTED_TYPES;
    }

    /**
     * Parses a OneNote stream, writing XHTML events to {@code handler} and file level
     * properties to {@code metadata}.
     * <p>
     * The stream is wrapped in a {@link TikaInputStream} so that it can be accessed as a file,
     * the header and root file node list are deserialized, and then either the legacy string
     * dump or the tree walker produces the content, depending on what the header reports.
     *
     * @param stream   the OneNote content to parse.
     * @param handler  receiver of the XHTML SAX events.
     * @param metadata receiver of the header values, authors and timestamps.
     * @param context  parse context, handed on to the tree walker.
     * @throws IOException   Will throw IOException in typical IO issue situations.
     * @throws SAXException  if the content handler rejects an event.
     * @throws TikaException propagated from the underlying OneNote deserialization and tree
     *                       walking calls.
     */
    @Override
    public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                      ParseContext context) throws IOException, SAXException, TikaException {
        // Every resource below is closed by the try-with-resources statement itself, so the
        // direct file resource is intentionally NOT registered with temporaryResources as
        // well: doing both would invoke its close() more than once.
        try (TemporaryResources temporaryResources = new TemporaryResources();
                TikaInputStream tikaInputStream =
                        TikaInputStream.get(stream, temporaryResources);
                OneNoteDirectFileResource oneNoteDirectFileResource =
                        new OneNoteDirectFileResource(tikaInputStream.getFile())) {
            XHTMLContentHandler xhtml = new XHTMLContentHandler(handler, metadata);
            xhtml.startDocument();

            OneNoteDocument oneNoteDocument =
                    createOneNoteDocumentFromDirectFileResource(oneNoteDirectFileResource);

            if (oneNoteDocument.header.isLegacy()) {
                OneNoteLegacyDumpStrings dumpStrings =
                        new OneNoteLegacyDumpStrings(oneNoteDirectFileResource, xhtml);
                dumpStrings.dump();
            } else {
                addHeaderMetadata(oneNoteDocument, metadata);

                Pair<Long, ExtendedGUID> roleAndContext = Pair.of(1L, ExtendedGUID.nil());
                OneNoteTreeWalker oneNoteTreeWalker =
                        new OneNoteTreeWalker(new OneNoteTreeWalkerOptions(), oneNoteDocument,
                                oneNoteDirectFileResource, xhtml, metadata, context,
                                roleAndContext);
                oneNoteTreeWalker.walkTree();

                addTreeWalkerMetadata(oneNoteTreeWalker, metadata);
            }
            xhtml.endDocument();
        }
    }

    /**
     * Copies the numeric fields of the document header into the metadata, each rendered as
     * {@code 0x} followed by the value in hexadecimal. The metadata key is the header field
     * name.
     *
     * @param oneNoteDocument the document whose header is read.
     * @param metadata        the metadata to write to.
     */
    private static void addHeaderMetadata(OneNoteDocument oneNoteDocument, Metadata metadata) {
        setHexMetadata(metadata, "buildNumberCreated",
                oneNoteDocument.header.buildNumberCreated);
        setHexMetadata(metadata, "buildNumberLastWroteToFile",
                oneNoteDocument.header.buildNumberLastWroteToFile);
        setHexMetadata(metadata, "buildNumberNewestWritten",
                oneNoteDocument.header.buildNumberNewestWritten);
        setHexMetadata(metadata, "buildNumberOldestWritten",
                oneNoteDocument.header.buildNumberOldestWritten);
        setHexMetadata(metadata, "cbExpectedFileLength",
                oneNoteDocument.header.cbExpectedFileLength);
        setHexMetadata(metadata, "cbFreeSpaceInFreeChunkList",
                oneNoteDocument.header.cbFreeSpaceInFreeChunkList);
        setHexMetadata(metadata, "cbLegacyExpectedFileLength",
                oneNoteDocument.header.cbLegacyExpectedFileLength);
        setHexMetadata(metadata, "cbLegacyFreeSpaceInFreeChunkList",
                oneNoteDocument.header.cbLegacyFreeSpaceInFreeChunkList);
        setHexMetadata(metadata, "crcName", oneNoteDocument.header.crcName);
        setHexMetadata(metadata, "cTransactionsInLog",
                oneNoteDocument.header.cTransactionsInLog);
        setHexMetadata(metadata, "ffvLastCodeThatWroteToThisFile",
                oneNoteDocument.header.ffvLastCodeThatWroteToThisFile);
        setHexMetadata(metadata, "ffvNewestCodeThatHasWrittenToThisFile",
                oneNoteDocument.header.ffvNewestCodeThatHasWrittenToThisFile);
        setHexMetadata(metadata, "ffvOldestCodeThatMayReadThisFile",
                oneNoteDocument.header.ffvOldestCodeThatMayReadThisFile);
        setHexMetadata(metadata, "ffvOldestCodeThatHasWrittenToThisFile",
                oneNoteDocument.header.ffvOldestCodeThatHasWrittenToThisFile);
        setHexMetadata(metadata, "grfDebugLogFlags",
                oneNoteDocument.header.grfDebugLogFlags);
        setHexMetadata(metadata, "nFileVersionGeneration",
                oneNoteDocument.header.nFileVersionGeneration);
        setHexMetadata(metadata, "rgbPlaceholder", oneNoteDocument.header.rgbPlaceholder);
    }

    /**
     * Stores {@code value} under {@code name} as {@code 0x} plus its hexadecimal digits.
     *
     * @param metadata the metadata to write to.
     * @param name     the metadata key.
     * @param value    the number to render with {@link Long#toHexString(long)}.
     */
    private static void setHexMetadata(Metadata metadata, String name, long value) {
        metadata.set(name, HEX_PREFIX + Long.toHexString(value));
    }

    /**
     * Copies what the tree walker collected (authors, timestamps) into the metadata. Values
     * still equal to the sentinel they are compared against here, and empty author
     * collections, are not written.
     *
     * @param oneNoteTreeWalker a tree walker on which {@code walkTree()} has already run.
     * @param metadata          the metadata to write to.
     */
    private static void addTreeWalkerMetadata(OneNoteTreeWalker oneNoteTreeWalker,
                                              Metadata metadata) {
        if (!oneNoteTreeWalker.getAuthors().isEmpty()) {
            metadata.set(Property.externalTextBag("authors"),
                    oneNoteTreeWalker.getAuthors().toArray(new String[0]));
        }
        if (!oneNoteTreeWalker.getMostRecentAuthors().isEmpty()) {
            metadata.set(Property.externalTextBag("mostRecentAuthors"),
                    oneNoteTreeWalker.getMostRecentAuthors().toArray(new String[0]));
        }
        if (!oneNoteTreeWalker.getOriginalAuthors().isEmpty()) {
            metadata.set(Property.externalTextBag("originalAuthors"),
                    oneNoteTreeWalker.getOriginalAuthors().toArray(new String[0]));
        }
        // NOTE(review): creationTimestamp is written in String.valueOf(...) form while
        // lastModifiedTimestamp below is written as epoch milliseconds. The two formats are
        // kept as they are because consumers may rely on them - confirm before unifying.
        if (!Instant.MAX.equals(oneNoteTreeWalker.getCreationTimestamp())) {
            metadata.set("creationTimestamp",
                    String.valueOf(oneNoteTreeWalker.getCreationTimestamp()));
        }
        if (!Instant.MIN.equals(oneNoteTreeWalker.getLastModifiedTimestamp())) {
            metadata.set("lastModifiedTimestamp", String.valueOf(
                    oneNoteTreeWalker.getLastModifiedTimestamp().toEpochMilli()));
        }
        if (oneNoteTreeWalker.getLastModified() > Long.MIN_VALUE) {
            metadata.set("lastModified",
                    String.valueOf(oneNoteTreeWalker.getLastModified()));
        }
    }

    /**
     * Create a OneNoteDocument object.
     * <p>
     * This won't actually have the binary data of any of the sections, but it's more of a
     * metadata structure that contains the general structure of the container and contains
     * offset positions of where to find the binary data we care about.
     * <p>
     * OneNote files are of format:
     * <ul>
     * <li>The header (section 2.3.1 in MS-ONESTORE) is the first 1024 bytes of the file. It
     * contains references to the other structures in the file as well as metadata about the
     * file.</li>
     * <li>The free chunk list (section 2.3.2 in MS-ONESTORE) defines where there are free
     * spaces in the file where data can be written.</li>
     * <li>The transaction log (section 2.3.3 in MS-ONESTORE) stores the state and length of
     * each file node list (section 2.4 in MS-ONESTORE) in the file.</li>
     * <li>The hashed chunk list (section 2.3.4 in MS-ONESTORE) stores read-only objects in
     * the file that can be referenced by multiple revisions (section 2.1.8 in
     * MS-ONESTORE).</li>
     * <li>The root file node list (section 2.1.14 in MS-ONESTORE) is the file node list that
     * is the root of the tree of all file node lists in the file.</li>
     * </ul>
     * <p>
     * In this method we first parse the header.
     * <p>
     * Unless the header reports itself as legacy, the pointer is then repositioned to
     * {@code header.fcrFileNodeListRoot} and the file node list found there is deserialized
     * into the document's root. For a legacy header only the header is populated.
     *
     * @param oneNoteDirectFileResource A random access file resource used as the source of the
     *                                  content.
     * @return A parsed one note document. This document does not contain any of the binary
     * data, rather it just contains the data pointers and metadata.
     * @throws IOException   Will throw IOException in typical IO issue situations.
     * @throws TikaException propagated from the underlying header and file node list
     *                       deserialization calls.
     */
    public OneNoteDocument createOneNoteDocumentFromDirectFileResource(
            OneNoteDirectFileResource oneNoteDirectFileResource)
            throws IOException, TikaException {
        OneNoteDocument oneNoteDocument = new OneNoteDocument();
        OneNotePtr oneNotePtr = new OneNotePtr(oneNoteDocument, oneNoteDirectFileResource);
        // First parse out the header.
        oneNoteDocument.header = oneNotePtr.deserializeHeader();
        if (!oneNoteDocument.header.isLegacy()) {
            // Now that we parsed the header, jump to the "root file node list" it references
            // and deserialize that list into the document's root.
            oneNotePtr.reposition(oneNoteDocument.header.fcrFileNodeListRoot);
            FileNodePtr curPath = new FileNodePtr();
            oneNotePtr.deserializeFileNodeList(oneNoteDocument.root, curPath);
        }
        return oneNoteDocument;
    }
}