blob: bef41c9864efd585f5e53a65cdc748068e140f8e [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.onenote;
import java.io.IOException;
import java.nio.ByteBuffer;
import java.nio.charset.StandardCharsets;
import java.time.Instant;
import java.time.LocalDateTime;
import java.time.Month;
import java.time.ZoneOffset;
import java.util.ArrayList;
import java.util.Collections;
import java.util.HashMap;
import java.util.HashSet;
import java.util.List;
import java.util.Map;
import java.util.Set;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
import org.apache.commons.io.IOUtils;
import org.apache.commons.lang3.tuple.Pair;
import org.xml.sax.SAXException;
import org.xml.sax.helpers.AttributesImpl;
import org.apache.tika.exception.TikaException;
import org.apache.tika.exception.TikaMemoryLimitException;
import org.apache.tika.extractor.EmbeddedDocumentExtractor;
import org.apache.tika.extractor.EmbeddedDocumentUtil;
import org.apache.tika.io.TikaInputStream;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.EmbeddedContentHandler;
import org.apache.tika.sax.XHTMLContentHandler;
/**
 * Walks the OneNote file node tree and builds a Map describing it while it goes.
 * <p>
 * While walking, text found in properties is written to the supplied
 * {@link XHTMLContentHandler}, file data store objects are handed to the
 * {@link EmbeddedDocumentExtractor}, and author names and timestamps are collected so they can
 * be read through the getters afterwards.
 */
class OneNoteTreeWalker {
    private static final String P = "p";

    /**
     * See spec MS-ONE - 2.3.1 - TIME32 - epoch of jan 1 1980 UTC.
     * Number of seconds between that epoch and {@link Instant#EPOCH}.
     */
    private static final long TIME32_EPOCH_DIFF_1980 =
            LocalDateTime.of(1980, Month.JANUARY, 1, 0, 0).toEpochSecond(ZoneOffset.UTC);

    /**
     * See spec MS-DTYP - 2.3.3 - DATETIME dates are based on epoch of jan 1 1601 UTC.
     * Number of seconds between that epoch and {@link Instant#EPOCH} (a negative value).
     */
    private static final long DATETIME_EPOCH_DIFF_1601 =
            LocalDateTime.of(1601, Month.JANUARY, 1, 0, 0).toEpochSecond(ZoneOffset.UTC);

    /**
     * Divisor used to turn a LastModifiedTimeStamp scalar into whole seconds.
     */
    private static final long DATETIME_TICKS_PER_SECOND = 10000000L;

    /**
     * Matches a hyperlink field: group 1 is the quoted target, group 2 the trailing text.
     */
    private static final Pattern HYPERLINK_PATTERN =
            Pattern.compile("\uFDDFHYPERLINK\\s+\"([^\"]+)\"([^\"]+)$");

    private final Metadata parentMetadata;
    private final EmbeddedDocumentExtractor embeddedDocumentExtractor;
    private final Set<String> authors = new HashSet<>();
    private final Set<String> mostRecentAuthors = new HashSet<>();
    private final Set<String> originalAuthors = new HashSet<>();
    private final OneNoteTreeWalkerOptions options;
    private final OneNoteDocument oneNoteDocument;
    private final OneNoteDirectFileResource dif;
    private final XHTMLContentHandler xhtml;
    private final Pair<Long, ExtendedGUID> roleAndContext;
    private Instant lastModifiedTimestamp = Instant.MIN;
    private long creationTimestamp = Long.MAX_VALUE;
    private long lastModified = Long.MIN_VALUE;
    // Set when an AuthorMostRecent / AuthorOriginal property is seen; consumed (and cleared)
    // by the next Author property to decide which author set it belongs to.
    private boolean mostRecentAuthorProp = false;
    private boolean originalAuthorProp = false;

    /**
     * Create a one note tree walker.
     *
     * @param options         The options for how to walk this tree.
     * @param oneNoteDocument The one note document we want to walk.
     * @param dif             The random file access structure we read and reposition while
     *                        extracting the content.
     * @param xhtml           The XHTMLContentHandler to populate as you walk the tree.
     * @param parentMetadata  Metadata of the containing document; exceptions hit while reading
     *                        embedded streams are recorded here.
     * @param parseContext    The parse context used to look up the embedded document extractor.
     * @param roleAndContext  The role and context value we want to use when crawling. Set this
     *                        to null if you are crawling all root file nodes, and don't care
     *                        about revisions.
     */
    public OneNoteTreeWalker(OneNoteTreeWalkerOptions options, OneNoteDocument oneNoteDocument,
                             OneNoteDirectFileResource dif, XHTMLContentHandler xhtml,
                             Metadata parentMetadata, ParseContext parseContext,
                             Pair<Long, ExtendedGUID> roleAndContext) {
        this.options = options;
        this.oneNoteDocument = oneNoteDocument;
        this.dif = dif;
        this.roleAndContext = roleAndContext;
        this.xhtml = xhtml;
        this.parentMetadata = parentMetadata;
        this.embeddedDocumentExtractor =
                EmbeddedDocumentUtil.getEmbeddedDocumentExtractor(parseContext);
    }

    /**
     * Parse the tree.
     *
     * @return Map of the fully parsed one note document, holding the document header under
     * "header" and the walked root file nodes under "rootFileNodes".
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    public Map<String, Object> walkTree() throws IOException, TikaException, SAXException {
        Map<String, Object> structure = new HashMap<>();
        structure.put("header", oneNoteDocument.header);
        structure.put("rootFileNodes", walkRootFileNodes());
        return structure;
    }

    /**
     * Walk the root file nodes, depending on the options will crawl revisions or the entire
     * revision tree.
     *
     * @return List of the root file nodes.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    public List<Map<String, Object>> walkRootFileNodes()
            throws IOException, TikaException, SAXException {
        List<Map<String, Object>> res = new ArrayList<>();
        if (options.isCrawlAllFileNodesFromRoot()) {
            res.add(walkFileNodeList(oneNoteDocument.root));
            return res;
        }
        for (ExtendedGUID revisionListGuid : oneNoteDocument.revisionListOrder) {
            Map<String, Object> structure = new HashMap<>();
            structure.put("oneNoteType", "Revision");
            structure.put("revisionListGuid", revisionListGuid.toString());
            FileNodePtr fileNodePtr = oneNoteDocument.revisionManifestLists.get(revisionListGuid);
            structure.put("fileNode", walkRevision(fileNodePtr));
            res.add(structure);
        }
        return res;
    }

    /**
     * Does the revision role map have this revision role id.
     *
     * @param rid          The revision id.
     * @param revisionRole The revision role Long,GUID pair.
     * @return True if the role mapped to {@code rid} equals {@code revisionRole}, false if it
     * differs or no role is mapped.
     */
    private boolean hasRevisionRole(ExtendedGUID rid, Pair<Long, ExtendedGUID> revisionRole) {
        Pair<Long, ExtendedGUID> where = oneNoteDocument.revisionRoleMap.get(rid);
        return where != null && where.equals(revisionRole);
    }

    /**
     * Walk revisions.
     *
     * @param fileNodePtr The file node pointer to start with.
     * @return A map of the parsed data.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    private Map<String, Object> walkRevision(FileNodePtr fileNodePtr)
            throws IOException, TikaException, SAXException {
        Map<String, Object> structure = new HashMap<>();
        structure.put("oneNoteType", "FileNodePointer");
        structure.put("offsets", fileNodePtr.nodeListPositions);
        FileNode revisionFileNode = fileNodePtr.dereference(oneNoteDocument);
        structure.put("fileNodeId", revisionFileNode.id);
        if (revisionFileNode.gosid != null) {
            structure.put("gosid", revisionFileNode.gosid.toString());
        }
        structure.put("subType", revisionFileNode.subType);
        structure.put("size", revisionFileNode.size);
        structure.put("isFileData", revisionFileNode.isFileData);

        // Scan the children from last to first so that, when only the latest revision is
        // wanted, the first match found is the one closest to the end of the list.
        Set<ExtendedGUID> validRevisions = new HashSet<>();
        for (int i = revisionFileNode.childFileNodeList.children.size() - 1; i >= 0; --i) {
            FileNode child = revisionFileNode.childFileNodeList.children.get(i);
            if (roleAndContext != null && hasRevisionRole(child.gosid, roleAndContext)) {
                validRevisions.add(child.gosid);
                if (options.isOnlyLatestRevision()) {
                    break;
                }
            }
        }

        List<Map<String, Object>> children = new ArrayList<>();
        // Re-evaluated at every revision manifest start node; applies to the nodes after it.
        boolean okGroup = false;
        for (FileNode child : revisionFileNode.childFileNodeList.children) {
            if (child.id == FndStructureConstants.RevisionManifestStart4FND ||
                    child.id == FndStructureConstants.RevisionManifestStart6FND ||
                    child.id == FndStructureConstants.RevisionManifestStart7FND) {
                okGroup = validRevisions.contains(child.gosid);
            }
            if (!okGroup) {
                continue;
            }
            boolean isRootObjectReference =
                    child.id == FndStructureConstants.RootObjectReference2FNDX ||
                            child.id == FndStructureConstants.RootObjectReference3FND;
            if (isRootObjectReference &&
                    child.subType.rootObjectReference.rootObjectReferenceBase.rootRole == 1) {
                FileNodePtr childFileNodePointer = oneNoteDocument.guidToObject.get(child.gosid);
                children.add(walkFileNodePtr(childFileNodePointer));
            }
        }
        if (!children.isEmpty()) {
            Map<String, Object> childFileNodeListMap = new HashMap<>();
            childFileNodeListMap.put("fileNodeListHeader",
                    revisionFileNode.childFileNodeList.fileNodeListHeader);
            childFileNodeListMap.put("children", children);
            structure.put("revisionFileNodeList", childFileNodeListMap);
        }
        return structure;
    }

    /**
     * Walk the file node pointer.
     *
     * @param fileNodePtr The file node pointer, may be null.
     * @return Returns a map of the main data, or an empty map if the pointer is null.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    public Map<String, Object> walkFileNodePtr(FileNodePtr fileNodePtr)
            throws IOException, TikaException, SAXException {
        if (fileNodePtr == null) {
            return Collections.emptyMap();
        }
        return walkFileNode(fileNodePtr.dereference(oneNoteDocument));
    }

    /**
     * Walk the file node list.
     *
     * @param fileNodeList The file node list to parse.
     * @return The result; "children" is only present when the list has children.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    public Map<String, Object> walkFileNodeList(FileNodeList fileNodeList)
            throws IOException, TikaException, SAXException {
        Map<String, Object> structure = new HashMap<>();
        structure.put("oneNoteType", "FileNodeList");
        structure.put("fileNodeListHeader", fileNodeList.fileNodeListHeader);
        if (!fileNodeList.children.isEmpty()) {
            List<Map<String, Object>> children = new ArrayList<>();
            for (FileNode child : fileNodeList.children) {
                children.add(walkFileNode(child));
            }
            structure.put("children", children);
        }
        return structure;
    }

    /**
     * Walk a single file node.
     *
     * @param fileNode The file node.
     * @return Map which is result of the parsed file node.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    public Map<String, Object> walkFileNode(FileNode fileNode)
            throws IOException, TikaException, SAXException {
        Map<String, Object> structure = new HashMap<>();
        structure.put("oneNoteType", "FileNode");
        // walkRevision treats gosid as optional; do the same here instead of failing with a
        // NullPointerException.
        if (fileNode.gosid != null) {
            structure.put("gosid", fileNode.gosid.toString());
        }
        structure.put("size", fileNode.size);
        structure.put("fileNodeId", "0x" + Long.toHexString(fileNode.id));
        structure.put("fileNodeIdName", FndStructureConstants.nameOf(fileNode.id));
        structure.put("fileNodeBaseType", "0x" + Long.toHexString(fileNode.baseType));
        structure.put("isFileData", fileNode.isFileData);
        structure.put("idDesc", fileNode.idDesc);
        if (fileNode.childFileNodeList != null &&
                fileNode.childFileNodeList.fileNodeListHeader != null) {
            structure.put("childFileNodeList", walkFileNodeList(fileNode.childFileNodeList));
        }
        if (fileNode.propertySet != null) {
            List<Map<String, Object>> propSet = processPropertySet(fileNode.propertySet);
            if (!propSet.isEmpty()) {
                structure.put("propertySet", propSet);
            }
        }
        FileDataStoreObjectReference fileDataRef = fileNode.subType.fileDataStoreObjectReference;
        if (fileDataRef.ref != null &&
                !FileChunkReference.nil().equals(fileDataRef.ref.fileData)) {
            structure.put("fileDataStoreObjectReference",
                    walkFileDataStoreObjectReference(fileDataRef));
        }
        return structure;
    }

    /**
     * Walk a file data store object reference and pass the referenced bytes to the embedded
     * document handling.
     *
     * @param fileDataStoreObjectReference The file data store object reference we are parsing.
     * @return Map containing parsed content.
     * @throws IOException   Can throw these when manipulating the seekable byte channel.
     * @throws TikaException If the declared length is negative, larger than the document, or
     *                       too large to fit a single buffer.
     */
    private Map<String, Object> walkFileDataStoreObjectReference(
            FileDataStoreObjectReference fileDataStoreObjectReference)
            throws IOException, SAXException, TikaException {
        Map<String, Object> structure = new HashMap<>();
        OneNotePtr content = new OneNotePtr(oneNoteDocument, dif);
        content.reposition(fileDataStoreObjectReference.ref.fileData);
        long cb = fileDataStoreObjectReference.ref.fileData.cb;
        ensureFitsInDocument(cb);
        // The length is narrowed to int below; refuse values the cast would silently mangle.
        if (cb > Integer.MAX_VALUE) {
            throw new TikaMemoryLimitException(
                    "File data store cb " + cb + " exceeds maximum supported length: " +
                            Integer.MAX_VALUE);
        }
        handleEmbedded((int) cb);
        structure.put("fileDataStoreObjectMetadata", fileDataStoreObjectReference);
        return structure;
    }

    /**
     * Reads {@code length} bytes from the current position of the file resource and hands them
     * to the embedded document extractor, followed by an empty {@code div} of class "embedded".
     * <p>
     * An IOException while reading is recorded in the parent metadata rather than rethrown.
     *
     * @param length Number of bytes to read.
     */
    private void handleEmbedded(int length) throws TikaException, IOException, SAXException {
        ByteBuffer buf;
        try {
            buf = ByteBuffer.allocate(length);
            dif.read(buf);
        } catch (IOException e) {
            //store this exception in the parent's metadata
            EmbeddedDocumentUtil.recordEmbeddedStreamException(e, parentMetadata);
            return;
        }
        Metadata embeddedMetadata = new Metadata();
        // Give the extractor the chance to decline this document before parsing it.
        if (!embeddedDocumentExtractor.shouldParseEmbedded(embeddedMetadata)) {
            return;
        }
        TikaInputStream stream = null;
        try {
            stream = TikaInputStream.get(buf.array());
            embeddedDocumentExtractor
                    .parseEmbedded(stream, new EmbeddedContentHandler(xhtml), embeddedMetadata,
                            false);
            AttributesImpl attributes = new AttributesImpl();
            attributes.addAttribute("", "class", "class", "CDATA", "embedded");
            xhtml.startElement("div", attributes);
            xhtml.endElement("div");
        } finally {
            IOUtils.closeQuietly(stream);
        }
    }

    /**
     * Process every property value of a property set.
     *
     * @param propertySet The property set to process.
     * @return One map per property value, in order; empty if the set holds no property data.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    private List<Map<String, Object>> processPropertySet(PropertySet propertySet)
            throws IOException, TikaException, SAXException {
        List<Map<String, Object>> propValues = new ArrayList<>();
        if (propertySet.rgPridsData == null) {
            return propValues;
        }
        for (PropertyValue propertyValue : propertySet.rgPridsData) {
            propValues.add(processPropertyValue(propertyValue));
        }
        return propValues;
    }

    /**
     * Is this property a binary property?
     *
     * @param property The property.
     * @return Is it binary?
     */
    private boolean propertyIsBinary(OneNotePropertyEnum property) {
        return property == OneNotePropertyEnum.RgOutlineIndentDistance ||
                property == OneNotePropertyEnum.NotebookManagementEntityGuid ||
                property == OneNotePropertyEnum.RichEditTextUnicode;
    }

    /**
     * Process a property value and populate a map containing all the property value data.
     * <p>
     * Timestamps and authors are folded into this walker's state, and relevant text is written
     * to the XHTML handler for easy search engine parsing.
     *
     * @param propertyValue The property value we are parsing.
     * @return The map parsed by this property value.
     * @throws IOException Can throw these when manipulating the seekable byte channel.
     */
    private Map<String, Object> processPropertyValue(PropertyValue propertyValue)
            throws IOException, TikaException, SAXException {
        Map<String, Object> propMap = new HashMap<>();
        propMap.put("oneNoteType", "PropertyValue");
        propMap.put("propertyId", propertyValue.propertyId.toString());

        OneNotePropertyEnum propertyEnum = propertyValue.propertyId.propertyEnum;
        if (propertyEnum == OneNotePropertyEnum.LastModifiedTimeStamp) {
            long fullval = propertyValue.scalar;
            Instant instant = Instant.ofEpochSecond(
                    fullval / DATETIME_TICKS_PER_SECOND + DATETIME_EPOCH_DIFF_1601);
            if (instant.isAfter(lastModifiedTimestamp)) {
                lastModifiedTimestamp = instant;
            }
        } else if (propertyEnum == OneNotePropertyEnum.CreationTimeStamp) {
            // add the TIME32_EPOCH_DIFF_1980 because OneNote TIME32 epoch time is per 1980, not
            // 1970
            long creationTs = propertyValue.scalar + TIME32_EPOCH_DIFF_1980;
            if (creationTs < creationTimestamp) {
                creationTimestamp = creationTs;
            }
        } else if (propertyEnum == OneNotePropertyEnum.LastModifiedTime) {
            // add the TIME32_EPOCH_DIFF_1980 because OneNote TIME32 epoch time is per 1980, not
            // 1970
            long lastMod = propertyValue.scalar + TIME32_EPOCH_DIFF_1980;
            if (lastMod > lastModified) {
                lastModified = lastMod;
            }
        } else if (propertyEnum == OneNotePropertyEnum.Author) {
            recordAuthor(propertyValue, propMap);
        } else if (propertyEnum == OneNotePropertyEnum.AuthorMostRecent) {
            mostRecentAuthorProp = true;
        } else if (propertyEnum == OneNotePropertyEnum.AuthorOriginal) {
            originalAuthorProp = true;
        } else if (propertyValue.propertyId.type > 0 && propertyValue.propertyId.type <= 6) {
            propMap.put("scalar", propertyValue.scalar);
        } else {
            processRawData(propertyValue, propertyEnum, propMap);
        }

        if (propertyValue.compactIDs != null) {
            List<Map<String, Object>> children = new ArrayList<>();
            for (CompactID compactID : propertyValue.compactIDs) {
                FileNodePtr childFileNodePointer =
                        oneNoteDocument.guidToObject.get(compactID.guid);
                children.add(walkFileNodePtr(childFileNodePointer));
            }
            if (!children.isEmpty()) {
                propMap.put("children", children);
            }
        }
        if (propertyValue.propertySet != null && propertyValue.propertySet.rgPridsData != null) {
            List<Map<String, Object>> propSet = processPropertySet(propertyValue.propertySet);
            if (!propSet.isEmpty()) {
                propMap.put("propertySet", propSet);
            }
        }
        return propMap;
    }

    /**
     * Reads an Author property and files it under most-recent, original or plain authors,
     * depending on which marker property (if any) preceded it. Both markers are cleared.
     */
    private void recordAuthor(PropertyValue propertyValue, Map<String, Object> propMap)
            throws IOException, TikaMemoryLimitException {
        String author = getAuthor(propertyValue);
        if (mostRecentAuthorProp) {
            propMap.put("MostRecentAuthor", author);
            mostRecentAuthors.add(author);
        } else if (originalAuthorProp) {
            propMap.put("OriginalAuthor", author);
            originalAuthors.add(author);
        } else {
            propMap.put("Author", author);
            authors.add(author);
        }
        mostRecentAuthorProp = false;
        originalAuthorProp = false;
    }

    /**
     * Handles a property whose value lives in raw data: decodes it as ASCII or UTF-16LE text,
     * or treats it as binary, and records the outcome in {@code propMap}.
     */
    private void processRawData(PropertyValue propertyValue, OneNotePropertyEnum propertyEnum,
                                Map<String, Object> propMap)
            throws IOException, TikaException, SAXException {
        OneNotePtr content = new OneNotePtr(oneNoteDocument, dif);
        content.reposition(propertyValue.rawData);
        boolean isBinary = propertyIsBinary(propertyEnum);
        propMap.put("isBinary", isBinary);

        if (propertyEnum == OneNotePropertyEnum.TextExtendedAscii) {
            String text = new String(readContent(content), StandardCharsets.US_ASCII);
            propMap.put("dataAscii", text);
            writeParagraph(text);
        } else if (!isBinary) {
            // Every other non-binary property is decoded as UTF-16LE, whatever its length.
            String text = new String(readContent(content), StandardCharsets.UTF_16LE);
            propMap.put("dataUnicode16LE", text);
            if (options.getUtf16PropertiesToPrint().contains(propertyValue.propertyId)) {
                writeParagraph(text);
            }
        } else {
            ensureFitsInDocument(content.size());
            if (propertyEnum == OneNotePropertyEnum.RichEditTextUnicode) {
                handleRichEditTextUnicode(content.size());
            }
            //TODO -- the remaining binary properties seem to be somewhat broken font files
            //and other odds and ends...what are they and how should we process them?
            //handleEmbedded(content.size());
        }
    }

    /**
     * Reads all bytes {@code content} spans from the current position of the file resource.
     *
     * @param content Pointer that has already been repositioned onto the data to read.
     * @return The bytes read.
     * @throws TikaMemoryLimitException If the size is negative or larger than the document.
     */
    private byte[] readContent(OneNotePtr content) throws IOException, TikaMemoryLimitException {
        ensureFitsInDocument(content.size());
        ByteBuffer buf = ByteBuffer.allocate(content.size());
        dif.read(buf);
        return buf.array();
    }

    /**
     * Rejects lengths that cannot describe data inside this document, so that a corrupt length
     * never reaches a buffer allocation.
     *
     * @param size The declared length in bytes.
     * @throws TikaMemoryLimitException If the size is negative or larger than the document.
     */
    private void ensureFitsInDocument(long size) throws IOException, TikaMemoryLimitException {
        if (size < 0) {
            throw new TikaMemoryLimitException("File data store cb " + size + " is negative");
        }
        long documentSize = dif.size();
        if (size > documentSize) {
            throw new TikaMemoryLimitException(
                    "File data store cb " + size + " exceeds document size: " + documentSize);
        }
    }

    /**
     * Writes {@code text} to the XHTML handler wrapped in a paragraph element.
     */
    private void writeParagraph(String text) throws SAXException {
        xhtml.startElement(P);
        xhtml.characters(text);
        xhtml.endElement(P);
    }

    /**
     * Reads the raw data of an author property and decodes it as UTF-16LE.
     *
     * @param propertyValue The property value of an author.
     * @return The author string decoded from UTF-16LE.
     */
    private String getAuthor(PropertyValue propertyValue)
            throws IOException, TikaMemoryLimitException {
        OneNotePtr content = new OneNotePtr(oneNoteDocument, dif);
        content.reposition(propertyValue.rawData);
        return new String(readContent(content), StandardCharsets.UTF_16LE);
    }

    /**
     * Reads a null-ended UTF-16LE string and writes it either as a link, when it matches the
     * hyperlink pattern, or as a paragraph.
     * <p>
     * Nothing is written when the data starts with the terminator or contains no terminator
     * at all.
     *
     * @param length Number of bytes to read from the current position of the file resource.
     */
    private void handleRichEditTextUnicode(int length)
            throws SAXException, IOException, TikaException {
        //this is a null-ended UTF-16LE string
        ByteBuffer buf = ByteBuffer.allocate(length);
        dif.read(buf);
        byte[] arr = buf.array();
        //look for the first null, stepping one 2-byte code unit at a time
        int firstNull = 0;
        for (int i = 0; i < arr.length - 1; i += 2) {
            if (arr[i] == 0 && arr[i + 1] == 0) {
                firstNull = i;
                break;
            }
        }
        if (firstNull == 0) {
            return;
        }
        String txt = new String(arr, 0, firstNull, StandardCharsets.UTF_16LE);
        Matcher m = HYPERLINK_PATTERN.matcher(txt);
        if (m.find()) {
            xhtml.startElement("a", "href", m.group(1));
            xhtml.characters(m.group(2));
            xhtml.endElement("a");
        } else {
            writeParagraph(txt);
        }
    }

    /**
     * @return Authors seen without a preceding most-recent / original marker property.
     */
    public Set<String> getAuthors() {
        return authors;
    }

    /**
     * @return Authors seen directly after an AuthorMostRecent property.
     */
    public Set<String> getMostRecentAuthors() {
        return mostRecentAuthors;
    }

    /**
     * @return Authors seen directly after an AuthorOriginal property.
     */
    public Set<String> getOriginalAuthors() {
        return originalAuthors;
    }

    /**
     * @return The latest LastModifiedTimeStamp seen; initially {@link Instant#MIN}.
     */
    public Instant getLastModifiedTimestamp() {
        return lastModifiedTimestamp;
    }

    public void setLastModifiedTimestamp(Instant lastModifiedTimestamp) {
        this.lastModifiedTimestamp = lastModifiedTimestamp;
    }

    /**
     * @return The latest LastModifiedTime seen, shifted by the 1980 epoch offset; initially
     * {@link Long#MIN_VALUE}.
     */
    public long getLastModified() {
        return lastModified;
    }

    public void setLastModified(long lastModified) {
        this.lastModified = lastModified;
    }

    /**
     * @return The earliest CreationTimeStamp seen, shifted by the 1980 epoch offset; initially
     * {@link Long#MAX_VALUE}.
     */
    public long getCreationTimestamp() {
        return creationTimestamp;
    }

    public void setCreationTimestamp(long creationTimestamp) {
        this.creationTimestamp = creationTimestamp;
    }
}