blob: cf50a42a6ee49daa462f572dee27df6a0218d722 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.dwg;
import java.io.IOException;
import java.io.InputStream;
import java.util.Collections;
import java.util.Set;
import org.apache.poi.util.IOUtils;
import org.apache.poi.util.StringUtil;
import org.xml.sax.ContentHandler;
import org.xml.sax.SAXException;
import org.apache.tika.exception.TikaException;
import org.apache.tika.io.EndianUtils;
import org.apache.tika.metadata.Metadata;
import org.apache.tika.metadata.Property;
import org.apache.tika.metadata.TikaCoreProperties;
import org.apache.tika.mime.MediaType;
import org.apache.tika.parser.AbstractParser;
import org.apache.tika.parser.ParseContext;
import org.apache.tika.sax.XHTMLContentHandler;
/**
* DWG (CAD Drawing) parser. This is a very basic parser, which just
* looks for bits of the headers.
* Note that we use Apache POI for various parts of the processing, as
* lots of the low level string/int/short concepts are the same.
*/
public class DWGParser extends AbstractParser {
/**
* Serial version UID
*/
private static final long serialVersionUID = -7744232583079169119L;
/**
* The order of the fields in the header.
* get2004Props and get2007and2010Props read one string per entry, in
* this order, and handleHeader uses the read position as the index into
* this array. A null entry means the string at that position has no
* metadata property; handleHeader still writes its text to the XHTML.
*/
private static final Property[] HEADER_PROPERTIES_ENTRIES =
{TikaCoreProperties.TITLE, TikaCoreProperties.DESCRIPTION, TikaCoreProperties.CREATOR,
TikaCoreProperties.SUBJECT, TikaCoreProperties.COMMENTS,
TikaCoreProperties.MODIFIER, null, // Unknown?
TikaCoreProperties.RELATION, // Hyperlink
};
/**
* For the 2000 file, they're indexed.
* get2000Props reads a property index from the stream and uses it
* directly as the index into this array; the trailing comments give
* the index of each entry. Indexes 0x00 and 0x05 are null, i.e. they
* have no metadata property assigned here.
*/
private static final Property[] HEADER_2000_PROPERTIES_ENTRIES =
{null, TikaCoreProperties.RELATION, // 0x01
TikaCoreProperties.TITLE, // 0x02
TikaCoreProperties.DESCRIPTION, // 0x03
TikaCoreProperties.CREATOR, // 0x04
null, TikaCoreProperties.COMMENTS,// 0x06
TikaCoreProperties.SUBJECT, // 0x07
TikaCoreProperties.MODIFIER, // 0x08
};
/**
 * Text of the marker that skipTo2000PropertyInfoSection scans for in
 * AC1015 files; the property records are read from directly after it.
 */
private static final String HEADER_2000_PROPERTIES_MARKER_STR = "DWGPROPS COOKIE";
/**
 * Byte form of {@link #HEADER_2000_PROPERTIES_MARKER_STR}, one byte per
 * character. Allocated here and filled in by the static initializer below.
 */
private static final byte[] HEADER_2000_PROPERTIES_MARKER =
        new byte[HEADER_2000_PROPERTIES_MARKER_STR.length()];
/**
 * How far to skip after the last standard property, before
 * we find any custom properties that might be there. This many bytes
 * are discarded after the 4 padding bytes and before the property count.
 */
private static final int CUSTOM_PROPERTIES_SKIP = 20;
/**
 * The value of padding bytes other than 0 in some DWG files.
 */
private static final int[] CUSTOM_PROPERTIES_ALT_PADDING_VALUES = new int[]{0x2, 0, 0, 0};
/**
 * The media type this parser supports and records as the content type.
 * Declared final: it is assigned once and never reassigned in this class.
 */
private static final MediaType TYPE = MediaType.image("vnd.dwg");

static {
    // Populate the marker byte array from its string form.
    StringUtil.putCompressedUnicode(HEADER_2000_PROPERTIES_MARKER_STR,
            HEADER_2000_PROPERTIES_MARKER, 0);
}
/**
 * Reports the media types handled by this parser.
 *
 * @param context the parse context (not consulted)
 * @return a single-element set containing the DWG media type
 */
public Set<MediaType> getSupportedTypes(ParseContext context) {
    final Set<MediaType> supported = Collections.singleton(TYPE);
    return supported;
}
/**
 * Parses a DWG stream: reads the 128 byte file header, takes its first
 * six bytes as the version string and dispatches to the matching
 * property reader. The XHTML document is started before the version is
 * checked and ended after the properties have been handled.
 *
 * @param stream   the DWG data, positioned at the start of the file
 * @param handler  receives the XHTML output
 * @param metadata receives the content type and extracted properties
 * @param context  the parse context (not consulted here)
 * @throws TikaException if the version string is not one of
 *                       AC1015, AC1018, AC1021 or AC1024
 */
public void parse(InputStream stream, ContentHandler handler, Metadata metadata,
                  ParseContext context) throws IOException, TikaException, SAXException {
    final byte[] fileHeader = new byte[128];
    IOUtils.readFully(stream, fileHeader);
    final String release = new String(fileHeader, 0, 6, "US-ASCII");

    final XHTMLContentHandler body = new XHTMLContentHandler(handler, metadata);
    body.startDocument();

    final boolean is2000 = "AC1015".equals(release);
    final boolean is2004 = "AC1018".equals(release);
    final boolean is2007or2010 = "AC1021".equals(release) || "AC1024".equals(release);
    if (!is2000 && !is2004 && !is2007or2010) {
        throw new TikaException("Unsupported AutoCAD drawing version: " + release);
    }

    // Every supported version reports the same content type
    metadata.set(Metadata.CONTENT_TYPE, TYPE.toString());

    if (is2000) {
        // The 2000 format is located by scanning for a marker
        if (skipTo2000PropertyInfoSection(stream, fileHeader)) {
            get2000Props(stream, metadata, body);
        }
    } else if (skipToPropertyInfoSection(stream, fileHeader)) {
        // Later formats are located via an offset held in the header
        if (is2004) {
            get2004Props(stream, metadata, body);
        } else {
            get2007and2010Props(stream, metadata, body);
        }
    }

    body.endDocument();
}
/**
 * Reads the standard and custom properties of an AC1018 file. Every
 * string is read with read2004String, i.e. one byte per character.
 * Standard properties are passed to handleHeader together with their
 * position; custom properties are added to the metadata under their own
 * name, but only when both name and value are non-empty.
 */
private void get2004Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, TikaException, SAXException {
    // Standard properties, one string per table entry
    int slot = 0;
    while (slot < HEADER_PROPERTIES_ENTRIES.length) {
        handleHeader(slot, read2004String(stream), metadata, xhtml);
        slot++;
    }

    // Custom properties, stored as name / value pairs
    for (int remaining = skipToCustomProperties(stream); remaining > 0; remaining--) {
        final String key = read2004String(stream);
        final String val = read2004String(stream);
        if (key.isEmpty() || val.isEmpty()) {
            continue;
        }
        metadata.add(key, val);
    }
}
/**
 * Reads one length-prefixed, one-byte-per-character string: an unsigned
 * little endian 16 bit length followed by that many bytes. A single
 * trailing zero byte, if present, is dropped.
 *
 * @param stream the stream positioned at the length prefix
 * @return the decoded string; empty when the stored length is zero
 */
private String read2004String(InputStream stream) throws IOException, TikaException {
    int stringLen = EndianUtils.readUShortLE(stream);
    byte[] stringData = new byte[stringLen];
    IOUtils.readFully(stream, stringData);
    // Often but not always null terminated. The length check matters:
    // a zero length string has no last byte to inspect, and indexing
    // stringData[-1] would throw ArrayIndexOutOfBoundsException.
    if (stringLen > 0 && stringData[stringLen - 1] == 0) {
        stringLen--;
    }
    return StringUtil.getFromCompressedUnicode(stringData, 0, stringLen);
}
/**
 * Reads the standard and custom properties of an AC1021 / AC1024 file.
 * Every string is read with read2007and2010String, i.e. stored as UCS2,
 * so 16 bit "unicode". Standard properties are passed to handleHeader
 * together with their position; custom properties are added to the
 * metadata under their own name, but only when both name and value are
 * non-empty.
 */
private void get2007and2010Props(InputStream stream, Metadata metadata,
                                 XHTMLContentHandler xhtml)
        throws IOException, TikaException, SAXException {
    // Standard properties, one string per table entry
    int slot = 0;
    while (slot < HEADER_PROPERTIES_ENTRIES.length) {
        handleHeader(slot, read2007and2010String(stream), metadata, xhtml);
        slot++;
    }

    // Custom properties, stored as name / value pairs
    for (int remaining = skipToCustomProperties(stream); remaining > 0; remaining--) {
        final String key = read2007and2010String(stream);
        final String val = read2007and2010String(stream);
        if (key.isEmpty() || val.isEmpty()) {
            continue;
        }
        metadata.add(key, val);
    }
}
/**
 * Reads one length-prefixed 16 bit string: an unsigned little endian
 * 16 bit character count followed by two bytes per character. A single
 * trailing zero character, if present, is dropped.
 *
 * @param stream the stream positioned at the length prefix
 * @return the decoded string; may be empty
 */
private String read2007and2010String(InputStream stream) throws IOException, TikaException {
    int stringLen = EndianUtils.readUShortLE(stream);
    byte[] stringData = new byte[stringLen * 2];
    IOUtils.readFully(stream, stringData);
    String value = StringUtil.getFromUnicodeLE(stringData);
    // Some strings are null terminated. An empty string has no last
    // character, and charAt(-1) would throw, so check for that first.
    if (!value.isEmpty() && value.charAt(value.length() - 1) == 0) {
        value = value.substring(0, value.length() - 1);
    }
    return value;
}
/**
 * Reads the indexed property records of an AC1015 file. Each record is
 * an unsigned 16 bit property index, an unsigned 16 bit length, a one
 * byte value type and then the value bytes. At most 30 records are
 * read; a property index of 90 ends the loop early.
 * <p>
 * Only values of type 0x1e are decoded as strings. Those with an index
 * inside HEADER_2000_PROPERTIES_ENTRIES are written to the XHTML and,
 * when the table has a property for that index, added to the metadata.
 * Index 0x012c holds a custom property in "name=value" form.
 *
 * @param stream   the stream positioned just after the properties marker
 * @param metadata receives the extracted properties
 * @param xhtml    receives the text of the standard properties
 */
private void get2000Props(InputStream stream, Metadata metadata, XHTMLContentHandler xhtml)
        throws IOException, TikaException, SAXException {
    for (int propCount = 0; propCount < 30; propCount++) {
        int propIdx = EndianUtils.readUShortLE(stream);
        int length = EndianUtils.readUShortLE(stream);
        int valueType = stream.read();
        if (propIdx == 0x28) {
            // This one seems not to follow the pattern
            length = 0x19;
        } else if (propIdx == 90) {
            // We think this means the end of properties
            break;
        }
        byte[] value = new byte[length];
        IOUtils.readFully(stream, value);
        if (valueType != 0x1e) {
            // Not a normal string, no idea how to handle it
            continue;
        }
        String val = StringUtil.getFromCompressedUnicode(value, 0, length);
        // Is it one we can look up by index?
        if (propIdx < HEADER_2000_PROPERTIES_ENTRIES.length) {
            Property headerProp = HEADER_2000_PROPERTIES_ENTRIES[propIdx];
            // Some table slots are null (no known mapping). Those must
            // not be handed to metadata.add; as in handleHeader, the text
            // is still written out.
            if (headerProp != null) {
                metadata.add(headerProp, val);
            }
            xhtml.element("p", val);
        } else if (propIdx == 0x012c) {
            int splitAt = val.indexOf('=');
            if (splitAt > -1) {
                String propName = val.substring(0, splitAt);
                String propVal = val.substring(splitAt + 1);
                metadata.add(propName, propVal);
            }
        }
    }
}
/**
 * Records one standard header string. Null or empty values are ignored.
 * Otherwise the value is set on the metadata under the property found at
 * {@code headerNumber} in HEADER_PROPERTIES_ENTRIES, when that slot is
 * not null, and is always written to the XHTML as a paragraph.
 *
 * @param headerNumber position of the string in the header
 * @param value        the string that was read, may be null or empty
 * @param metadata     receives the property value
 * @param xhtml        receives the text
 */
private void handleHeader(int headerNumber, String value, Metadata metadata,
                          XHTMLContentHandler xhtml) throws SAXException {
    final boolean hasText = value != null && !value.isEmpty();
    if (hasText) {
        final Property target = HEADER_PROPERTIES_ENTRIES[headerNumber];
        if (target != null) {
            metadata.set(target, value);
        }
        xhtml.element("p", value);
    }
}
/**
 * Grab the offset, then skip there.
 * The offset is read as a little endian long at 0x20 in the header and
 * is taken to be measured from the start of the file, of which
 * {@code header.length} bytes have already been consumed.
 *
 * @param stream the stream positioned just after the header
 * @param header the header bytes already read from the stream
 * @return true if the stream was advanced to the section, false if the
 *         offset is not usable and the properties should be skipped
 */
private boolean skipToPropertyInfoSection(InputStream stream, byte[] header)
        throws IOException, TikaException {
    // The offset is stored in the header from 0x20 onwards
    long offsetToSection = EndianUtils.getLongLE(header, 0x20);
    // Sanity check the offset. Some files seem to use a different format,
    // and the offset isn't available at 0x20. Until we can work out how
    // to find the offset in those files, skip them if detected
    if (offsetToSection > 0xa00000L) {
        // Header should never be more than 10mb into the file, something is wrong
        return false;
    }
    // Zero means no offset. A negative value, or one that points inside
    // the header we have already read past, cannot be reached by skipping
    // forward, so do not pretend the stream is positioned at the section.
    if (offsetToSection < header.length) {
        return false;
    }
    // Work out how far to skip
    long toSkip = offsetToSection - header.length;
    while (toSkip > 0) {
        byte[] skip = new byte[Math.min((int) toSkip, 0x4000)];
        IOUtils.readFully(stream, skip);
        toSkip -= skip.length;
    }
    return true;
}
/**
 * We think it can be anywhere...
 * Scans forward byte by byte until the whole of
 * HEADER_2000_PROPERTIES_MARKER has been read, leaving the stream
 * positioned directly after it.
 *
 * @param stream the stream to scan
 * @param header the header bytes already read (not used)
 * @return true if the marker was found, false if the stream ended first
 */
private boolean skipTo2000PropertyInfoSection(InputStream stream, byte[] header)
        throws IOException {
    // Number of leading marker bytes matched by the most recent input
    int matched = 0;
    int val;
    while ((val = stream.read()) != -1) {
        if (val == HEADER_2000_PROPERTIES_MARKER[matched]) {
            matched++;
            if (matched == HEADER_2000_PROPERTIES_MARKER.length) {
                // Bingo, found it
                return true;
            }
        } else {
            // The mismatching byte may itself start the marker, so it has
            // to be re-tested rather than thrown away. The marker's first
            // byte ('D') occurs nowhere else in "DWGPROPS COOKIE", so the
            // only possible partial match at this point is that one byte.
            matched = (val == HEADER_2000_PROPERTIES_MARKER[0]) ? 1 : 0;
        }
    }
    return false;
}
/**
 * Moves past the gap between the standard and the custom properties and
 * reads the custom property count. Four padding bytes are read first;
 * they must either all be zero or equal
 * CUSTOM_PROPERTIES_ALT_PADDING_VALUES. If so, CUSTOM_PROPERTIES_SKIP
 * further bytes are discarded and an unsigned little endian 16 bit count
 * is read.
 *
 * @param stream the stream positioned after the last standard property
 * @return the number of custom properties, or 0 when the padding does not
 *         match or the count is not between 1 and 0x7e inclusive
 */
private int skipToCustomProperties(InputStream stream) throws IOException, TikaException {
    final byte[] lead = new byte[4];
    IOUtils.readFully(stream, lead);

    boolean allZero = true;
    boolean altPattern = true;
    for (int i = 0; i < lead.length; i++) {
        allZero &= lead[i] == 0;
        altPattern &= lead[i] == CUSTOM_PROPERTIES_ALT_PADDING_VALUES[i];
    }
    if (!allZero && !altPattern) {
        // No padding. That probably means no custom props
        return 0;
    }

    // Looks hopeful, skip on to where the count should be
    IOUtils.readFully(stream, new byte[CUSTOM_PROPERTIES_SKIP]);
    final int count = EndianUtils.readUShortLE(stream);

    // Only trust a plausible count; otherwise report no properties
    return (count > 0 && count < 0x7f) ? count : 0;
}
}