blob: d9176d96b44364f1a2c413b452501c9c02120782 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tika.parser.microsoft.chm;
import static java.nio.charset.StandardCharsets.UTF_8;
import org.apache.tika.exception.TikaException;
/**
 * Directory header. The directory starts with a header; its format is as
 * follows:
 *
 * <pre>
 * 0000: char[4] 'ITSP'
 * 0004: DWORD   Version number 1
 * 0008: DWORD   Length of the directory header
 * 000C: DWORD   $0a (unknown)
 * 0010: DWORD   $1000 Directory chunk size
 * 0014: DWORD   "Density" of quickref section, usually 2
 * 0018: DWORD   Depth of the index tree - 1 if there is no index, 2 if there
 *               is one level of PMGI chunks
 * 001C: DWORD   Chunk number of root index chunk, -1 if there is none (though
 *               at least one file has 0 despite there being no index chunk,
 *               probably a bug)
 * 0020: DWORD   Chunk number of first PMGL (listing) chunk
 * 0024: DWORD   Chunk number of last PMGL (listing) chunk
 * 0028: DWORD   -1 (unknown)
 * 002C: DWORD   Number of directory chunks (total)
 * 0030: DWORD   Windows language ID
 * 0034: GUID    {5D02926A-212E-11D0-9DF9-00A0C922E6EC}
 * 0044: DWORD   $54 (This is the length again)
 * 0048: DWORD   -1 (unknown)
 * 004C: DWORD   -1 (unknown)
 * 0050: DWORD   -1 (unknown)
 * </pre>
 */
public class ChmItspHeader implements ChmAccessor<ChmItspHeader> {
    // TODO: refactor all unmarshals
    private static final long serialVersionUID = 1962394421998181341L;

    /* Header fields; the trailing comment is the byte offset inside the header. */
    private byte[] signature; /* 0 (ITSP) */
    private int version; /* 4 */
    private int header_len; /* 8 */
    private int unknown_000c; /* c */
    private long block_len; /* 10 */
    private int blockidx_intvl; /* 14 */
    private int index_depth; /* 18 */
    private int index_root; /* 1c */
    private int index_head; /* 20 */
    private int unknown_0024; /* 24 */
    private long num_blocks; /* 28 */
    private int unknown_002c; /* 2c */
    private long lang_id; /* 30 */
    private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */
    private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */

    /* local usage: read cursor into the buffer currently being parsed */
    private int dataRemained;
    private int currentPlace = 0;

    /**
     * Creates a header whose signature is pre-set to the expected
     * {@code ITSP} marker; all other fields keep their default values until
     * {@link #parse(byte[], ChmItspHeader)} fills them in.
     */
    public ChmItspHeader() {
        signature = ChmConstants.ITSP.getBytes(UTF_8);
    }

    /**
     * Returns a multi-line, human readable dump of every header field.
     * The two binary fields ({@code system_uuid} and {@code unknown_0044})
     * are rendered as lower-case hex so that their content, rather than the
     * identity of the backing array, is shown.
     *
     * @return textual dump of the header
     */
    @Override
    public String toString() {
        final String eol = System.lineSeparator();
        StringBuilder sb = new StringBuilder();
        sb.append("[ signature:=")
                .append(new String(getSignature(), UTF_8))
                .append(eol);
        sb.append("version:=\t")
                .append(getVersion())
                .append(eol);
        sb.append("header_len:=\t")
                .append(getHeader_len())
                .append(eol);
        sb.append("unknown_00c:=\t")
                .append(getUnknown_000c())
                .append(eol);
        sb.append("block_len:=\t")
                .append(getBlock_len())
                .append(" [directory chunk size]")
                .append(eol);
        sb.append("blockidx_intvl:=")
                .append(getBlockidx_intvl())
                .append(", density of quickref section, usually 2")
                .append(eol);
        sb.append("index_depth:=\t")
                .append(getIndex_depth())
                .append(", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI")
                .append(" chunk")
                .append(eol);
        sb.append("index_root:=\t")
                .append(getIndex_root())
                .append(", chunk number of root index chunk, -1 if there is none")
                .append(eol);
        sb.append("index_head:=\t")
                .append(getIndex_head())
                .append(", chunk number of first PMGL (listing) chunk")
                .append(eol);
        sb.append("unknown_0024:=\t")
                .append(getUnknown_0024())
                .append(", chunk number of last PMGL (listing) chunk")
                .append(eol);
        sb.append("num_blocks:=\t")
                .append(getNum_blocks())
                .append(", -1 (unknown)")
                .append(eol);
        sb.append("unknown_002c:=\t")
                .append(getUnknown_002c())
                .append(", number of directory chunks (total)")
                .append(eol);
        sb.append("lang_id:=\t")
                .append(getLang_id())
                .append(" - ")
                .append(ChmCommons.getLanguage(getLang_id()))
                .append(eol);
        sb.append("system_uuid:=")
                .append(toHex(getSystem_uuid()))
                .append(eol);
        sb.append("unknown_0044:=")
                .append(toHex(getUnknown_0044()))
                .append(" ]");
        return sb.toString();
    }

    /**
     * Renders a byte array as lower-case hex, two digits per byte.
     *
     * @param bytes array to render, may be {@code null}
     * @return hex string, or the text {@code "null"} for a {@code null} array
     */
    private static String toHex(byte[] bytes) {
        if (bytes == null) {
            return "null";
        }
        StringBuilder hex = new StringBuilder(bytes.length * 2);
        for (byte b : bytes) {
            hex.append(Character.forDigit((b >> 4) & 0x0f, 16));
            hex.append(Character.forDigit(b & 0x0f, 16));
        }
        return hex.toString();
    }

    /**
     * Starts a new read of {@code data}: resets the cursor to the beginning
     * of the buffer and copies the first {@code count} bytes into the
     * signature of {@code chmItspHeader}.
     *
     * @param data          raw header bytes
     * @param chmItspHeader header whose signature is filled
     * @param count         number of signature bytes to copy
     * @throws TikaException if an argument is null or {@code data} holds
     *                       fewer than {@code count} bytes
     */
    private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader, int count)
            throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        ChmAssert.assertChmAccessorNotNull(chmItspHeader);
        if (count < 0 || count > data.length) {
            throw new TikaException("signature length " + count + " does not fit into "
                    + data.length + " bytes of data");
        }
        // The signature is always read from offset 0, so the cursor has to
        // restart there as well; otherwise re-using a header instance for a
        // second parse would continue reading where the previous one stopped.
        this.setCurrentPlace(0);
        this.setDataRemained(data.length);
        if (chmItspHeader.signature == null || chmItspHeader.signature.length < count) {
            chmItspHeader.signature = new byte[count];
        }
        System.arraycopy(data, 0, chmItspHeader.signature, 0, count);
        this.setCurrentPlace(this.getCurrentPlace() + count);
        this.setDataRemained(this.getDataRemained() - count);
    }

    /**
     * Reads a little-endian 32-bit signed integer at the cursor and advances
     * the cursor by four bytes.
     *
     * @param data raw header bytes
     * @return decoded value
     * @throws TikaException if {@code data} is null or fewer than four bytes
     *                       remain
     */
    private int unmarshalInt32(byte[] data) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        if (4 > this.getDataRemained()) {
            throw new TikaException("4 > dataLenght");
        }
        final int offset = this.getCurrentPlace();
        final int value = (data[offset] & 0xff)
                | (data[offset + 1] & 0xff) << 8
                | (data[offset + 2] & 0xff) << 16
                | (data[offset + 3] & 0xff) << 24;
        this.setCurrentPlace(offset + 4);
        this.setDataRemained(this.getDataRemained() - 4);
        return value;
    }

    /**
     * Reads a little-endian 32-bit value at the cursor into a {@code long}
     * and advances the cursor by four bytes.
     * <p>
     * NOTE: despite the name, the four bytes are combined as an {@code int}
     * and then widened, so a stored 0xFFFFFFFF is returned as {@code -1}
     * rather than 4294967295. This has always been the behaviour of this
     * class and is kept so that values already observed by callers do not
     * change.
     *
     * @param data raw header bytes
     * @return decoded, sign-extended value
     * @throws TikaException if {@code data} is null or fewer than four bytes
     *                       remain
     */
    private long unmarshalUInt32(byte[] data) throws TikaException {
        return unmarshalInt32(data);
    }

    /**
     * Copies {@code count} raw bytes starting at the cursor into
     * {@code dest} and advances the cursor accordingly.
     *
     * @param data  raw header bytes
     * @param dest  target array; a new array is allocated when it is
     *              {@code null} or shorter than {@code count}
     * @param count number of bytes to copy
     * @return the array holding the copied bytes
     * @throws TikaException if {@code data} is null or fewer than
     *                       {@code count} bytes remain
     */
    private byte[] unmarshalUuid(byte[] data, byte[] dest, int count) throws TikaException {
        ChmAssert.assertByteArrayNotNull(data);
        if (count < 0 || count > this.getDataRemained()) {
            throw new TikaException(count + " > dataLenght");
        }
        byte[] target = dest;
        if (target == null || target.length < count) {
            target = new byte[count];
        }
        System.arraycopy(data, this.getCurrentPlace(), target, 0, count);
        this.setCurrentPlace(this.getCurrentPlace() + count);
        this.setDataRemained(this.getDataRemained() - count);
        return target;
    }

    /**
     * Returns how many bytes remained
     *
     * @return number of bytes not yet consumed by the current parse
     */
    private int getDataRemained() {
        return dataRemained;
    }

    /**
     * Sets how many bytes remained
     *
     * @param dataRemained number of bytes not yet consumed
     */
    private void setDataRemained(int dataRemained) {
        this.dataRemained = dataRemained;
    }

    /**
     * Returns the read cursor
     *
     * @return offset of the next byte to read
     */
    private int getCurrentPlace() {
        return currentPlace;
    }

    /**
     * Sets the read cursor
     *
     * @param currentPlace offset of the next byte to read
     */
    private void setCurrentPlace(int currentPlace) {
        this.currentPlace = currentPlace;
    }

    /**
     * Returns a signature of the header
     *
     * @return itsp signature (the internal array, not a copy)
     */
    public byte[] getSignature() {
        return signature;
    }

    /**
     * Sets itsp signature
     *
     * @param signature signature bytes
     */
    protected void setSignature(byte[] signature) {
        this.signature = signature;
    }

    /**
     * Returns version of itsp header
     *
     * @return version
     */
    public int getVersion() {
        return version;
    }

    /**
     * Sets a version of itsp header
     *
     * @param version header version
     */
    protected void setVersion(int version) {
        this.version = version;
    }

    /**
     * Returns header length
     *
     * @return header length
     */
    public int getHeader_len() {
        return header_len;
    }

    /**
     * Sets itsp header length
     *
     * @param header_len header length
     */
    protected void setHeader_len(int header_len) {
        this.header_len = header_len;
    }

    /**
     * Returns the value stored at offset 000c, whose purpose is unknown
     *
     * @return unknown_000c
     */
    public int getUnknown_000c() {
        return unknown_000c;
    }

    /**
     * Sets the value stored at offset 000c. "Unknown" here means that the
     * people who reverse engineered the chm format do not know what the
     * field is used for.
     *
     * @param unknown_000c value at offset 000c
     */
    protected void setUnknown_000c(int unknown_000c) {
        this.unknown_000c = unknown_000c;
    }

    /**
     * Returns block's length
     *
     * @return block_length
     */
    public long getBlock_len() {
        return block_len;
    }

    /**
     * Sets block length
     *
     * @param block_len block length
     */
    protected void setBlock_len(long block_len) {
        this.block_len = block_len;
    }

    /**
     * Returns block index interval
     *
     * @return blockidx_intvl
     */
    public int getBlockidx_intvl() {
        return blockidx_intvl;
    }

    /**
     * Sets block index interval
     *
     * @param blockidx_intvl block index interval
     */
    protected void setBlockidx_intvl(int blockidx_intvl) {
        this.blockidx_intvl = blockidx_intvl;
    }

    /**
     * Returns an index depth
     *
     * @return index_depth
     */
    public int getIndex_depth() {
        return index_depth;
    }

    /**
     * Sets an index depth
     *
     * @param index_depth index depth
     */
    protected void setIndex_depth(int index_depth) {
        this.index_depth = index_depth;
    }

    /**
     * Returns index root
     *
     * @return index_root
     */
    public int getIndex_root() {
        return index_root;
    }

    /**
     * Sets an index root
     *
     * @param index_root index root
     */
    protected void setIndex_root(int index_root) {
        this.index_root = index_root;
    }

    /**
     * Returns an index head
     *
     * @return index_head
     */
    public int getIndex_head() {
        return index_head;
    }

    /**
     * Sets an index head
     *
     * @param index_head index head
     */
    protected void setIndex_head(int index_head) {
        this.index_head = index_head;
    }

    /**
     * Returns the value stored at offset 0024
     *
     * @return unknown_0024
     */
    public int getUnknown_0024() {
        return unknown_0024;
    }

    /**
     * Sets the value stored at offset 0024
     *
     * @param unknown_0024 value at offset 0024
     */
    protected void setUnknown_0024(int unknown_0024) {
        this.unknown_0024 = unknown_0024;
    }

    /**
     * Returns number of blocks
     *
     * @return num_blocks
     */
    public long getNum_blocks() {
        return num_blocks;
    }

    /**
     * Sets number of blocks containing in the chm file
     *
     * @param num_blocks number of blocks
     */
    protected void setNum_blocks(long num_blocks) {
        this.num_blocks = num_blocks;
    }

    /**
     * Returns the value stored at offset 002c
     *
     * @return unknown_002c
     */
    public int getUnknown_002c() {
        return unknown_002c;
    }

    /**
     * Sets the value stored at offset 002c
     *
     * @param unknown_002c value at offset 002c
     */
    protected void setUnknown_002c(int unknown_002c) {
        this.unknown_002c = unknown_002c;
    }

    /**
     * Returns language id
     *
     * @return lang_id
     */
    public long getLang_id() {
        return lang_id;
    }

    /**
     * Sets language id
     *
     * @param lang_id language id
     */
    protected void setLang_id(long lang_id) {
        this.lang_id = lang_id;
    }

    /**
     * Returns system uuid
     *
     * @return system_uuid (the internal array, not a copy)
     */
    public byte[] getSystem_uuid() {
        return system_uuid;
    }

    /**
     * Sets system uuid
     *
     * @param system_uuid system uuid bytes
     */
    protected void setSystem_uuid(byte[] system_uuid) {
        this.system_uuid = system_uuid;
    }

    /**
     * Returns the bytes stored at offset 0044
     *
     * @return unknown_0044 (the internal array, not a copy)
     */
    public byte[] getUnknown_0044() {
        return unknown_0044;
    }

    /**
     * Sets the bytes stored at offset 0044
     *
     * @param unknown_0044 bytes at offset 0044
     */
    protected void setUnknown_0044(byte[] unknown_0044) {
        this.unknown_0044 = unknown_0044;
    }

    /**
     * Decodes an ITSP directory header from {@code data} into
     * {@code chmItspHeader} and validates signature, version and header
     * length. All decoded state, including the read cursor, lives on
     * {@code chmItspHeader}; the receiver itself is not modified unless it is
     * the same object.
     *
     * @param data          raw header bytes; must be exactly
     *                      {@code ChmConstants.CHM_ITSP_V1_LEN} bytes long
     * @param chmItspHeader header instance to populate
     * @throws TikaException if an argument is null, the buffer has the wrong
     *                       size, or the decoded header fails validation
     */
    public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException {
        // Fail with a parsing error rather than a NullPointerException.
        ChmAssert.assertByteArrayNotNull(data);
        ChmAssert.assertChmAccessorNotNull(chmItspHeader);

        /* only the fixed-size version 1 structure is supported */
        if (data.length != ChmConstants.CHM_ITSP_V1_LEN) {
            throw new ChmParsingException(
                    "ITSP header must be " + ChmConstants.CHM_ITSP_V1_LEN
                            + " bytes long, but got " + data.length);
        }

        /* unmarshal fields in on-disk order; every call advances the cursor */
        chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN);
        chmItspHeader.setVersion(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setHeader_len(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setUnknown_000c(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setBlock_len(chmItspHeader.unmarshalUInt32(data));
        chmItspHeader.setBlockidx_intvl(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setIndex_depth(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setIndex_root(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setIndex_head(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setUnknown_0024(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setNum_blocks(chmItspHeader.unmarshalUInt32(data));
        chmItspHeader.setUnknown_002c(chmItspHeader.unmarshalInt32(data));
        chmItspHeader.setLang_id(chmItspHeader.unmarshalUInt32(data));
        chmItspHeader.setSystem_uuid(chmItspHeader.unmarshalUuid(
                data, chmItspHeader.getSystem_uuid(), ChmConstants.BYTE_ARRAY_LENGHT));
        chmItspHeader.setUnknown_0044(chmItspHeader.unmarshalUuid(
                data, chmItspHeader.getUnknown_0044(), ChmConstants.BYTE_ARRAY_LENGHT));

        /* Checks validity of the itsp header */
        if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP)) {
            throw new ChmParsingException("seems not valid signature");
        }
        if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1) {
            throw new ChmParsingException("!=ChmConstants.CHM_VER_1");
        }
        if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN) {
            throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN");
        }
    }
}