| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft.chm; |
| |
import static java.nio.charset.StandardCharsets.UTF_8;

import java.util.Arrays;

import org.apache.tika.exception.TikaException;
| |
/**
 * Directory header (ITSP). The directory starts with a header; its format is
 * as follows:
 *
 * <pre>
 * 0000: char[4] 'ITSP'
 * 0004: DWORD   Version number 1
 * 0008: DWORD   Length of the directory header
 * 000C: DWORD   $0a (unknown)
 * 0010: DWORD   $1000 Directory chunk size
 * 0014: DWORD   "Density" of quickref section, usually 2
 * 0018: DWORD   Depth of the index tree: 1 if there is no index, 2 if there
 *               is one level of PMGI chunks
 * 001C: DWORD   Chunk number of root index chunk, -1 if there is none (though
 *               at least one file has 0 despite there being no index chunk,
 *               probably a bug)
 * 0020: DWORD   Chunk number of first PMGL (listing) chunk
 * 0024: DWORD   Chunk number of last PMGL (listing) chunk
 * 0028: DWORD   -1 (unknown)
 * 002C: DWORD   Number of directory chunks (total)
 * 0030: DWORD   Windows language ID
 * 0034: GUID    {5D02926A-212E-11D0-9DF9-00A0C922E6EC}
 * 0044: DWORD   $54 (This is the length again)
 * 0048: DWORD   -1 (unknown)
 * 004C: DWORD   -1 (unknown)
 * 0050: DWORD   -1 (unknown)
 * </pre>
 */
| public class ChmItspHeader implements ChmAccessor<ChmItspHeader> { |
| // TODO: refactor all unmarshals |
| private static final long serialVersionUID = 1962394421998181341L; |
| private byte[] signature; |
| private int version; /* 4 */ |
| private int header_len; /* 8 */ |
| private int unknown_000c; /* c */ |
| private long block_len; /* 10 */ |
| private int blockidx_intvl; /* 14 */ |
| private int index_depth; /* 18 */ |
| private int index_root; /* 1c */ |
| private int index_head; /* 20 */ |
| private int unknown_0024; /* 24 */ |
| private long num_blocks; /* 28 */ |
| private int unknown_002c; /* 2c */ |
| private long lang_id; /* 30 */ |
| private byte[] system_uuid = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 34 */ |
| private byte[] unknown_0044 = new byte[ChmConstants.BYTE_ARRAY_LENGHT]; /* 44 */ |
| |
| /* local usage */ |
| private int dataRemained; |
| private int currentPlace = 0; |
| |
| public ChmItspHeader() { |
| signature = ChmConstants.ITSP.getBytes(UTF_8); /* |
| * 0 |
| * (ITSP |
| * ) |
| */ |
| } |
| |
| public String toString() { |
| StringBuilder sb = new StringBuilder(); |
| sb.append("[ signature:=") |
| .append(new String(getSignature(), UTF_8)) |
| .append(System.getProperty("line.separator")); |
| sb.append("version:=\t") |
| .append(getVersion()) |
| .append(System.getProperty("line.separator")); |
| sb.append("header_len:=\t") |
| .append(getHeader_len()) |
| .append(System.getProperty("line.separator")); |
| sb.append("unknown_00c:=\t") |
| .append(getUnknown_000c()) |
| .append(System.getProperty("line.separator")); |
| sb.append("block_len:=\t") |
| .append(getBlock_len()) |
| .append(" [directory chunk size]") |
| .append(System.getProperty("line.separator")); |
| sb.append("blockidx_intvl:=") |
| .append(getBlockidx_intvl()) |
| .append(", density of quickref section, usually 2") |
| .append(System.getProperty("line.separator")); |
| sb.append("index_depth:=\t") |
| .append(getIndex_depth()) |
| .append(", depth of the index tree - 1 there is no index, 2 if there is one level of PMGI") |
| .append(" chunk") |
| .append(System.getProperty("line.separator")); |
| sb.append("index_root:=\t") |
| .append(getIndex_root()) |
| .append(", chunk number of root index chunk, -1 if there is none") |
| .append(System.getProperty("line.separator")); |
| sb.append("index_head:=\t") |
| .append(getIndex_head()) |
| .append(", chunk number of first PMGL (listing) chunk") |
| .append(System.getProperty("line.separator")); |
| sb.append("unknown_0024:=\t") |
| .append(getUnknown_0024()) |
| .append(", chunk number of last PMGL (listing) chunk") |
| .append(System.getProperty("line.separator")); |
| sb.append("num_blocks:=\t") |
| .append(getNum_blocks()) |
| .append(", -1 (unknown)") |
| .append(System.getProperty("line.separator")); |
| sb.append("unknown_002c:=\t") |
| .append(getUnknown_002c()).append(", number of directory chunks (total)") |
| .append(System.getProperty("line.separator")); |
| sb.append("lang_id:=\t") |
| .append(getLang_id()) |
| .append(" - ") |
| .append(ChmCommons.getLanguage(getLang_id())) |
| .append(System.getProperty("line.separator")); |
| sb.append("system_uuid:=") |
| .append(getSystem_uuid()) |
| .append(System.getProperty("line.separator")); |
| sb.append("unknown_0044:=") |
| .append(getUnknown_0044()) |
| .append(" ]"); |
| return sb.toString(); |
| } |
| |
| /** |
| * Copies 4 bits from data[] |
| * |
| * @param data |
| * @param chmItspHeader |
| * @param count |
| * @throws TikaException |
| */ |
| private void unmarshalCharArray(byte[] data, ChmItspHeader chmItspHeader, int count) |
| throws TikaException { |
| ChmAssert.assertByteArrayNotNull(data); |
| ChmAssert.assertChmAccessorNotNull(chmItspHeader); |
| this.setDataRemained(data.length); |
| System.arraycopy(data, 0, chmItspHeader.signature, 0, count); |
| this.setCurrentPlace(this.getCurrentPlace() + count); |
| this.setDataRemained(this.getDataRemained() - count); |
| } |
| |
| private int unmarshalInt32(byte[] data, int dataLenght, int dest) throws TikaException { |
| ChmAssert.assertByteArrayNotNull(data); |
| if (4 > this.getDataRemained()) { |
| throw new TikaException("4 > dataLenght"); |
| } |
| dest = (data[this.getCurrentPlace()] & 0xff) | |
| (data[this.getCurrentPlace() + 1] & 0xff) << 8 | |
| (data[this.getCurrentPlace() + 2] & 0xff) << 16 | |
| (data[this.getCurrentPlace() + 3] & 0xff) << 24; |
| |
| this.setCurrentPlace(this.getCurrentPlace() + 4); |
| this.setDataRemained(this.getDataRemained() - 4); |
| return dest; |
| } |
| |
| private long unmarshalUInt32(byte[] data, int dataLenght, long dest) throws TikaException { |
| ChmAssert.assertByteArrayNotNull(data); |
| if (4 > dataLenght) { |
| throw new TikaException("4 > dataLenght"); |
| } |
| dest = (data[this.getCurrentPlace()] & 0xff) | |
| (data[this.getCurrentPlace() + 1] & 0xff) << 8 | |
| (data[this.getCurrentPlace() + 2] & 0xff) << 16 | |
| (data[this.getCurrentPlace() + 3] & 0xff) << 24; |
| |
| setDataRemained(this.getDataRemained() - 4); |
| this.setCurrentPlace(this.getCurrentPlace() + 4); |
| return dest; |
| } |
| |
| private byte[] unmarshalUuid(byte[] data, int dataLenght, byte[] dest, int count) { |
| System.arraycopy(data, this.getCurrentPlace(), dest, 0, count); |
| this.setCurrentPlace(this.getCurrentPlace() + count); |
| this.setDataRemained(this.getDataRemained() - count); |
| return dest; |
| } |
| |
| /** |
| * Returns how many bytes remained |
| * |
| * @return int |
| */ |
| private int getDataRemained() { |
| return dataRemained; |
| } |
| |
| /** |
| * Sets how many bytes remained |
| * |
| * @param dataRemained |
| */ |
| private void setDataRemained(int dataRemained) { |
| this.dataRemained = dataRemained; |
| } |
| |
| /** |
| * Returns a place holder |
| * |
| * @return current place |
| */ |
| private int getCurrentPlace() { |
| return currentPlace; |
| } |
| |
| /** |
| * Sets current place |
| * |
| * @param currentPlace |
| */ |
| private void setCurrentPlace(int currentPlace) { |
| this.currentPlace = currentPlace; |
| } |
| |
| /** |
| * Returns a signature of the header |
| * |
| * @return itsp signature |
| */ |
| public byte[] getSignature() { |
| return signature; |
| } |
| |
| /** |
| * Sets itsp signature |
| * |
| * @param signature |
| */ |
| protected void setSignature(byte[] signature) { |
| this.signature = signature; |
| } |
| |
| /** |
| * Returns version of itsp header |
| * |
| * @return version |
| */ |
| public int getVersion() { |
| return version; |
| } |
| |
| /** |
| * Sets a version of itsp header |
| * |
| * @param version |
| */ |
| protected void setVersion(int version) { |
| this.version = version; |
| } |
| |
| /** |
| * Returns header length |
| * |
| * @return header length |
| */ |
| public int getHeader_len() { |
| return header_len; |
| } |
| |
| /** |
| * Sets itsp header length |
| * |
| * @param header_len |
| */ |
| protected void setHeader_len(int header_len) { |
| this.header_len = header_len; |
| } |
| |
| /** |
| * Returns 000c unknown bytes |
| */ |
| public int getUnknown_000c() { |
| return unknown_000c; |
| } |
| |
| /** |
| * Sets 000c unknown bytes Unknown means here that those guys who cracked |
| * the chm format do not know what's it purposes for |
| * |
| * @param unknown_000c |
| */ |
| protected void setUnknown_000c(int unknown_000c) { |
| this.unknown_000c = unknown_000c; |
| } |
| |
| /** |
| * Returns block's length |
| * |
| * @return block_length |
| */ |
| public long getBlock_len() { |
| return block_len; |
| } |
| |
| /** |
| * Sets block length |
| * |
| * @param block_len |
| */ |
| protected void setBlock_len(long block_len) { |
| this.block_len = block_len; |
| } |
| |
| /** |
| * Returns block index interval |
| * |
| * @return blockidx_intvl |
| */ |
| public int getBlockidx_intvl() { |
| return blockidx_intvl; |
| } |
| |
| /** |
| * Sets block index interval |
| * |
| * @param blockidx_intvl |
| */ |
| protected void setBlockidx_intvl(int blockidx_intvl) { |
| this.blockidx_intvl = blockidx_intvl; |
| } |
| |
| /** |
| * Returns an index depth |
| * |
| * @return index_depth |
| */ |
| public int getIndex_depth() { |
| return index_depth; |
| } |
| |
| /** |
| * Sets an index depth |
| * |
| * @param index_depth |
| */ |
| protected void setIndex_depth(int index_depth) { |
| this.index_depth = index_depth; |
| } |
| |
| /** |
| * Returns index root |
| * |
| * @return index_root |
| */ |
| public int getIndex_root() { |
| return index_root; |
| } |
| |
| /** |
| * Sets an index root |
| * |
| * @param index_root |
| */ |
| protected void setIndex_root(int index_root) { |
| this.index_root = index_root; |
| } |
| |
| /** |
| * Returns an index head |
| * |
| * @return index_head |
| */ |
| public int getIndex_head() { |
| return index_head; |
| } |
| |
| /** |
| * Sets an index head |
| * |
| * @param index_head |
| */ |
| protected void setIndex_head(int index_head) { |
| this.index_head = index_head; |
| } |
| |
| /** |
| * Returns 0024 unknown bytes |
| * |
| * @return unknown_0024 |
| */ |
| public int getUnknown_0024() { |
| return unknown_0024; |
| } |
| |
| /** |
| * Sets 0024 unknown bytes |
| * |
| * @param unknown_0024 |
| */ |
| protected void setUnknown_0024(int unknown_0024) { |
| this.unknown_0024 = unknown_0024; |
| } |
| |
| /** |
| * Returns number of blocks |
| * |
| * @return num_blocks |
| */ |
| public long getNum_blocks() { |
| return num_blocks; |
| } |
| |
| /** |
| * Sets number of blocks containing in the chm file |
| * |
| * @param num_blocks |
| */ |
| protected void setNum_blocks(long num_blocks) { |
| this.num_blocks = num_blocks; |
| } |
| |
| /** |
| * Returns 002c unknown bytes |
| * |
| * @return unknown_002c |
| */ |
| public int getUnknown_002c() { |
| return unknown_002c; |
| } |
| |
| /** |
| * Sets 002c unknown bytes |
| * |
| * @param unknown_002c |
| */ |
| protected void setUnknown_002c(int unknown_002c) { |
| this.unknown_002c = unknown_002c; |
| } |
| |
| /** |
| * Returns language id |
| * |
| * @return lang_id |
| */ |
| public long getLang_id() { |
| return lang_id; |
| } |
| |
| /** |
| * Sets language id |
| * |
| * @param lang_id |
| */ |
| protected void setLang_id(long lang_id) { |
| this.lang_id = lang_id; |
| } |
| |
| /** |
| * Returns system uuid |
| * |
| * @return system_uuid |
| */ |
| public byte[] getSystem_uuid() { |
| return system_uuid; |
| } |
| |
| /** |
| * Sets system uuid |
| * |
| * @param system_uuid |
| */ |
| protected void setSystem_uuid(byte[] system_uuid) { |
| this.system_uuid = system_uuid; |
| } |
| |
| /** |
| * Returns 0044 unknown bytes |
| * |
| * @return unknown_0044 |
| */ |
| public byte[] getUnknown_0044() { |
| return unknown_0044; |
| } |
| |
| /** |
| * Sets 0044 unknown bytes |
| * |
| * @param unknown_0044 |
| */ |
| protected void setUnknown_0044(byte[] unknown_0044) { |
| this.unknown_0044 = unknown_0044; |
| } |
| |
| // @Override |
| public void parse(byte[] data, ChmItspHeader chmItspHeader) throws TikaException { |
| /* we only know how to deal with the 0x58 and 0x60 byte structures */ |
| if (data.length != ChmConstants.CHM_ITSP_V1_LEN) { |
| throw new ChmParsingException( |
| "we only know how to deal with the 0x58 and 0x60 byte structures"); |
| } |
| |
| /* unmarshal common fields */ |
| chmItspHeader.unmarshalCharArray(data, chmItspHeader, ChmConstants.CHM_SIGNATURE_LEN); |
| // ChmCommons.unmarshalCharArray(data, chmItspHeader, |
| // ChmConstants.CHM_SIGNATURE_LEN); |
| chmItspHeader.setVersion(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), chmItspHeader.getVersion())); |
| chmItspHeader.setHeader_len(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getHeader_len())); |
| chmItspHeader.setUnknown_000c(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getUnknown_000c())); |
| chmItspHeader.setBlock_len(chmItspHeader |
| .unmarshalUInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getBlock_len())); |
| chmItspHeader.setBlockidx_intvl(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getBlockidx_intvl())); |
| chmItspHeader.setIndex_depth(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getIndex_depth())); |
| chmItspHeader.setIndex_root(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getIndex_root())); |
| chmItspHeader.setIndex_head(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getIndex_head())); |
| chmItspHeader.setUnknown_0024(chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getUnknown_0024())); |
| chmItspHeader.setNum_blocks(chmItspHeader |
| .unmarshalUInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getNum_blocks())); |
| chmItspHeader.setUnknown_002c((chmItspHeader |
| .unmarshalInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getUnknown_002c()))); |
| chmItspHeader.setLang_id(chmItspHeader |
| .unmarshalUInt32(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getLang_id())); |
| chmItspHeader.setSystem_uuid(chmItspHeader |
| .unmarshalUuid(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getSystem_uuid(), ChmConstants.BYTE_ARRAY_LENGHT)); |
| chmItspHeader.setUnknown_0044(chmItspHeader |
| .unmarshalUuid(data, chmItspHeader.getDataRemained(), |
| chmItspHeader.getUnknown_0044(), ChmConstants.BYTE_ARRAY_LENGHT)); |
| |
| /* Checks validity of the itsp header */ |
| if (!new String(chmItspHeader.getSignature(), UTF_8).equals(ChmConstants.ITSP)) { |
| throw new ChmParsingException("seems not valid signature"); |
| } |
| |
| if (chmItspHeader.getVersion() != ChmConstants.CHM_VER_1) { |
| throw new ChmParsingException("!=ChmConstants.CHM_VER_1"); |
| } |
| |
| if (chmItspHeader.getHeader_len() != ChmConstants.CHM_ITSP_V1_LEN) { |
| throw new ChmParsingException("!= ChmConstants.CHM_ITSP_V1_LEN"); |
| } |
| } |
| } |