| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft.chm; |
| |
| import static java.nio.charset.StandardCharsets.UTF_8; |
| |
| import java.io.ByteArrayOutputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.util.ArrayList; |
| import java.util.List; |
| |
| import org.apache.commons.io.IOUtils; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.tika.exception.TikaException; |
| import org.apache.tika.parser.microsoft.chm.ChmCommons.EntryType; |
| |
| /** |
| * Extracts text from chm file. Enumerates chm entries. |
| */ |
| public class ChmExtractor { |
| |
| private static final Logger LOG = LoggerFactory.getLogger(ChmExtractor.class); |
| |
| |
| private List<ChmLzxBlock> lzxBlocksCache = null; |
| private ChmDirectoryListingSet chmDirList = null; |
| private ChmItsfHeader chmItsfHeader = null; |
| private ChmItspHeader chmItspHeader = null; |
| private ChmLzxcResetTable chmLzxcResetTable = null; |
| private ChmLzxcControlData chmLzxcControlData = null; |
| private byte[] data = null; |
| private int indexOfContent; |
| private long lzxBlockOffset; |
| private long lzxBlockLength; |
| private ChmBlockInfo chmBlockInfo = null;//this will be instantiated at first call of |
| |
| public ChmExtractor(InputStream is) throws TikaException, IOException { |
| ChmAssert.assertInputStreamNotNull(is); |
| try { |
| setData(IOUtils.toByteArray(is)); |
| |
| /* Creates and parses chm itsf header */ |
| setChmItsfHeader(new ChmItsfHeader()); |
| // getChmItsfHeader().parse(Arrays.copyOfRange(getData(), 0, |
| // ChmConstants.CHM_ITSF_V3_LEN - 1), getChmItsfHeader()); |
| getChmItsfHeader() |
| .parse(ChmCommons.copyOfRange(getData(), 0, |
| ChmConstants.CHM_ITSF_V3_LEN - 1), |
| getChmItsfHeader()); |
| |
| /* Creates and parses chm itsp header */ |
| setChmItspHeader(new ChmItspHeader()); |
| // getChmItspHeader().parse(Arrays.copyOfRange( getData(), (int) |
| // getChmItsfHeader().getDirOffset(), |
| // (int) getChmItsfHeader().getDirOffset() + |
| // ChmConstants.CHM_ITSP_V1_LEN), getChmItspHeader()); |
| getChmItspHeader().parse(ChmCommons |
| .copyOfRange(getData(), (int) getChmItsfHeader().getDirOffset(), |
| (int) getChmItsfHeader().getDirOffset() + |
| ChmConstants.CHM_ITSP_V1_LEN), |
| getChmItspHeader()); |
| |
| /* Creates instance of ChmDirListingContainer */ |
| setChmDirList( |
| new ChmDirectoryListingSet(getData(), getChmItsfHeader(), getChmItspHeader())); |
| |
| int indexOfControlData = getChmDirList().getControlDataIndex(); |
| int indexOfResetData = |
| ChmCommons.indexOfResetTableBlock(getData(), ChmConstants.LZXC.getBytes(UTF_8)); |
| byte[] dir_chunk = null; |
| if (indexOfResetData > 0) { |
| dir_chunk = ChmCommons.copyOfRange(getData(), indexOfResetData, indexOfResetData + |
| getChmDirList().getDirectoryListingEntryList().get(indexOfControlData) |
| .getLength()); |
| } |
| // dir_chunk = Arrays.copyOfRange(getData(), indexOfResetData, |
| // indexOfResetData |
| // + |
| // getChmDirList().getDirectoryListingEntryList().get(indexOfControlData).getLength()); |
| |
| /* Creates and parses chm control data */ |
| setChmLzxcControlData(new ChmLzxcControlData()); |
| getChmLzxcControlData().parse(dir_chunk, getChmLzxcControlData()); |
| |
| int indexOfResetTable = getChmDirList().getResetTableIndex(); |
| setChmLzxcResetTable(new ChmLzxcResetTable()); |
| |
| int startIndex = (int) getChmDirList().getDataOffset() + |
| getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable) |
| .getOffset(); |
| |
| // assert startIndex < data.length |
| ChmAssert.assertCopyingDataIndex(startIndex, getData().length); |
| |
| // dir_chunk = Arrays.copyOfRange(getData(), startIndex, startIndex |
| // + |
| // getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable).getLength()); |
| dir_chunk = ChmCommons.copyOfRange(getData(), startIndex, startIndex + |
| getChmDirList().getDirectoryListingEntryList().get(indexOfResetTable) |
| .getLength()); |
| |
| getChmLzxcResetTable().parse(dir_chunk, getChmLzxcResetTable()); |
| |
| setIndexOfContent(ChmCommons |
| .indexOf(getChmDirList().getDirectoryListingEntryList(), ChmConstants.CONTENT)); |
| setLzxBlockOffset( |
| (getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()) |
| .getOffset() + getChmItsfHeader().getDataOffset())); |
| setLzxBlockLength( |
| getChmDirList().getDirectoryListingEntryList().get(getIndexOfContent()) |
| .getLength()); |
| |
| setLzxBlocksCache(new ArrayList<>()); |
| |
| } catch (IOException e) { |
| LOG.warn("IOException parsing chm file", e); |
| } |
| } |
| |
| /** |
| * Returns lzxc control data. |
| * |
| * @return ChmLzxcControlData |
| */ |
| private ChmLzxcControlData getChmLzxcControlData() { |
| return chmLzxcControlData; |
| } |
| |
| /** |
| * Sets lzxc control data |
| * |
| * @param chmLzxcControlData |
| */ |
| private void setChmLzxcControlData(ChmLzxcControlData chmLzxcControlData) { |
| this.chmLzxcControlData = chmLzxcControlData; |
| } |
| |
| private ChmItspHeader getChmItspHeader() { |
| return chmItspHeader; |
| } |
| |
| private void setChmItspHeader(ChmItspHeader chmItspHeader) { |
| this.chmItspHeader = chmItspHeader; |
| } |
| |
| /** |
| * Returns lzxc reset table |
| * |
| * @return ChmLzxcResetTable |
| */ |
| private ChmLzxcResetTable getChmLzxcResetTable() { |
| return chmLzxcResetTable; |
| } |
| |
| /** |
| * Sets lzxc reset table |
| * |
| * @param chmLzxcResetTable |
| */ |
| private void setChmLzxcResetTable(ChmLzxcResetTable chmLzxcResetTable) { |
| this.chmLzxcResetTable = chmLzxcResetTable; |
| } |
| |
| /** |
| * Returns lzxc hit_cache length |
| * |
| * @return lzxBlockLength |
| */ |
| private long getLzxBlockLength() { |
| return lzxBlockLength; |
| } |
| |
| /** |
| * Sets lzxc hit_cache length |
| * |
| * @param lzxBlockLength |
| */ |
| private void setLzxBlockLength(long lzxBlockLength) { |
| this.lzxBlockLength = lzxBlockLength; |
| } |
| |
| /** |
| * Returns lzxc hit_cache offset |
| * |
| * @return lzxBlockOffset |
| */ |
| private long getLzxBlockOffset() { |
| return lzxBlockOffset; |
| } |
| |
| /** |
| * Sets lzxc hit_cache offset |
| */ |
| private void setLzxBlockOffset(long lzxBlockOffset) { |
| this.lzxBlockOffset = lzxBlockOffset; |
| } |
| |
| private int getIndexOfContent() { |
| return indexOfContent; |
| } |
| |
| private void setIndexOfContent(int indexOfContent) { |
| this.indexOfContent = indexOfContent; |
| } |
| |
| private byte[] getData() { |
| return data; |
| } |
| |
| private void setData(byte[] data) { |
| this.data = data; |
| } |
| |
| /** |
| * Enumerates chm entities |
| * |
| * @return list of chm entities |
| */ |
| public List<String> enumerateChm() { |
| List<String> listOfEntries = new ArrayList<>(); |
| for (DirectoryListingEntry directoryListingEntry : getChmDirList() |
| .getDirectoryListingEntryList()) { |
| listOfEntries.add(directoryListingEntry.getName()); |
| } |
| return listOfEntries; |
| } |
| |
| /** |
| * Decompresses a chm entry |
| * |
| * @param directoryListingEntry |
| * @return decompressed data |
| * @throws TikaException |
| */ |
| public byte[] extractChmEntry(DirectoryListingEntry directoryListingEntry) |
| throws TikaException { |
| ByteArrayOutputStream buffer = new ByteArrayOutputStream(); |
| ChmLzxBlock lzxBlock = null; |
| try { |
| /* UNCOMPRESSED type is easiest one */ |
| if (directoryListingEntry.getEntryType() == EntryType.UNCOMPRESSED && |
| directoryListingEntry.getLength() > 0 && |
| !ChmCommons.hasSkip(directoryListingEntry)) { |
| int dataOffset = (int) (getChmItsfHeader().getDataOffset() + |
| directoryListingEntry.getOffset()); |
| // dataSegment = Arrays.copyOfRange(getData(), dataOffset, |
| // dataOffset + directoryListingEntry.getLength()); |
| buffer.write(ChmCommons.copyOfRange(getData(), dataOffset, |
| dataOffset + directoryListingEntry.getLength())); |
| } else if (directoryListingEntry.getEntryType() == EntryType.COMPRESSED && |
| !ChmCommons.hasSkip(directoryListingEntry)) { |
| /* Gets a chm hit_cache info */ |
| chmBlockInfo = ChmBlockInfo.getChmBlockInfoInstance(directoryListingEntry, |
| (int) getChmLzxcResetTable().getBlockLen(), getChmLzxcControlData(), |
| chmBlockInfo); |
| |
| int i = 0, start = 0, hit_cache = 0; |
| |
| if ((getLzxBlockLength() < Integer.MAX_VALUE) && |
| (getLzxBlockOffset() < Integer.MAX_VALUE)) { |
| // TODO: Improve the caching |
| // caching ... = O(n^2) - depends on startBlock and endBlock |
| start = -1; |
| if (!getLzxBlocksCache().isEmpty()) { |
| for (i = 0; i < getLzxBlocksCache().size(); i++) { |
| //lzxBlock = getLzxBlocksCache().get(i); |
| int bn = getLzxBlocksCache().get(i).getBlockNumber(); |
| for (int j = chmBlockInfo.getIniBlock(); |
| j <= chmBlockInfo.getStartBlock(); j++) { |
| if (bn == j) { |
| if (j > start) { |
| start = j; |
| hit_cache = i; |
| } |
| } |
| } |
| if (start == chmBlockInfo.getStartBlock()) { |
| break; |
| } |
| } |
| } |
| |
| // if (i == getLzxBlocksCache().size() && i == 0) { |
| if (start < 0) { |
| start = chmBlockInfo.getIniBlock(); |
| |
| byte[] dataSegment = ChmCommons |
| .getChmBlockSegment(getData(), getChmLzxcResetTable(), start, |
| (int) getLzxBlockOffset(), (int) getLzxBlockLength()); |
| |
| lzxBlock = new ChmLzxBlock(start, dataSegment, |
| getChmLzxcResetTable().getBlockLen(), null); |
| |
| getLzxBlocksCache().add(lzxBlock); |
| } else { |
| lzxBlock = getLzxBlocksCache().get(hit_cache); |
| } |
| |
| for (i = start; i <= chmBlockInfo.getEndBlock(); ) { |
| if (i == chmBlockInfo.getStartBlock() && i == chmBlockInfo.getEndBlock()) { |
| buffer.write(lzxBlock.getContent(chmBlockInfo.getStartOffset(), |
| chmBlockInfo.getEndOffset())); |
| break; |
| } |
| |
| if (i == chmBlockInfo.getStartBlock()) { |
| buffer.write(lzxBlock.getContent(chmBlockInfo.getStartOffset())); |
| } |
| |
| if (i > chmBlockInfo.getStartBlock() && i < chmBlockInfo.getEndBlock()) { |
| buffer.write(lzxBlock.getContent()); |
| } |
| |
| if (i == chmBlockInfo.getEndBlock()) { |
| buffer.write(lzxBlock.getContent(0, chmBlockInfo.getEndOffset())); |
| break; |
| } |
| |
| i++; |
| |
| if (i % getChmLzxcControlData().getResetInterval() == 0) { |
| lzxBlock = new ChmLzxBlock(i, ChmCommons |
| .getChmBlockSegment(getData(), getChmLzxcResetTable(), i, |
| (int) getLzxBlockOffset(), (int) getLzxBlockLength()), |
| getChmLzxcResetTable().getBlockLen(), null); |
| } else { |
| lzxBlock = new ChmLzxBlock(i, ChmCommons |
| .getChmBlockSegment(getData(), getChmLzxcResetTable(), i, |
| (int) getLzxBlockOffset(), (int) getLzxBlockLength()), |
| getChmLzxcResetTable().getBlockLen(), lzxBlock); |
| } |
| |
| getLzxBlocksCache().add(lzxBlock); |
| } |
| |
| if (getLzxBlocksCache().size() > getChmLzxcResetTable().getBlockCount()) { |
| getLzxBlocksCache().clear(); |
| } |
| } //end of if |
| |
| if (buffer.size() != directoryListingEntry.getLength()) { |
| throw new TikaException("CHM file extract error: extracted Length is wrong."); |
| } |
| } //end of if compressed |
| } catch (Exception e) { |
| throw new TikaException(e.getMessage()); |
| } |
| |
| return buffer.toByteArray(); |
| } |
| |
| private List<ChmLzxBlock> getLzxBlocksCache() { |
| return lzxBlocksCache; |
| } |
| |
| private void setLzxBlocksCache(List<ChmLzxBlock> lzxBlocksCache) { |
| this.lzxBlocksCache = lzxBlocksCache; |
| } |
| |
| public ChmDirectoryListingSet getChmDirList() { |
| return chmDirList; |
| } |
| |
| private void setChmDirList(ChmDirectoryListingSet chmDirList) { |
| this.chmDirList = chmDirList; |
| } |
| |
| private ChmItsfHeader getChmItsfHeader() { |
| return chmItsfHeader; |
| } |
| |
| private void setChmItsfHeader(ChmItsfHeader chmItsfHeader) { |
| this.chmItsfHeader = chmItsfHeader; |
| } |
| } |