| /* |
| * Licensed to the Apache Software Foundation (ASF) under one or more |
| * contributor license agreements. See the NOTICE file distributed with |
| * this work for additional information regarding copyright ownership. |
| * The ASF licenses this file to You under the Apache License, Version 2.0 |
| * (the "License"); you may not use this file except in compliance with |
| * the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.tika.parser.microsoft.chm; |
| |
| import java.io.FileNotFoundException; |
| import java.io.FileOutputStream; |
| import java.io.IOException; |
| import java.util.List; |
| |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| import org.apache.tika.exception.TikaException; |
| |
| public class ChmCommons { |
| |
| /** |
| * Represents lzx block types in order to decompress differently |
| */ |
| public final static int UNDEFINED = 0; |
| public final static int VERBATIM = 1; |
| public final static int ALIGNED_OFFSET = 2; |
| public final static int UNCOMPRESSED = 3; |
| private static final Logger LOG = LoggerFactory.getLogger(ChmCommons.class); |
| |
| /* Prevents initialization */ |
| private ChmCommons() { |
| } |
| |
| public static void assertByteArrayNotNull(byte[] data) throws TikaException { |
| if (data == null) { |
| throw new TikaException("byte[] is null"); |
| } |
| } |
| |
| /** |
| * LZX supports window sizes of 2^15 (32Kb) through 2^21 (2Mb) Returns X, |
| * i.e 2^X |
| * |
| * @param window chmLzxControlData.getWindowSize() |
| * @return window size |
| */ |
| public static int getWindowSize(int window) { |
| int win = 0; |
| while (window > 1) { |
| window >>>= 1; |
| win++; |
| } |
| return win; |
| } |
| |
| public static byte[] getChmBlockSegment(byte[] data, ChmLzxcResetTable resetTable, |
| int blockNumber, int lzxcBlockOffset, |
| int lzxcBlockLength) throws TikaException { |
| ChmAssert.assertChmBlockSegment(data, resetTable, blockNumber, lzxcBlockOffset, |
| lzxcBlockLength); |
| int blockLength = -1; |
| // TODO add int_max_value checking |
| if (blockNumber < (resetTable.getBlockAddress().length - 1)) { |
| blockLength = (int) (resetTable.getBlockAddress()[blockNumber + 1] - |
| resetTable.getBlockAddress()[blockNumber]); |
| } else { |
| /* new code */ |
| if (blockNumber >= resetTable.getBlockAddress().length) { |
| blockLength = 0; |
| } else |
| /* end new code */ { |
| blockLength = (int) (lzxcBlockLength - resetTable.getBlockAddress()[blockNumber]); |
| } |
| } |
| byte[] t = ChmCommons.copyOfRange(data, |
| (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber]), |
| (int) (lzxcBlockOffset + resetTable.getBlockAddress()[blockNumber] + blockLength)); |
| return (t != null) ? t : new byte[1]; |
| } |
| |
| /** |
| * Returns textual representation of LangID |
| * |
| * @param langID |
| * @return language name |
| */ |
| public static String getLanguage(long langID) { |
| /* Potential problem with casting */ |
| switch ((int) langID) { |
| case 1025: |
| return "Arabic"; |
| case 1069: |
| return "Basque"; |
| case 1027: |
| return "Catalan"; |
| case 2052: |
| return "Chinese (Simplified)"; |
| case 1028: |
| return "Chinese (Traditional)"; |
| case 1029: |
| return "Czech"; |
| case 1030: |
| return "Danish"; |
| case 1043: |
| return "Dutch"; |
| case 1033: |
| return "English (United States)"; |
| case 1035: |
| return "Finnish"; |
| case 1036: |
| return "French"; |
| case 1031: |
| return "German"; |
| case 1032: |
| return "Greek"; |
| case 1037: |
| return "Hebrew"; |
| case 1038: |
| return "Hungarian"; |
| case 1040: |
| return "Italian"; |
| case 1041: |
| return "Japanese"; |
| case 1042: |
| return "Korean"; |
| case 1044: |
| return "Norwegian"; |
| case 1045: |
| return "Polish"; |
| case 2070: |
| return "Portuguese"; |
| case 1046: |
| return "Portuguese (Brazil)"; |
| case 1049: |
| return "Russian"; |
| case 1051: |
| return "Slovakian"; |
| case 1060: |
| return "Slovenian"; |
| case 3082: |
| return "Spanish"; |
| case 1053: |
| return "Swedish"; |
| case 1055: |
| return "Turkish"; |
| default: |
| return "unknown - http://msdn.microsoft.com/en-us/library/bb165625%28VS.80%29.aspx"; |
| } |
| } |
| |
| /** |
| * Checks skippable patterns |
| * |
| * @param directoryListingEntry |
| * @return boolean |
| */ |
| public static boolean hasSkip(DirectoryListingEntry directoryListingEntry) { |
| String name = directoryListingEntry.getName(); |
| return name.startsWith("/$") || name.startsWith("/#") || name.startsWith("::"); |
| } |
| |
| /** |
| * Writes byte[][] to the file |
| * |
| * @param buffer |
| * @param fileToBeSaved file name |
| * @throws TikaException |
| */ |
| public static void writeFile(byte[][] buffer, String fileToBeSaved) throws TikaException { |
| if (buffer == null || fileToBeSaved == null || ChmCommons.isEmpty(fileToBeSaved)) { |
| return; |
| } |
| try (FileOutputStream output = new FileOutputStream(fileToBeSaved)) { |
| for (byte[] bufferEntry : buffer) { |
| output.write(bufferEntry); |
| } |
| } catch (FileNotFoundException e) { |
| throw new TikaException(e.getMessage()); |
| } catch (IOException e) { |
| LOG.warn("problem writing tmp file", e); |
| } |
| } |
| |
| /** |
| * Reverses the order of given array |
| * |
| * @param array |
| */ |
| public static void reverse(byte[] array) { |
| if (array == null) { |
| return; |
| } |
| int i = 0; |
| int j = array.length - 1; |
| byte tmp; |
| while (j > i) { |
| tmp = array[j]; |
| array[j] = array[i]; |
| array[i] = tmp; |
| j--; |
| i++; |
| } |
| } |
| |
| /** |
| * Returns an index of the reset table |
| * |
| * @param text |
| * @param pattern |
| * @return index of the reset table |
| * @throws ChmParsingException |
| */ |
| public static final int indexOfResetTableBlock(byte[] text, byte[] pattern) |
| throws ChmParsingException { |
| return (indexOf(text, pattern)) - 4; |
| } |
| |
| /** |
| * Searches some pattern in byte[] |
| * |
| * @param text byte[] |
| * @param pattern byte[] |
| * @return an index, if nothing found returns -1 |
| * @throws ChmParsingException |
| */ |
| public static int indexOf(byte[] text, byte[] pattern) throws ChmParsingException { |
| int[] next = null; |
| int i = 0, j = -1; |
| |
| /* Preprocessing */ |
| if (pattern != null && text != null) { |
| next = new int[pattern.length]; |
| next[0] = -1; |
| } else { |
| throw new ChmParsingException("pattern and/or text should not be null"); |
| } |
| |
| /* Computes a failure function */ |
| while (i < pattern.length - 1) { |
| if (j == -1 || pattern[i] == pattern[j]) { |
| i++; |
| j++; |
| if (pattern[i] != pattern[j]) { |
| next[i] = j; |
| } else { |
| next[i] = next[j]; |
| } |
| } else { |
| j = next[j]; |
| } |
| } |
| |
| /* Reinitializes local variables */ |
| i = j = 0; |
| |
| /* Matching */ |
| while (i < text.length && j < pattern.length) { |
| if (j == -1 || pattern[j] == text[i]) { |
| i++; |
| j++; |
| } else { |
| j = next[j]; |
| } |
| } |
| if (j == pattern.length) { |
| return (i - j); // match found at offset i - M |
| } else { |
| return -1; // not found |
| } |
| } |
| |
| /** |
| * Searches for some pattern in the directory listing entry list |
| * |
| * @param list |
| * @param pattern |
| * @return an index, if nothing found returns -1 |
| */ |
| public static int indexOf(List<DirectoryListingEntry> list, String pattern) { |
| int place = 0; |
| for (DirectoryListingEntry directoryListingEntry : list) { |
| if (directoryListingEntry.toString().contains(pattern)) { |
| return place; |
| } |
| ++place; |
| } |
| return -1;// not found |
| } |
| |
| /* |
| * This method is added because of supporting of Java 5 |
| */ |
| public static byte[] copyOfRange(byte[] original, int from, int to) throws TikaException { |
| checkCopyOfRangeParams(original, from, to); |
| int newLength = to - from; |
| if (newLength < 0) { |
| throw new IllegalArgumentException(from + " > " + to); |
| } |
| if (to > original.length) { |
| throw new TikaException("can't copy beyond array length"); |
| } |
| byte[] copy = new byte[newLength]; |
| System.arraycopy(original, from, copy, 0, Math.min(original.length - from, newLength)); |
| return copy; |
| } |
| |
| private static void checkCopyOfRangeParams(byte[] original, int from, int to) { |
| if (original == null) { |
| throw new NullPointerException("array is null"); |
| } |
| if (from < 0) { |
| throw new IllegalArgumentException(from + " should be > 0"); |
| } |
| if (to < 0) { |
| throw new IllegalArgumentException(to + " should be > 0"); |
| } |
| } |
| |
| /* |
| * This method is added because of supporting of Java 5 |
| */ |
| public static boolean isEmpty(String str) { |
| return str == null || str.length() == 0; |
| } |
| |
| /** |
| * Represents entry types: uncompressed, compressed |
| */ |
| public enum EntryType { |
| UNCOMPRESSED, COMPRESSED |
| } |
| |
| /** |
| * Represents lzx states: started decoding, not started decoding |
| */ |
| public enum LzxState { |
| STARTED_DECODING, NOT_STARTED_DECODING |
| } |
| |
| /** |
| * Represents intel file states during decompression |
| */ |
| public enum IntelState { |
| STARTED, NOT_STARTED |
| } |
| |
| } |