src/java/org/apache/poi/util/LZWDecompresser.java - poi - Git at Google

 /* ====================================================================
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */
 package org.apache.poi.util;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;

 /**
  * This class provides common functionality for the
  * various LZW implementations in the different file
  * formats.
  * It's currently used by HDGF and HMEF.
  * <p>
  * Two good resources on LZW are:
  * http://en.wikipedia.org/wiki/LZW
  * http://marknelson.us/1989/10/01/lzw-data-compression/
  */
 public abstract class LZWDecompresser {

     /** the size of our dictionary */
     public static final int DICT_SIZE = 0x1000;
     /** the mask for calculating / wrapping dictionary offsets */
     public static final int DICT_MASK = 0xFFF;

     //arbitrarily selected; may need to increase
     private static final int MAX_RECORD_LENGTH = 1_000_000;

     /**
      * Does the mask bit mean it's compressed or uncompressed?
      */
     private final boolean maskMeansCompressed;
     /**
      * How much to append to the code length in the stream
      * to get the real code length? Normally 2 or 3
      */
     private final int codeLengthIncrease;
     /**
      * Does the 12 bits of the position get stored in
      * Little Endian or Big Endian form?
      * This controls whether a pos+length of 0x12 0x34
      * becomes a position of 0x123 or 0x312
      */
     private final boolean positionIsBigEndian;

     protected LZWDecompresser(boolean maskMeansCompressed,
                               int codeLengthIncrease, boolean positionIsBigEndian) {
         this.maskMeansCompressed = maskMeansCompressed;
         this.codeLengthIncrease = codeLengthIncrease;
         this.positionIsBigEndian = positionIsBigEndian;
     }

     /**
      * Populates the dictionary, and returns where in it
      * to begin writing new codes.
      * Generally, if the dictionary is pre-populated, then new
      * codes should be placed at the end of that block.
      * Equally, if the dictionary is left with all zeros, then
      * usually the new codes can go in at the start.
      */
     protected abstract int populateDictionary(byte[] dict);

     /**
      * Adjusts the position offset if needed when looking
      * something up in the dictionary.
      */
     protected abstract int adjustDictionaryOffset(int offset);

     /**
      * Decompresses the given input stream, returning the array of bytes
      * of the decompressed input.
      */
     public byte[] decompress(InputStream src) throws IOException {
         ByteArrayOutputStream res = new ByteArrayOutputStream();
         decompress(src, res);
         return res.toByteArray();
     }

     /**
      * Perform a streaming decompression of the input.
      * Works by:
      * 1) Reading a flag byte, the 8 bits of which tell you if the
      * following 8 codes are compressed our un-compressed
      * 2) Consider the 8 bits in turn
      * 3) If the bit is set, the next code is un-compressed, so
      * add it to the dictionary and output it
      * 4) If the bit isn't set, then read in the length and start
      * position in the dictionary, and output the bytes there
      * 5) Loop until we've done all 8 bits, then read in the next
      * flag byte
      */
     public void decompress(InputStream src, OutputStream res) throws IOException {
         // How far through the output we've got
         // (This is normally used &4095, so it nicely wraps)
         // The initial value is set when populating the dictionary
         int pos;
         // The flag byte is treated as its 8 individual
         //  bits, which tell us if the following 8 codes
         //  are compressed or un-compressed
         int flag;
         // The mask, between 1 and 255, which is used when
         //  processing each bit of the flag byte in turn
         int mask;

         // We use 12 bit codes:
         // * 0-255 are real bytes
         // * 256-4095 are the substring codes
         // Java handily initialises our buffer / dictionary
         //  to all zeros
         final byte[] buffer = new byte[DICT_SIZE];
         pos = populateDictionary(buffer);

         // These are bytes as looked up in the dictionary
         // It needs to be signed, as it'll get passed on to
         //  the output stream
         final byte[] dataB = IOUtils.safelyAllocate(16L + codeLengthIncrease, MAX_RECORD_LENGTH);
         // This is an unsigned byte read from the stream
         // It needs to be unsigned, so that bit stuff works
         int dataI;
         // The compressed code sequence is held over 2 bytes
         int dataIPt1, dataIPt2;
         // How long a code sequence is, and where in the
         //  dictionary to start at
         int len, pntr;

         while ((flag = src.read()) != -1) {
             // Compare each bit in our flag byte in turn:
             for (mask = 1; mask < 0x100; mask <<= 1) {
                 // Is this a new code (un-compressed), or
                 //  the use of existing codes (compressed)?
                 boolean isMaskSet = (flag & mask) > 0;
                 if (isMaskSet ^ maskMeansCompressed) {
                     // Retrieve the un-compressed code
                     if ((dataI = src.read()) != -1) {
                         // Save the byte into the dictionary
                         buffer[pos++ & DICT_MASK] = (byte) dataI;
                         // And output the byte
                         res.write(dataI);
                     }
                 } else {
                     // We have a compressed sequence
                     // Grab the next 16 bits of data
                     dataIPt1 = src.read();
                     dataIPt2 = src.read();
                     if (dataIPt1 == -1 || dataIPt2 == -1) break;

                     // Build up how long the code sequence is, and
                     //  what position of the code to start at
                     // (The position is the usually the first 12 bits,
                     //  and the length is usually the last 4 bits)
                     len = (dataIPt2 & 0x0F) + codeLengthIncrease;
                     if (positionIsBigEndian) {
                         pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
                     } else {
                         pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
                     }

                     // Adjust the pointer as needed
                     pntr = adjustDictionaryOffset(pntr);

                     // Loop over the codes, outputting what they correspond to
                     for (int i = 0; i < len; i++) {
                         dataB[i] = buffer[(pntr + i) & DICT_MASK];
                         buffer[(pos + i) & DICT_MASK] = dataB[i];
                     }
                     res.write(dataB, 0, len);

                     // Record how far along the stream we have moved
                     pos += len;
                 }
             }
         }
     }
 }
	/* ====================================================================
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==================================================================== */
	package org.apache.poi.util;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStream;

	/**
	* This class provides common functionality for the
	* various LZW implementations in the different file
	* formats.
	* It's currently used by HDGF and HMEF.
	* <p>
	* Two good resources on LZW are:
	* http://en.wikipedia.org/wiki/LZW
	* http://marknelson.us/1989/10/01/lzw-data-compression/
	*/
	public abstract class LZWDecompresser {

	/** the size of our dictionary */
	public static final int DICT_SIZE = 0x1000;
	/** the mask for calculating / wrapping dictionary offsets */
	public static final int DICT_MASK = 0xFFF;

	//arbitrarily selected; may need to increase
	private static final int MAX_RECORD_LENGTH = 1_000_000;

	/**
	* Does the mask bit mean it's compressed or uncompressed?
	*/
	private final boolean maskMeansCompressed;
	/**
	* How much to append to the code length in the stream
	* to get the real code length? Normally 2 or 3
	*/
	private final int codeLengthIncrease;
	/**
	* Does the 12 bits of the position get stored in
	* Little Endian or Big Endian form?
	* This controls whether a pos+length of 0x12 0x34
	* becomes a position of 0x123 or 0x312
	*/
	private final boolean positionIsBigEndian;

	protected LZWDecompresser(boolean maskMeansCompressed,
	int codeLengthIncrease, boolean positionIsBigEndian) {
	this.maskMeansCompressed = maskMeansCompressed;
	this.codeLengthIncrease = codeLengthIncrease;
	this.positionIsBigEndian = positionIsBigEndian;
	}

	/**
	* Populates the dictionary, and returns where in it
	* to begin writing new codes.
	* Generally, if the dictionary is pre-populated, then new
	* codes should be placed at the end of that block.
	* Equally, if the dictionary is left with all zeros, then
	* usually the new codes can go in at the start.
	*/
	protected abstract int populateDictionary(byte[] dict);

	/**
	* Adjusts the position offset if needed when looking
	* something up in the dictionary.
	*/
	protected abstract int adjustDictionaryOffset(int offset);

	/**
	* Decompresses the given input stream, returning the array of bytes
	* of the decompressed input.
	*/
	public byte[] decompress(InputStream src) throws IOException {
	ByteArrayOutputStream res = new ByteArrayOutputStream();
	decompress(src, res);
	return res.toByteArray();
	}

	/**
	* Perform a streaming decompression of the input.
	* Works by:
	* 1) Reading a flag byte, the 8 bits of which tell you if the
	* following 8 codes are compressed our un-compressed
	* 2) Consider the 8 bits in turn
	* 3) If the bit is set, the next code is un-compressed, so
	* add it to the dictionary and output it
	* 4) If the bit isn't set, then read in the length and start
	* position in the dictionary, and output the bytes there
	* 5) Loop until we've done all 8 bits, then read in the next
	* flag byte
	*/
	public void decompress(InputStream src, OutputStream res) throws IOException {
	// How far through the output we've got
	// (This is normally used &4095, so it nicely wraps)
	// The initial value is set when populating the dictionary
	int pos;
	// The flag byte is treated as its 8 individual
	// bits, which tell us if the following 8 codes
	// are compressed or un-compressed
	int flag;
	// The mask, between 1 and 255, which is used when
	// processing each bit of the flag byte in turn
	int mask;

	// We use 12 bit codes:
	// * 0-255 are real bytes
	// * 256-4095 are the substring codes
	// Java handily initialises our buffer / dictionary
	// to all zeros
	final byte[] buffer = new byte[DICT_SIZE];
	pos = populateDictionary(buffer);

	// These are bytes as looked up in the dictionary
	// It needs to be signed, as it'll get passed on to
	// the output stream
	final byte[] dataB = IOUtils.safelyAllocate(16L + codeLengthIncrease, MAX_RECORD_LENGTH);
	// This is an unsigned byte read from the stream
	// It needs to be unsigned, so that bit stuff works
	int dataI;
	// The compressed code sequence is held over 2 bytes
	int dataIPt1, dataIPt2;
	// How long a code sequence is, and where in the
	// dictionary to start at
	int len, pntr;

	while ((flag = src.read()) != -1) {
	// Compare each bit in our flag byte in turn:
	for (mask = 1; mask < 0x100; mask <<= 1) {
	// Is this a new code (un-compressed), or
	// the use of existing codes (compressed)?
	boolean isMaskSet = (flag & mask) > 0;
	if (isMaskSet ^ maskMeansCompressed) {
	// Retrieve the un-compressed code
	if ((dataI = src.read()) != -1) {
	// Save the byte into the dictionary
	buffer[pos++ & DICT_MASK] = (byte) dataI;
	// And output the byte
	res.write(dataI);
	}
	} else {
	// We have a compressed sequence
	// Grab the next 16 bits of data
	dataIPt1 = src.read();
	dataIPt2 = src.read();
	if (dataIPt1 == -1 \|\| dataIPt2 == -1) break;

	// Build up how long the code sequence is, and
	// what position of the code to start at
	// (The position is the usually the first 12 bits,
	// and the length is usually the last 4 bits)
	len = (dataIPt2 & 0x0F) + codeLengthIncrease;
	if (positionIsBigEndian) {
	pntr = (dataIPt1 << 4) + (dataIPt2 >>> 4);
	} else {
	pntr = dataIPt1 + ((dataIPt2 & 0xF0) << 4);
	}

	// Adjust the pointer as needed
	pntr = adjustDictionaryOffset(pntr);

	// Loop over the codes, outputting what they correspond to
	for (int i = 0; i < len; i++) {
	dataB[i] = buffer[(pntr + i) & DICT_MASK];
	buffer[(pos + i) & DICT_MASK] = dataB[i];
	}
	res.write(dataB, 0, len);

	// Record how far along the stream we have moved
	pos += len;
	}
	}
	}
	}
	}