trunk/src/scratchpad/src/org/apache/poi/hdgf/HDGFLZWCompressor.java - poi - Git at Google

 /* ====================================================================
    Licensed to the Apache Software Foundation (ASF) under one or more
    contributor license agreements.  See the NOTICE file distributed with
    this work for additional information regarding copyright ownership.
    The ASF licenses this file to You under the Apache License, Version 2.0
    (the "License"); you may not use this file except in compliance with
    the License.  You may obtain a copy of the License at

        http://www.apache.org/licenses/LICENSE-2.0

    Unless required by applicable law or agreed to in writing, software
    distributed under the License is distributed on an "AS IS" BASIS,
    WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
    See the License for the specific language governing permissions and
    limitations under the License.
 ==================================================================== */

 package org.apache.poi.hdgf;

 import java.io.IOException;
 import java.io.InputStream;
 import java.io.OutputStream;

 /**
  * Helper class to handle the Visio compatible
  *  streaming LZW compression.
  * Need our own class to handle keeping track of the
  *  code buffer, pending bytes to write out etc.
  *
  * TODO Fix this, as it starts to go wrong on
  *  large streams
  */
 /* package */ final class HDGFLZWCompressor {
 	// We use 12 bit codes:
 	// * 0-255 are real bytes
 	// * 256-4095 are the substring codes
 	// Java handily initialises our buffer / dictionary
 	//  to all zeros
 	private byte[] dict = new byte[4096];

 	// The next block of data to be written out, minus
 	//  its mask byte
 	private byte[] buffer = new byte[16];
 	// And how long it is
 	// (Un-compressed codes are 1 byte each, compressed codes
 	//   are two)
 	private int bufferLen = 0;

 	// The raw length of a code is limited to 4 bits + 2
 	private byte[] rawCode = new byte[18];
 	// And how much we're using
 	private int rawCodeLen = 0;

 	// How far through the input and output streams we are
 	private int posInp = 0;
 	private int posOut = 0;

 	// What the next mask byte to output will be
 	private int nextMask = 0;
 	// And how many bits we've already set
 	private int maskBitsSet = 0;

 	public HDGFLZWCompressor() {}

 /**
  * Returns the last place that the bytes from rawCode are found
  *  at in the buffer, or -1 if they can't be found
  */
 private int findRawCodeInBuffer() {
 	// Work our way through all the codes until we
    //  find the right one. Visio starts from the end
 	for(int i=4096-rawCodeLen; i>0; i--) {
 		boolean matches = true;
 		for(int j=0; matches && j<rawCodeLen; j++) {
 			if(dict[i+j] == rawCode[j]) {
 				// Fits
 			} else {
 				// Doesn't fit, can't be a match
 				matches = false;
 			}
 		}

 		// Was this position a match?
 		if(matches) {
 			return i;
 		}
 	}

 	// Not found
 	return -1;
 }

 /**
  * Output the compressed representation for the bytes
  *  found in rawCode
  */
 private void outputCompressed(OutputStream res) throws IOException {
 	// It's not worth compressing only 1 or two bytes,
 	//  due to the overheads
 	// So if asked, just output uncompressed
 	if(rawCodeLen < 3) {
 		for(int i=0; i<rawCodeLen; i++) {
 			outputUncompressed(rawCode[i], res);
 		}
 		return;
 	}

 	// Grab where the data lives
 	int codesAt = findRawCodeInBuffer();
    codesAt -= 18;
 	if(codesAt < 0) {
 	   codesAt += 4096;
 	}

 	// Increment the mask bit count, we've done another code
 	maskBitsSet++;

 	// Add the length+code to the buffer
 	// (The position is the first 12 bits, the
 	//  length is the last 4 bits)
 	int bp1 = (codesAt & 255);
 	int bp2 = (rawCodeLen-3) + ((codesAt-bp1) >> 4);
 	buffer[bufferLen] = HDGFLZW.fromInt(bp1);
 	bufferLen++;
    buffer[bufferLen] = HDGFLZW.fromInt(bp2);
    bufferLen++;

    // Copy the data to the dictionary in the new place
    for(int i=0; i<rawCodeLen; i++) {
       dict[(posOut&4095)] = rawCode[i];
       posOut++;
    }

 	// If we're now at 8 codes, output
 	if(maskBitsSet == 8) {
 		output8Codes(res);
 	}
 }
 /**
  * Output the un-compressed byte
  */
 private void outputUncompressed(byte b, OutputStream res) throws IOException {
 	// Set the mask bit for us
 	nextMask += (1<<maskBitsSet);
 	maskBitsSet++;

 	// And add us to the buffer + dictionary
 	buffer[bufferLen] = b;
 	bufferLen++;
 	dict[(posOut&4095)] = b;
 	posOut++;

 	// If we're now at 8 codes, output
 	if(maskBitsSet == 8) {
 		output8Codes(res);
 	}
 }

 /**
  * We've got 8 code worth to write out, so
  *  output along with the header
  */
 private void output8Codes(OutputStream res) throws IOException {
 	// Output the mask and the data
 	res.write(new byte[] { HDGFLZW.fromInt(nextMask) } );
 	res.write(buffer, 0, bufferLen);

 	// Reset things
 	nextMask = 0;
 	maskBitsSet = 0;
 	bufferLen = 0;
 }

 /**
  * Does the compression
  */
 public void compress(InputStream src, OutputStream res) throws IOException {
 	// Have we hit the end of the file yet?
 	boolean going = true;

 	// This is a byte as looked up in the dictionary
 	// It needs to be signed, as it'll get passed on to
 	//  the output stream
 	byte dataB;
 	// This is an unsigned byte read from the stream
 	// It needs to be unsigned, so that bit stuff works
 	int dataI;

 	while( going ) {
 		dataI = src.read();
 		posInp++;
 		if(dataI == -1) { going = false; }
 		dataB = HDGFLZW.fromInt(dataI);

 		// If we've run out of data, output anything that's
 		//  pending then finish
 		if(!going) {
 		   if(rawCodeLen > 0) {
 	         outputCompressed(res);
 	         if(maskBitsSet > 0) {
 	            output8Codes(res);
 	         }
 		   }
 			break;
 		}

 		// Try adding this new byte onto rawCode, and
 		//  see if all of that is still found in the
 		//  buffer dictionary or not
 		rawCode[rawCodeLen] = dataB;
 		rawCodeLen++;
 		int rawAt = findRawCodeInBuffer();

 		// If we found it and are now at 18 bytes,
 		//  we need to output our pending code block
 		if(rawCodeLen == 18 && rawAt > -1) {
 			outputCompressed(res);
 			rawCodeLen = 0;
 			continue;
 		}

 		// If we did find all of rawCode with our new
 		//  byte added on, we can wait to see what happens
 		//  with the next byte
 		if(rawAt > -1) {
 			continue;
 		}

 		// If we get here, then the rawCode + this byte weren't
 		// found in the dictionary

 		// If there was something in rawCode before, then that was
 		// found in the dictionary, so output that compressed
 		rawCodeLen--;
 		if(rawCodeLen > 0) {
 			// Output the old rawCode
 			outputCompressed(res);

 			// Can this byte start a new rawCode, or does
 			//  it need outputting itself?
 			rawCode[0] = dataB;
 			rawCodeLen = 1;
 			if(findRawCodeInBuffer() > -1) {
 				// Fits in, wait for next byte
 				continue;
 			}
 			// Doesn't fit, output
 			outputUncompressed(dataB,res);
 			rawCodeLen = 0;
 		} else {
 			// Nothing in rawCode before, so this byte
 			//  isn't in the buffer dictionary
 			// Output it un-compressed
 			outputUncompressed(dataB,res);
 		}
 	}
 }
 }
	/* ====================================================================
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==================================================================== */

	package org.apache.poi.hdgf;

	import java.io.IOException;
	import java.io.InputStream;
	import java.io.OutputStream;

	/**
	* Helper class to handle the Visio compatible
	* streaming LZW compression.
	* Need our own class to handle keeping track of the
	* code buffer, pending bytes to write out etc.
	*
	* TODO Fix this, as it starts to go wrong on
	* large streams
	*/
	/* package */ final class HDGFLZWCompressor {
	// We use 12 bit codes:
	// * 0-255 are real bytes
	// * 256-4095 are the substring codes
	// Java handily initialises our buffer / dictionary
	// to all zeros
	private byte[] dict = new byte[4096];

	// The next block of data to be written out, minus
	// its mask byte
	private byte[] buffer = new byte[16];
	// And how long it is
	// (Un-compressed codes are 1 byte each, compressed codes
	// are two)
	private int bufferLen = 0;

	// The raw length of a code is limited to 4 bits + 2
	private byte[] rawCode = new byte[18];
	// And how much we're using
	private int rawCodeLen = 0;

	// How far through the input and output streams we are
	private int posInp = 0;
	private int posOut = 0;

	// What the next mask byte to output will be
	private int nextMask = 0;
	// And how many bits we've already set
	private int maskBitsSet = 0;

	public HDGFLZWCompressor() {}

	/**
	* Returns the last place that the bytes from rawCode are found
	* at in the buffer, or -1 if they can't be found
	*/
	private int findRawCodeInBuffer() {
	// Work our way through all the codes until we
	// find the right one. Visio starts from the end
	for(int i=4096-rawCodeLen; i>0; i--) {
	boolean matches = true;
	for(int j=0; matches && j<rawCodeLen; j++) {
	if(dict[i+j] == rawCode[j]) {
	// Fits
	} else {
	// Doesn't fit, can't be a match
	matches = false;
	}
	}

	// Was this position a match?
	if(matches) {
	return i;
	}
	}

	// Not found
	return -1;
	}

	/**
	* Output the compressed representation for the bytes
	* found in rawCode
	*/
	private void outputCompressed(OutputStream res) throws IOException {
	// It's not worth compressing only 1 or two bytes,
	// due to the overheads
	// So if asked, just output uncompressed
	if(rawCodeLen < 3) {
	for(int i=0; i<rawCodeLen; i++) {
	outputUncompressed(rawCode[i], res);
	}
	return;
	}

	// Grab where the data lives
	int codesAt = findRawCodeInBuffer();
	codesAt -= 18;
	if(codesAt < 0) {
	codesAt += 4096;
	}

	// Increment the mask bit count, we've done another code
	maskBitsSet++;

	// Add the length+code to the buffer
	// (The position is the first 12 bits, the
	// length is the last 4 bits)
	int bp1 = (codesAt & 255);
	int bp2 = (rawCodeLen-3) + ((codesAt-bp1) >> 4);
	buffer[bufferLen] = HDGFLZW.fromInt(bp1);
	bufferLen++;
	buffer[bufferLen] = HDGFLZW.fromInt(bp2);
	bufferLen++;

	// Copy the data to the dictionary in the new place
	for(int i=0; i<rawCodeLen; i++) {
	dict[(posOut&4095)] = rawCode[i];
	posOut++;
	}

	// If we're now at 8 codes, output
	if(maskBitsSet == 8) {
	output8Codes(res);
	}
	}
	/**
	* Output the un-compressed byte
	*/
	private void outputUncompressed(byte b, OutputStream res) throws IOException {
	// Set the mask bit for us
	nextMask += (1<<maskBitsSet);
	maskBitsSet++;

	// And add us to the buffer + dictionary
	buffer[bufferLen] = b;
	bufferLen++;
	dict[(posOut&4095)] = b;
	posOut++;

	// If we're now at 8 codes, output
	if(maskBitsSet == 8) {
	output8Codes(res);
	}
	}

	/**
	* We've got 8 code worth to write out, so
	* output along with the header
	*/
	private void output8Codes(OutputStream res) throws IOException {
	// Output the mask and the data
	res.write(new byte[] { HDGFLZW.fromInt(nextMask) } );
	res.write(buffer, 0, bufferLen);

	// Reset things
	nextMask = 0;
	maskBitsSet = 0;
	bufferLen = 0;
	}

	/**
	* Does the compression
	*/
	public void compress(InputStream src, OutputStream res) throws IOException {
	// Have we hit the end of the file yet?
	boolean going = true;

	// This is a byte as looked up in the dictionary
	// It needs to be signed, as it'll get passed on to
	// the output stream
	byte dataB;
	// This is an unsigned byte read from the stream
	// It needs to be unsigned, so that bit stuff works
	int dataI;

	while( going ) {
	dataI = src.read();
	posInp++;
	if(dataI == -1) { going = false; }
	dataB = HDGFLZW.fromInt(dataI);

	// If we've run out of data, output anything that's
	// pending then finish
	if(!going) {
	if(rawCodeLen > 0) {
	outputCompressed(res);
	if(maskBitsSet > 0) {
	output8Codes(res);
	}
	}
	break;
	}

	// Try adding this new byte onto rawCode, and
	// see if all of that is still found in the
	// buffer dictionary or not
	rawCode[rawCodeLen] = dataB;
	rawCodeLen++;
	int rawAt = findRawCodeInBuffer();

	// If we found it and are now at 18 bytes,
	// we need to output our pending code block
	if(rawCodeLen == 18 && rawAt > -1) {
	outputCompressed(res);
	rawCodeLen = 0;
	continue;
	}

	// If we did find all of rawCode with our new
	// byte added on, we can wait to see what happens
	// with the next byte
	if(rawAt > -1) {
	continue;
	}

	// If we get here, then the rawCode + this byte weren't
	// found in the dictionary

	// If there was something in rawCode before, then that was
	// found in the dictionary, so output that compressed
	rawCodeLen--;
	if(rawCodeLen > 0) {
	// Output the old rawCode
	outputCompressed(res);

	// Can this byte start a new rawCode, or does
	// it need outputting itself?
	rawCode[0] = dataB;
	rawCodeLen = 1;
	if(findRawCodeInBuffer() > -1) {
	// Fits in, wait for next byte
	continue;
	}
	// Doesn't fit, output
	outputUncompressed(dataB,res);
	rawCodeLen = 0;
	} else {
	// Nothing in rawCode before, so this byte
	// isn't in the buffer dictionary
	// Output it un-compressed
	outputUncompressed(dataB,res);
	}
	}
	}
	}