| /* ==================================================================== |
| * The Apache Software License, Version 1.1 |
| * |
| * Copyright (c) 2003 The Apache Software Foundation. All rights |
| * reserved. |
| * |
| * Redistribution and use in source and binary forms, with or without |
| * modification, are permitted provided that the following conditions |
| * are met: |
| * |
| * 1. Redistributions of source code must retain the above copyright |
| * notice, this list of conditions and the following disclaimer. |
| * |
| * 2. Redistributions in binary form must reproduce the above copyright |
| * notice, this list of conditions and the following disclaimer in |
| * the documentation and/or other materials provided with the |
| * distribution. |
| * |
| * 3. The end-user documentation included with the redistribution, |
| * if any, must include the following acknowledgment: |
| * "This product includes software developed by the |
| * Apache Software Foundation (http://www.apache.org/)." |
| * Alternately, this acknowledgment may appear in the software itself, |
| * if and wherever such third-party acknowledgments normally appear. |
| * |
| * 4. The names "Apache" and "Apache Software Foundation" and |
| * "Apache POI" must not be used to endorse or promote products |
| * derived from this software without prior written permission. For |
| * written permission, please contact apache@apache.org. |
| * |
| * 5. Products derived from this software may not be called "Apache", |
| * "Apache POI", nor may "Apache" appear in their name, without |
| * prior written permission of the Apache Software Foundation. |
| * |
| * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED |
| * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES |
| * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE |
| * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR |
| * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, |
| * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT |
| * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF |
| * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND |
| * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, |
| * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT |
| * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF |
| * SUCH DAMAGE. |
| * ==================================================================== |
| * |
| * This software consists of voluntary contributions made by many |
| * individuals on behalf of the Apache Software Foundation. For more |
| * information on the Apache Software Foundation, please see |
| * <http://www.apache.org/>. |
| */ |
| |
| package org.apache.poi.hssf.record; |
| |
| import org.apache.poi.util.BinaryTree; |
| import org.apache.poi.util.LittleEndian; |
| import org.apache.poi.util.LittleEndianConsts; |
| |
| /** |
| * Handles the task of deserializing a SST string. The two main entry points are |
| * |
| * @author Glen Stampoultzis (glens at apache.org) |
| * @author Jason Height (jheight at apache.org) |
| */ |
| class SSTDeserializer |
| { |
| |
| private BinaryTree strings; |
| /** this is the number of characters that have been read prior to the continuation */ |
| private int continuationReadChars; |
| /** this is the string we were working on before hitting the end of the current record. This string is NOT finished. */ |
| private String unfinishedString; |
| /** this is true if the string uses wide characters */ |
| private boolean wideChar; |
| /** this is true if the string is a rich text string */ |
| private boolean richText; |
| /** this is true if the string is a far east string or some other wierd string */ |
| private boolean extendedText; |
| /** Number of formatting runs in this rich text field */ |
| private short runCount; |
| /** Number of characters in current string */ |
| private int charCount; |
| private int extensionLength; |
| private int continueSkipBytes = 0; |
| |
| |
| public SSTDeserializer( BinaryTree strings ) |
| { |
| this.strings = strings; |
| initVars(); |
| } |
| |
| private void initVars() |
| { |
| runCount = 0; |
| continuationReadChars = 0; |
| unfinishedString = ""; |
| // bytesInCurrentSegment = 0; |
| // stringDataOffset = 0; |
| wideChar = false; |
| richText = false; |
| extendedText = false; |
| continueSkipBytes = 0; |
| } |
| |
| /** |
| * This is the starting point where strings are constructed. Note that |
| * strings may span across multiple continuations. Read the SST record |
| * carefully before beginning to hack. |
| */ |
| public void manufactureStrings( final byte[] data, final int initialOffset) |
| { |
| initVars(); |
| |
| int offset = initialOffset; |
| final int dataSize = data.length; |
| while ( offset < dataSize ) |
| { |
| int remaining = dataSize - offset; |
| |
| if ( ( remaining > 0 ) && ( remaining < LittleEndianConsts.SHORT_SIZE ) ) |
| { |
| throw new RecordFormatException( "Cannot get length of the last string in SSTRecord" ); |
| } |
| if ( remaining == LittleEndianConsts.SHORT_SIZE ) |
| { |
| //JMH Dont know about this |
| setContinuationCharsRead( 0 );//LittleEndian.getUShort( data, offset ) ); |
| unfinishedString = ""; |
| break; |
| } |
| charCount = LittleEndian.getUShort( data, offset ); |
| int charsRead = charCount; |
| readStringHeader( data, offset ); |
| boolean stringContinuesOverContinuation = remaining < totalStringSize(); |
| if ( stringContinuesOverContinuation ) |
| { |
| int remainingBytes = dataSize - offset - stringHeaderOverhead(); |
| //Only read the size of the string or whatever is left before the |
| //continuation |
| charsRead = Math.min(charsRead, calculateCharCount( remainingBytes )); |
| setContinuationCharsRead( charsRead ); |
| if (charsRead == charCount) { |
| //Since all of the characters will have been read, but the entire string (including formatting runs etc) |
| //hasnt, Compute the number of bytes to skip when the continue record starts |
| continueSkipBytes = offsetForContinuedRecord(0) - (remainingBytes - calculateByteCount(charsRead)); |
| } |
| } |
| processString( data, offset, charsRead ); |
| offset += totalStringSize(); |
| if ( stringContinuesOverContinuation ) |
| { |
| break; |
| } |
| } |
| } |
| |
| // private void dump( final byte[] data, int offset, int length ) |
| // { |
| // try |
| // { |
| // System.out.println( "------------------- SST DUMP -------------------------" ); |
| // HexDump.dump( (byte[]) data, offset, System.out, offset, length ); |
| // } |
| // catch ( IOException e ) |
| // { |
| // } |
| // catch ( ArrayIndexOutOfBoundsException e ) |
| // { |
| // } |
| // catch ( IllegalArgumentException e ) |
| // { |
| // } |
| // } |
| |
| /** |
| * Detemines the option types for the string (ie, compressed or uncompressed unicode, rich text string or |
| * plain string etc) and calculates the length and offset for the string. |
| * |
| */ |
| private void readStringHeader( final byte[] data, final int index ) |
| { |
| |
| byte optionFlag = data[index + LittleEndianConsts.SHORT_SIZE]; |
| |
| wideChar = ( optionFlag & 1 ) == 1; |
| extendedText = ( optionFlag & 4 ) == 4; |
| richText = ( optionFlag & 8 ) == 8; |
| runCount = 0; |
| if ( richText ) |
| { |
| runCount = LittleEndian.getShort( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD ); |
| } |
| extensionLength = 0; |
| if ( extendedText ) |
| { |
| extensionLength = LittleEndian.getInt( data, index + SSTRecord.STRING_MINIMAL_OVERHEAD |
| + (richText ? LittleEndianConsts.SHORT_SIZE : 0) ); |
| } |
| |
| } |
| |
| |
| /** |
| * Reads a string or the first part of a string. |
| * |
| * @param characters the number of characters to write. |
| * |
| * @return the number of bytes written. |
| */ |
| private int processString( final byte[] data, final int dataIndex, final int characters ) |
| { |
| |
| // length is the length we store it as. not the length that is read. |
| int length = SSTRecord.STRING_MINIMAL_OVERHEAD + calculateByteCount( characters ); |
| byte[] unicodeStringBuffer = new byte[length]; |
| |
| int offset = 0; |
| |
| // Set the length in characters |
| LittleEndian.putUShort( unicodeStringBuffer, offset, characters ); |
| offset += LittleEndianConsts.SHORT_SIZE; |
| // Set the option flags |
| unicodeStringBuffer[offset] = data[dataIndex + offset]; |
| // Copy in the string data |
| int bytesRead = unicodeStringBuffer.length - SSTRecord.STRING_MINIMAL_OVERHEAD; |
| arraycopy( data, dataIndex + stringHeaderOverhead(), unicodeStringBuffer, SSTRecord.STRING_MINIMAL_OVERHEAD, bytesRead ); |
| // Create the unicode string |
| UnicodeString string = new UnicodeString( UnicodeString.sid, |
| (short) unicodeStringBuffer.length, |
| unicodeStringBuffer ); |
| setContinuationCharsRead( calculateCharCount(bytesRead)); |
| |
| if ( isStringFinished() ) |
| { |
| Integer integer = new Integer( strings.size() ); |
| addToStringTable( strings, integer, string ); |
| } |
| else |
| { |
| unfinishedString = string.getString(); |
| } |
| |
| return bytesRead; |
| } |
| |
| private boolean isStringFinished() |
| { |
| return getContinuationCharsRead() == charCount; |
| } |
| |
| /** |
| * Okay, we are doing some major cheating here. Because we can't handle rich text strings properly |
| * we end up getting duplicate strings. To get around this I'm doing two things: 1. Converting rich |
| * text to normal text and 2. If there's a duplicate I'm adding a space onto the end. Sneaky perhaps |
| * but it gets the job done until we can handle this a little better. |
| */ |
| static public void addToStringTable( BinaryTree strings, Integer integer, UnicodeString string ) |
| { |
| |
| if ( string.isRichText() ) |
| string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~8 ) ) ); |
| if ( string.isExtendedText() ) |
| string.setOptionFlags( (byte) ( string.getOptionFlags() & ( ~4 ) ) ); |
| |
| boolean added = false; |
| while ( added == false ) |
| { |
| try |
| { |
| strings.put( integer, string ); |
| added = true; |
| } |
| catch ( Exception ignore ) |
| { |
| string.setString( string.getString() + " " ); |
| } |
| } |
| |
| } |
| |
| |
| private int calculateCharCount( final int byte_count ) |
| { |
| return byte_count / ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE ); |
| } |
| |
| /** |
| * Process a Continue record. A Continue record for an SST record |
| * contains the same kind of data that the SST record contains, |
| * with the following exceptions: |
| * <P> |
| * <OL> |
| * <LI>The string counts at the beginning of the SST record are |
| * not in the Continue record |
| * <LI>The first string in the Continue record might NOT begin |
| * with a size. If the last string in the previous record is |
| * continued in this record, the size is determined by that |
| * last string in the previous record; the first string will |
| * begin with a flag byte, followed by the remaining bytes (or |
| * words) of the last string from the previous |
| * record. Otherwise, the first string in the record will |
| * begin with a string length |
| * </OL> |
| * |
| * @param record the Continue record's byte data |
| */ |
| public void processContinueRecord( final byte[] record ) |
| { |
| if ( isStringFinished() ) |
| { |
| final int offset = continueSkipBytes; |
| initVars(); |
| manufactureStrings( record, offset); |
| } |
| else |
| { |
| // reset the wide bit because that can change across a continuation. the fact that it's |
| // actually rich text doesn't change across continuations even though the rich text |
| // may on longer be set in the "new" option flag. confusing huh? |
| wideChar = ( record[0] & 1 ) == 1; |
| |
| if ( stringSpansContinuation( record.length - LittleEndianConsts.BYTE_SIZE ) ) |
| { |
| processEntireContinuation( record ); |
| } |
| else |
| { |
| readStringRemainder( record ); |
| } |
| } |
| |
| } |
| |
| /** |
| * Reads the remainder string and any subsequent strings from the continuation record. |
| * |
| * @param record The entire continuation record data. |
| */ |
| private void readStringRemainder( final byte[] record ) |
| { |
| int stringRemainderSizeInBytes = calculateByteCount( charCount-getContinuationCharsRead() ); |
| byte[] unicodeStringData = new byte[SSTRecord.STRING_MINIMAL_OVERHEAD |
| + stringRemainderSizeInBytes]; |
| |
| // write the string length |
| LittleEndian.putShort( unicodeStringData, 0, (short) (charCount-getContinuationCharsRead()) ); |
| |
| // write the options flag |
| unicodeStringData[LittleEndianConsts.SHORT_SIZE] = createOptionByte( wideChar, richText, extendedText ); |
| |
| // copy the bytes/words making up the string; skipping |
| // past all the overhead of the str_data array |
| arraycopy( record, LittleEndianConsts.BYTE_SIZE, unicodeStringData, |
| SSTRecord.STRING_MINIMAL_OVERHEAD, |
| stringRemainderSizeInBytes ); |
| |
| // use special constructor to create the final string |
| UnicodeString string = new UnicodeString( UnicodeString.sid, |
| (short) unicodeStringData.length, unicodeStringData, |
| unfinishedString ); |
| Integer integer = new Integer( strings.size() ); |
| |
| addToStringTable( strings, integer, string ); |
| |
| int newOffset = offsetForContinuedRecord( stringRemainderSizeInBytes ); |
| manufactureStrings( record, newOffset); |
| } |
| |
| /** |
| * Calculates the size of the string in bytes based on the character width |
| */ |
| private int stringSizeInBytes() |
| { |
| return calculateByteCount( charCount ); |
| } |
| |
| /** |
| * Calculates the size of the string in byes. This figure includes all the over |
| * heads for the string. |
| */ |
| private int totalStringSize() |
| { |
| return stringSizeInBytes() |
| + stringHeaderOverhead() |
| + LittleEndianConsts.INT_SIZE * runCount |
| + extensionLength; |
| } |
| |
| private int stringHeaderOverhead() |
| { |
| return SSTRecord.STRING_MINIMAL_OVERHEAD |
| + ( richText ? LittleEndianConsts.SHORT_SIZE : 0 ) |
| + ( extendedText ? LittleEndianConsts.INT_SIZE : 0 ); |
| } |
| |
| private int offsetForContinuedRecord( int stringRemainderSizeInBytes ) |
| { |
| int offset = stringRemainderSizeInBytes + runCount * LittleEndianConsts.INT_SIZE + extensionLength; |
| if (stringRemainderSizeInBytes != 0) |
| //If a portion of the string remains then the wideChar options byte is repeated, |
| //so need to skip this. |
| offset += + LittleEndianConsts.BYTE_SIZE; |
| return offset; |
| } |
| |
| private byte createOptionByte( boolean wideChar, boolean richText, boolean farEast ) |
| { |
| return (byte) ( ( wideChar ? 1 : 0 ) + ( farEast ? 4 : 0 ) + ( richText ? 8 : 0 ) ); |
| } |
| |
| /** |
| * If the continued record is so long is spans into the next continue then |
| * simply suck the remaining string data into the existing <code>unfinishedString</code>. |
| * |
| * @param record The data from the continuation record. |
| */ |
| private void processEntireContinuation( final byte[] record ) |
| { |
| // create artificial data to create a UnicodeString |
| int dataLengthInBytes = record.length - LittleEndianConsts.BYTE_SIZE; |
| byte[] unicodeStringData = new byte[record.length + LittleEndianConsts.SHORT_SIZE]; |
| |
| int charsRead = calculateCharCount( dataLengthInBytes ); |
| LittleEndian.putShort( unicodeStringData, (byte) 0, (short) charsRead ); |
| arraycopy( record, 0, unicodeStringData, LittleEndianConsts.SHORT_SIZE, record.length ); |
| UnicodeString ucs = new UnicodeString( UnicodeString.sid, (short) unicodeStringData.length, unicodeStringData, unfinishedString); |
| |
| unfinishedString = ucs.getString(); |
| setContinuationCharsRead( getContinuationCharsRead() + charsRead ); |
| if (getContinuationCharsRead() == charCount) { |
| Integer integer = new Integer( strings.size() ); |
| addToStringTable( strings, integer, ucs ); |
| } |
| } |
| |
| private boolean stringSpansContinuation( int continuationSizeInBytes ) |
| { |
| return calculateByteCount( charCount - getContinuationCharsRead() ) > continuationSizeInBytes; |
| } |
| |
| /** |
| * @return the number of characters we expect in the first |
| * sub-record in a subsequent continuation record |
| */ |
| |
| int getContinuationCharsRead() |
| { |
| return continuationReadChars; |
| } |
| |
| private void setContinuationCharsRead( final int count ) |
| { |
| continuationReadChars = count; |
| } |
| |
| private int calculateByteCount( final int character_count ) |
| { |
| return character_count * ( wideChar ? LittleEndianConsts.SHORT_SIZE : LittleEndianConsts.BYTE_SIZE ); |
| } |
| |
| |
| /** |
| * Copies an array from the specified source array, beginning at the |
| * specified position, to the specified position of the destination array. |
| * A subsequence of array components are copied from the source |
| * array referenced by <code>src</code> to the destination array |
| * referenced by <code>dst</code>. The number of components copied is |
| * equal to the <code>length</code> argument. The components at |
| * positions <code>srcOffset</code> through |
| * <code>srcOffset+length-1</code> in the source array are copied into |
| * positions <code>dstOffset</code> through |
| * <code>dstOffset+length-1</code>, respectively, of the destination |
| * array. |
| * <p> |
| * If the <code>src</code> and <code>dst</code> arguments refer to the |
| * same array object, then the copying is performed as if the |
| * components at positions <code>srcOffset</code> through |
| * <code>srcOffset+length-1</code> were first copied to a temporary |
| * array with <code>length</code> components and then the contents of |
| * the temporary array were copied into positions |
| * <code>dstOffset</code> through <code>dstOffset+length-1</code> of the |
| * destination array. |
| * <p> |
| * If <code>dst</code> is <code>null</code>, then a |
| * <code>NullPointerException</code> is thrown. |
| * <p> |
| * If <code>src</code> is <code>null</code>, then a |
| * <code>NullPointerException</code> is thrown and the destination |
| * array is not modified. |
| * <p> |
| * Otherwise, if any of the following is true, an |
| * <code>ArrayStoreException</code> is thrown and the destination is |
| * not modified: |
| * <ul> |
| * <li>The <code>src</code> argument refers to an object that is not an |
| * array. |
| * <li>The <code>dst</code> argument refers to an object that is not an |
| * array. |
| * <li>The <code>src</code> argument and <code>dst</code> argument refer to |
| * arrays whose component types are different primitive types. |
| * <li>The <code>src</code> argument refers to an array with a primitive |
| * component type and the <code>dst</code> argument refers to an array |
| * with a reference component type. |
| * <li>The <code>src</code> argument refers to an array with a reference |
| * component type and the <code>dst</code> argument refers to an array |
| * with a primitive component type. |
| * </ul> |
| * <p> |
| * Otherwise, if any of the following is true, an |
| * <code>IndexOutOfBoundsException</code> is |
| * thrown and the destination is not modified: |
| * <ul> |
| * <li>The <code>srcOffset</code> argument is negative. |
| * <li>The <code>dstOffset</code> argument is negative. |
| * <li>The <code>length</code> argument is negative. |
| * <li><code>srcOffset+length</code> is greater than |
| * <code>src.length</code>, the length of the source array. |
| * <li><code>dstOffset+length</code> is greater than |
| * <code>dst.length</code>, the length of the destination array. |
| * </ul> |
| * <p> |
| * Otherwise, if any actual component of the source array from |
| * position <code>srcOffset</code> through |
| * <code>srcOffset+length-1</code> cannot be converted to the component |
| * type of the destination array by assignment conversion, an |
| * <code>ArrayStoreException</code> is thrown. In this case, let |
| * <b><i>k</i></b> be the smallest nonnegative integer less than |
| * length such that <code>src[srcOffset+</code><i>k</i><code>]</code> |
| * cannot be converted to the component type of the destination |
| * array; when the exception is thrown, source array components from |
| * positions <code>srcOffset</code> through |
| * <code>srcOffset+</code><i>k</i><code>-1</code> |
| * will already have been copied to destination array positions |
| * <code>dstOffset</code> through |
| * <code>dstOffset+</code><i>k</I><code>-1</code> and no other |
| * positions of the destination array will have been modified. |
| * (Because of the restrictions already itemized, this |
| * paragraph effectively applies only to the situation where both |
| * arrays have component types that are reference types.) |
| * |
| * @param src the source array. |
| * @param src_position start position in the source array. |
| * @param dst the destination array. |
| * @param dst_position pos start position in the destination data. |
| * @param length the number of array elements to be copied. |
| * @exception IndexOutOfBoundsException if copying would cause |
| * access of data outside array bounds. |
| * @exception ArrayStoreException if an element in the <code>src</code> |
| * array could not be stored into the <code>dest</code> array |
| * because of a type mismatch. |
| * @exception NullPointerException if either <code>src</code> or |
| * <code>dst</code> is <code>null</code>. |
| */ |
| private void arraycopy( byte[] src, int src_position, |
| byte[] dst, int dst_position, |
| int length ) |
| { |
| System.arraycopy( src, src_position, dst, dst_position, length ); |
| } |
| |
| /** |
| * @return the unfinished string |
| */ |
| String getUnfinishedString() |
| { |
| return unfinishedString; |
| } |
| |
| /** |
| * @return true if current string uses wide characters |
| */ |
| boolean isWideChar() |
| { |
| return wideChar; |
| } |
| |
| |
| } |