// AOO410/main/xmerge/source/wordsmith/java/org/openoffice/xmerge/converter/xml/sxw/wordsmith/WSDecoder.java - openoffice - Git at Google

 /**************************************************************
  *
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  *************************************************************/


 package org.openoffice.xmerge.converter.xml.sxw.wordsmith;

 import java.io.ByteArrayInputStream;
 import java.io.DataInputStream;
 import java.io.IOException;
 import java.io.FileInputStream;
 import java.io.UnsupportedEncodingException;
 import org.openoffice.xmerge.util.Debug;

 import org.openoffice.xmerge.converter.palm.*;
 import org.openoffice.xmerge.util.Resources;

 /**
  *  This class is used by {@link
  *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.DocumentDeserializerImpl
  *  DocumentDeserializerImpl} to decode a WordSmith format.  The text
  *  records are decompressed when the header says so and joined into a
  *  single <code>byte</code> array, which is then split into an array
  *  of <code>Wse</code> elements.
  *
  *  @author   Herbie Ong, David Proulx
  */
 final class WSDecoder implements DOCConstants {

     /**
      *  Number of low-order bits in a two-byte sequence code that hold
      *  the copy length; the bits above them hold the back-reference
      *  distance.
      */
     private final static int COUNT_BITS = 3;

     /**
      *  Version bit that, according to the original author's note,
      *  indicates OOB data is present.  It is masked off before the
      *  version is compared; the OOB data itself is not handled here.
      */
     private final static int OOB_DATA_FLAG = 4;

     /**
      *  Number of bytes at the start of the decoded data that precede
      *  the first element: a marker byte at offset 0 followed by four
      *  4-byte integers at offsets 2, 6, 10 and 14 (earlier code read
      *  them as paragraph, atom, character and misc-size counts but
      *  never used the values).
      */
     private final static int DOC_HEADER_SIZE = 18;

     /** Value required in the first byte of the decoded document data. */
     private final static int DOC_MARKER = 2;

     /** Resources object for I18N. */
     private final Resources res;

     /**
      *  Default constructor; obtains the shared <code>Resources</code>
      *  instance used for localized error messages.
      */
     WSDecoder() {
         res = Resources.getInstance();
     }

     /**
      *  Decode the text records into a single <code>byte</code> array.
      *  The first record is the header; every following record is a
      *  text record that is decompressed or copied as-is depending on
      *  the header version.
      *
      *  @param  recs  <code>Record</code> array holding WordSmith
      *                  contents.
      *
      *  @return  The contents of all text records, concatenated in
      *           record order.
      *
      *  @throws  IOException  If <code>recs</code> holds no header
      *                        record, the version is unknown, or a
      *                        record cannot be decompressed.
      */
     byte[] parseRecords(Record[] recs) throws IOException {

         if (recs == null || recs.length == 0) {
             throw new IOException("WordSmith data contains no header record");
         }

         // read the header record
         HeaderInfo header = readHeader(recs[0].getBytes());
         dumpHeader(header);

         // Decide how the text records are stored before touching any
         // of them, so an unknown version fails up front.
         boolean compressed;
         switch (header.version & ~OOB_DATA_FLAG) {

             case COMPRESSED:
             case 3:   // DJP: determined this empirically.  Are Herbie's constants wrong?
                 compressed = true;
                 break;

             case UNCOMPRESSED:
                 compressed = false;
                 break;

             default:
                 throw new IOException(res.getString("UNKNOWN_DOC_VERSION"));
         }

         int textRecords = recs.length - 1;
         byte[][] byteArrays = new byte[textRecords][];
         int length = 0;

         for (int i = 0; i < textRecords; i++) {
             byte[] recordBytes = recs[i + 1].getBytes();
             byteArrays[i] = compressed
                 ? decompress(recordBytes, header.textRecordSize)
                 : recordBytes;
             Debug.log(Debug.INFO, "processing " + byteArrays[i].length + " bytes");
             length += byteArrays[i].length;
         }

         // Concatenate byteArrays[][] into a single byte array.
         byte[] bigArray = new byte[length];
         int offset = 0;
         for (int i = 0; i < textRecords; i++) {
             System.arraycopy(byteArrays[i], 0, bigArray, offset,
                              byteArrays[i].length);
             offset += byteArrays[i].length;
         }
         return bigArray;
     }


     /**
      *  Decode the text records into a <code>Wse</code> array.
      *
      *  @param  recs  <code>Record</code> array holding DOC
      *                    contents.
      *
      *  @return  The parsed elements in document order; the array
      *           length equals the number of elements found.
      *
      *  @throws  IOException  If the records cannot be decoded, the
      *                        document header is truncated or has the
      *                        wrong marker, or an element is not
      *                        recognized.
      */
     Wse[] parseDocument(Record[] recs) throws IOException {

         java.util.Vector elements = new java.util.Vector(20, 20);
         WseFontTable fontTable = null;
         WseColorTable colorTable = null;

         // rawData is the document data to be parsed.
         byte[] rawData = parseRecords(recs);

         // beginning of document has some header information, including
         // optional font and color tables.
         // DJP: maybe should add a new WSelement (docHeader) to hold
         // header info.
         if (rawData.length < DOC_HEADER_SIZE) {
             throw new IOException("WordSmith document header is truncated: "
                                   + rawData.length + " bytes");
         }
         if (rawData[0] != DOC_MARKER) {
             throw new IOException("Unexpected WordSmith document marker "
                                   + rawData[0]);
         }

         int curIndex = DOC_HEADER_SIZE;
         while (curIndex < rawData.length) {
             int nextIndex;
             if (WsePara.isValid(rawData, curIndex)) {
                 elements.add(new WsePara(rawData, curIndex));
                 nextIndex = WsePara.computeNewIndex(rawData, curIndex);
             } else if (WseTextRun.isValid(rawData, curIndex)) {
                 // text runs are built against the tables seen so far
                 elements.add(new WseTextRun(rawData, curIndex, fontTable, colorTable));
                 nextIndex = WseTextRun.computeNewIndex(rawData, curIndex);
             } else if (WseFontTable.isValid(rawData, curIndex)) {
                 fontTable = new WseFontTable(rawData, curIndex);
                 elements.add(fontTable);
                 nextIndex = WseFontTable.computeNewIndex(rawData, curIndex);
             } else if (WseColorTable.isValid(rawData, curIndex)) {
                 colorTable = new WseColorTable(rawData, curIndex);
                 elements.add(colorTable);
                 nextIndex = WseColorTable.computeNewIndex(rawData, curIndex);
             } else {
                 Debug.log(Debug.ERROR, "Unknown code " + rawData[curIndex]);
                 throw new IOException("Unknown code " + rawData[curIndex]
                                       + " at offset " + curIndex);
             }

             // An element that does not move the index forward would
             // otherwise keep this loop running forever.
             if (nextIndex <= curIndex) {
                 throw new IOException("Element at offset " + curIndex
                                       + " did not advance the parse position");
             }
             curIndex = nextIndex;
         }

         // Size the target array exactly: toArray() pads a larger array
         // with null, which would hand callers null elements.
         return (Wse[]) elements.toArray(new Wse[elements.size()]);
     }


     /**
      *  <p>Decompress the <code>byte</code> array.</p>
      *
      *  <p>The resulting uncompressed <code>byte</code> array
      *  should be within <code>textRecordSize</code> length,
      *  definitely within twice the size it claims, else treat
      *  it as a problem with the encoding of that PDB and
      *  throw <code>IOException</code>.</p>
      *
      *  @param  cBytes           Compressed <code>byte</code> array
      *  @param  textRecordSize  Size of uncompressed <code>byte</code>
      *                          array
      *
      *  @return  A new array holding exactly the uncompressed bytes.
      *
      *  @throws   IOException  If decoding reads or writes outside the
      *                         buffers, i.e. the output would exceed
      *                         twice <code>textRecordSize</code>, the
      *                         input ends inside a code, or a sequence
      *                         refers back before the start of the
      *                         output.
      */
     private byte[] decompress(byte[] cBytes, int textRecordSize)
     throws IOException {

         // Work buffer of twice the claimed record size.  Any index
         // outside it (or outside cBytes) surfaces as an
         // ArrayIndexOutOfBoundsException, which is reported below as
         // a conversion error.
         byte[] uBytes = new byte[textRecordSize * 2];

         int up = 0;
         int cp = 0;

         try {

             while (cp < cBytes.length) {

                 int c = cBytes[cp++] & 0xff;

                 if (c > 0 && c < 9) {
                     // codes 1...8 mean copy that many bytes
                     while (c-- > 0) {
                         uBytes[up++] = cBytes[cp++];
                     }
                 } else if (c < 0x80) {
                     // codes 0, 9...0x7F represent themselves
                     uBytes[up++] = (byte) c;
                 } else if (c >= 0xC0) {
                     // codes 0xC0...0xFF represent "space + ascii char"
                     uBytes[up++] = (byte) ' ';
                     uBytes[up++] = (byte) (c ^ 0x80);
                 } else {
                     // codes 0x80...0xBF represent sequences: together
                     // with the next byte they give a distance m back
                     // into the output and a length n to copy.
                     c <<= 8;
                     c += cBytes[cp++] & 0xff;
                     int m = (c & 0x3fff) >> COUNT_BITS;
                     int n = (c & ((1 << COUNT_BITS) - 1)) + COUNT_BITS;
                     // byte-by-byte on purpose: source and destination
                     // may overlap when n is larger than m.
                     while (n-- > 0) {
                         uBytes[up] = uBytes[up - m];
                         up++;
                     }
                 }
             }

         } catch (ArrayIndexOutOfBoundsException e) {

             IOException failure = new IOException(
                 res.getString("DOC_TEXT_RECORD_SIZE_EXCEEDED"));
             failure.initCause(e);   // keep the original failure for diagnosis
             throw failure;
         }

         // note that uBytes may be larger than the amount of
         // uncompressed bytes, so trim it to another byte array
         // with the exact size.
         byte[] textBytes = new byte[up];
         System.arraycopy(uBytes, 0, textBytes, 0, up);

         return textBytes;
     }


     /**
      *  Read the header <code>byte</code> array.
      *
      *  @param  bytes  <code>byte</code> array containing header
      *                 record data.
      *
      *  @return  <code>HeaderInfo</code> object.
      *
      *  @throws  IOException  If the header is too short or the text
      *                        length does not fit in an
      *                        <code>int</code>.
      */
     private HeaderInfo readHeader(byte[] bytes) throws IOException {

         HeaderInfo header = new HeaderInfo();

         DataInputStream dis =
             new DataInputStream(new ByteArrayInputStream(bytes));

         // Normally the first 2 bytes comprise the version, which
         // should either be COMPRESSED or UNCOMPRESSED.
         // SmartDoc/Quickword would add a 0x01 to the first
         // byte, thus their version would be 0x0101 for UNCOMPRESSED
         // instead of 0x0001 and 0x0102 for COMPRESSED instead of
         // 0x0002.  Only the second byte is kept.

         dis.readByte();
         header.version = dis.readByte();

         // read extra 2 unused bytes
         dis.readShort();

         // Read the text length, this should be unsigned 4 bytes.
         // We could store the read value into a long, but then
         // our current buffer limit is the max positive of an int.
         // That is a large enough limit, thus we shall stay with
         // storing the value in an int.  If it exceeds, then
         // an IOException should be thrown.
         header.textLen = dis.readInt();
         if (header.textLen < 0) {
             throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED"));
         }

         // read the number of records - unsigned 2 bytes
         header.textRecordCount = dis.readUnsignedShort();

         // read the record size - unsigned 2 bytes
         header.textRecordSize = dis.readUnsignedShort();

         // read extra 4 unused bytes
         dis.readInt();

         return header;
     }


     /**
      *  Placeholder for dumping header info into the log.
      *  Used for debugging purposes only; currently does nothing.
      *
      *  @param  header  <code>HeaderInfo</code> structure.
      */
     private void dumpHeader(HeaderInfo header) {
         // Intentionally empty.  The disabled implementation emitted a
         // <DOC_INFO> line with the version, text-length,
         // number-of-records and record-size of the header.
     }


     /**
      *  Nested class to store DOC header information.  It is static
      *  because it never needs the enclosing decoder instance.
      */
     private static class HeaderInfo {

         /** length of text section */
         int textLen = 0;

         /** number of text records */
         int textRecordCount = 0;

         /**
          *  size of a text record.  This is normally the same as
          *  TEXT_RECORD_SIZE, but some applications may modify this.
          */
         int textRecordSize = 0;

         /** compression type */
         int version = 0;
     }
 }
	/**************************************************************
	*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	*************************************************************/



	package org.openoffice.xmerge.converter.xml.sxw.wordsmith;

	import java.io.ByteArrayInputStream;
	import java.io.DataInputStream;
	import java.io.IOException;
	import java.io.FileInputStream;
	import java.io.UnsupportedEncodingException;
	import org.openoffice.xmerge.util.Debug;

	import org.openoffice.xmerge.converter.palm.*;
	import org.openoffice.xmerge.util.Resources;

	/**
	 *  This class is used by {@link
	 *  org.openoffice.xmerge.converter.xml.sxw.wordsmith.DocumentDeserializerImpl
	 *  DocumentDeserializerImpl} to decode a WordSmith format.  The text
	 *  records are decompressed when the header says so and joined into a
	 *  single <code>byte</code> array, which is then split into an array
	 *  of <code>Wse</code> elements.
	 *
	 *  @author   Herbie Ong, David Proulx
	 */
	final class WSDecoder implements DOCConstants {

	    /**
	     *  Number of low-order bits in a two-byte sequence code that hold
	     *  the copy length; the bits above them hold the back-reference
	     *  distance.
	     */
	    private final static int COUNT_BITS = 3;

	    /**
	     *  Version bit that, according to the original author's note,
	     *  indicates OOB data is present.  It is masked off before the
	     *  version is compared; the OOB data itself is not handled here.
	     */
	    private final static int OOB_DATA_FLAG = 4;

	    /**
	     *  Number of bytes at the start of the decoded data that precede
	     *  the first element: a marker byte at offset 0 followed by four
	     *  4-byte integers at offsets 2, 6, 10 and 14 (earlier code read
	     *  them as paragraph, atom, character and misc-size counts but
	     *  never used the values).
	     */
	    private final static int DOC_HEADER_SIZE = 18;

	    /** Value required in the first byte of the decoded document data. */
	    private final static int DOC_MARKER = 2;

	    /** Resources object for I18N. */
	    private final Resources res;

	    /**
	     *  Default constructor; obtains the shared <code>Resources</code>
	     *  instance used for localized error messages.
	     */
	    WSDecoder() {
	        res = Resources.getInstance();
	    }

	    /**
	     *  Decode the text records into a single <code>byte</code> array.
	     *  The first record is the header; every following record is a
	     *  text record that is decompressed or copied as-is depending on
	     *  the header version.
	     *
	     *  @param  recs  <code>Record</code> array holding WordSmith
	     *                  contents.
	     *
	     *  @return  The contents of all text records, concatenated in
	     *           record order.
	     *
	     *  @throws  IOException  If <code>recs</code> holds no header
	     *                        record, the version is unknown, or a
	     *                        record cannot be decompressed.
	     */
	    byte[] parseRecords(Record[] recs) throws IOException {

	        if (recs == null || recs.length == 0) {
	            throw new IOException("WordSmith data contains no header record");
	        }

	        // read the header record
	        HeaderInfo header = readHeader(recs[0].getBytes());
	        dumpHeader(header);

	        // Decide how the text records are stored before touching any
	        // of them, so an unknown version fails up front.
	        boolean compressed;
	        switch (header.version & ~OOB_DATA_FLAG) {

	            case COMPRESSED:
	            case 3:   // DJP: determined this empirically.  Are Herbie's constants wrong?
	                compressed = true;
	                break;

	            case UNCOMPRESSED:
	                compressed = false;
	                break;

	            default:
	                throw new IOException(res.getString("UNKNOWN_DOC_VERSION"));
	        }

	        int textRecords = recs.length - 1;
	        byte[][] byteArrays = new byte[textRecords][];
	        int length = 0;

	        for (int i = 0; i < textRecords; i++) {
	            byte[] recordBytes = recs[i + 1].getBytes();
	            byteArrays[i] = compressed
	                ? decompress(recordBytes, header.textRecordSize)
	                : recordBytes;
	            Debug.log(Debug.INFO, "processing " + byteArrays[i].length + " bytes");
	            length += byteArrays[i].length;
	        }

	        // Concatenate byteArrays[][] into a single byte array.
	        byte[] bigArray = new byte[length];
	        int offset = 0;
	        for (int i = 0; i < textRecords; i++) {
	            System.arraycopy(byteArrays[i], 0, bigArray, offset,
	                             byteArrays[i].length);
	            offset += byteArrays[i].length;
	        }
	        return bigArray;
	    }


	    /**
	     *  Decode the text records into a <code>Wse</code> array.
	     *
	     *  @param  recs  <code>Record</code> array holding DOC
	     *                    contents.
	     *
	     *  @return  The parsed elements in document order; the array
	     *           length equals the number of elements found.
	     *
	     *  @throws  IOException  If the records cannot be decoded, the
	     *                        document header is truncated or has the
	     *                        wrong marker, or an element is not
	     *                        recognized.
	     */
	    Wse[] parseDocument(Record[] recs) throws IOException {

	        java.util.Vector elements = new java.util.Vector(20, 20);
	        WseFontTable fontTable = null;
	        WseColorTable colorTable = null;

	        // rawData is the document data to be parsed.
	        byte[] rawData = parseRecords(recs);

	        // beginning of document has some header information, including
	        // optional font and color tables.
	        // DJP: maybe should add a new WSelement (docHeader) to hold
	        // header info.
	        if (rawData.length < DOC_HEADER_SIZE) {
	            throw new IOException("WordSmith document header is truncated: "
	                                  + rawData.length + " bytes");
	        }
	        if (rawData[0] != DOC_MARKER) {
	            throw new IOException("Unexpected WordSmith document marker "
	                                  + rawData[0]);
	        }

	        int curIndex = DOC_HEADER_SIZE;
	        while (curIndex < rawData.length) {
	            int nextIndex;
	            if (WsePara.isValid(rawData, curIndex)) {
	                elements.add(new WsePara(rawData, curIndex));
	                nextIndex = WsePara.computeNewIndex(rawData, curIndex);
	            } else if (WseTextRun.isValid(rawData, curIndex)) {
	                // text runs are built against the tables seen so far
	                elements.add(new WseTextRun(rawData, curIndex, fontTable, colorTable));
	                nextIndex = WseTextRun.computeNewIndex(rawData, curIndex);
	            } else if (WseFontTable.isValid(rawData, curIndex)) {
	                fontTable = new WseFontTable(rawData, curIndex);
	                elements.add(fontTable);
	                nextIndex = WseFontTable.computeNewIndex(rawData, curIndex);
	            } else if (WseColorTable.isValid(rawData, curIndex)) {
	                colorTable = new WseColorTable(rawData, curIndex);
	                elements.add(colorTable);
	                nextIndex = WseColorTable.computeNewIndex(rawData, curIndex);
	            } else {
	                Debug.log(Debug.ERROR, "Unknown code " + rawData[curIndex]);
	                throw new IOException("Unknown code " + rawData[curIndex]
	                                      + " at offset " + curIndex);
	            }

	            // An element that does not move the index forward would
	            // otherwise keep this loop running forever.
	            if (nextIndex <= curIndex) {
	                throw new IOException("Element at offset " + curIndex
	                                      + " did not advance the parse position");
	            }
	            curIndex = nextIndex;
	        }

	        // Size the target array exactly: toArray() pads a larger array
	        // with null, which would hand callers null elements.
	        return (Wse[]) elements.toArray(new Wse[elements.size()]);
	    }


	    /**
	     *  <p>Decompress the <code>byte</code> array.</p>
	     *
	     *  <p>The resulting uncompressed <code>byte</code> array
	     *  should be within <code>textRecordSize</code> length,
	     *  definitely within twice the size it claims, else treat
	     *  it as a problem with the encoding of that PDB and
	     *  throw <code>IOException</code>.</p>
	     *
	     *  @param  cBytes           Compressed <code>byte</code> array
	     *  @param  textRecordSize  Size of uncompressed <code>byte</code>
	     *                          array
	     *
	     *  @return  A new array holding exactly the uncompressed bytes.
	     *
	     *  @throws   IOException  If decoding reads or writes outside the
	     *                         buffers, i.e. the output would exceed
	     *                         twice <code>textRecordSize</code>, the
	     *                         input ends inside a code, or a sequence
	     *                         refers back before the start of the
	     *                         output.
	     */
	    private byte[] decompress(byte[] cBytes, int textRecordSize)
	    throws IOException {

	        // Work buffer of twice the claimed record size.  Any index
	        // outside it (or outside cBytes) surfaces as an
	        // ArrayIndexOutOfBoundsException, which is reported below as
	        // a conversion error.
	        byte[] uBytes = new byte[textRecordSize * 2];

	        int up = 0;
	        int cp = 0;

	        try {

	            while (cp < cBytes.length) {

	                int c = cBytes[cp++] & 0xff;

	                if (c > 0 && c < 9) {
	                    // codes 1...8 mean copy that many bytes
	                    while (c-- > 0) {
	                        uBytes[up++] = cBytes[cp++];
	                    }
	                } else if (c < 0x80) {
	                    // codes 0, 9...0x7F represent themselves
	                    uBytes[up++] = (byte) c;
	                } else if (c >= 0xC0) {
	                    // codes 0xC0...0xFF represent "space + ascii char"
	                    uBytes[up++] = (byte) ' ';
	                    uBytes[up++] = (byte) (c ^ 0x80);
	                } else {
	                    // codes 0x80...0xBF represent sequences: together
	                    // with the next byte they give a distance m back
	                    // into the output and a length n to copy.
	                    c <<= 8;
	                    c += cBytes[cp++] & 0xff;
	                    int m = (c & 0x3fff) >> COUNT_BITS;
	                    int n = (c & ((1 << COUNT_BITS) - 1)) + COUNT_BITS;
	                    // byte-by-byte on purpose: source and destination
	                    // may overlap when n is larger than m.
	                    while (n-- > 0) {
	                        uBytes[up] = uBytes[up - m];
	                        up++;
	                    }
	                }
	            }

	        } catch (ArrayIndexOutOfBoundsException e) {

	            IOException failure = new IOException(
	                res.getString("DOC_TEXT_RECORD_SIZE_EXCEEDED"));
	            failure.initCause(e);   // keep the original failure for diagnosis
	            throw failure;
	        }

	        // note that uBytes may be larger than the amount of
	        // uncompressed bytes, so trim it to another byte array
	        // with the exact size.
	        byte[] textBytes = new byte[up];
	        System.arraycopy(uBytes, 0, textBytes, 0, up);

	        return textBytes;
	    }


	    /**
	     *  Read the header <code>byte</code> array.
	     *
	     *  @param  bytes  <code>byte</code> array containing header
	     *                 record data.
	     *
	     *  @return  <code>HeaderInfo</code> object.
	     *
	     *  @throws  IOException  If the header is too short or the text
	     *                        length does not fit in an
	     *                        <code>int</code>.
	     */
	    private HeaderInfo readHeader(byte[] bytes) throws IOException {

	        HeaderInfo header = new HeaderInfo();

	        DataInputStream dis =
	            new DataInputStream(new ByteArrayInputStream(bytes));

	        // Normally the first 2 bytes comprise the version, which
	        // should either be COMPRESSED or UNCOMPRESSED.
	        // SmartDoc/Quickword would add a 0x01 to the first
	        // byte, thus their version would be 0x0101 for UNCOMPRESSED
	        // instead of 0x0001 and 0x0102 for COMPRESSED instead of
	        // 0x0002.  Only the second byte is kept.

	        dis.readByte();
	        header.version = dis.readByte();

	        // read extra 2 unused bytes
	        dis.readShort();

	        // Read the text length, this should be unsigned 4 bytes.
	        // We could store the read value into a long, but then
	        // our current buffer limit is the max positive of an int.
	        // That is a large enough limit, thus we shall stay with
	        // storing the value in an int.  If it exceeds, then
	        // an IOException should be thrown.
	        header.textLen = dis.readInt();
	        if (header.textLen < 0) {
	            throw new IOException(res.getString("DOC_TEXT_LENGTH_EXCEEDED"));
	        }

	        // read the number of records - unsigned 2 bytes
	        header.textRecordCount = dis.readUnsignedShort();

	        // read the record size - unsigned 2 bytes
	        header.textRecordSize = dis.readUnsignedShort();

	        // read extra 4 unused bytes
	        dis.readInt();

	        return header;
	    }


	    /**
	     *  Placeholder for dumping header info into the log.
	     *  Used for debugging purposes only; currently does nothing.
	     *
	     *  @param  header  <code>HeaderInfo</code> structure.
	     */
	    private void dumpHeader(HeaderInfo header) {
	        // Intentionally empty.  The disabled implementation emitted a
	        // <DOC_INFO> line with the version, text-length,
	        // number-of-records and record-size of the header.
	    }


	    /**
	     *  Nested class to store DOC header information.  It is static
	     *  because it never needs the enclosing decoder instance.
	     */
	    private static class HeaderInfo {

	        /** length of text section */
	        int textLen = 0;

	        /** number of text records */
	        int textRecordCount = 0;

	        /**
	         *  size of a text record.  This is normally the same as
	         *  TEXT_RECORD_SIZE, but some applications may modify this.
	         */
	        int textRecordSize = 0;

	        /** compression type */
	        int version = 0;
	    }
	}