poi-scratchpad/src/main/java/org/apache/poi/hwpf/model/SectionTable.java - poi - Git at Google

 /* ====================================================================
      Licensed to the Apache Software Foundation (ASF) under one or more
      contributor license agreements.    See the NOTICE file distributed with
      this work for additional information regarding copyright ownership.
      The ASF licenses this file to You under the Apache License, Version 2.0
      (the "License"); you may not use this file except in compliance with
      the License.    You may obtain a copy of the License at

              http://www.apache.org/licenses/LICENSE-2.0

      Unless required by applicable law or agreed to in writing, software
      distributed under the License is distributed on an "AS IS" BASIS,
      WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
      See the License for the specific language governing permissions and
      limitations under the License.
 ==================================================================== */

 package org.apache.poi.hwpf.model;

 import java.io.ByteArrayOutputStream;
 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.logging.log4j.LogManager;
 import org.apache.logging.log4j.Logger;
 import org.apache.poi.hwpf.model.io.HWPFFileSystem;
 import org.apache.poi.util.IOUtils;
 import org.apache.poi.util.Internal;
 import org.apache.poi.util.LittleEndian;
 import org.apache.poi.util.LittleEndianConsts;

 /**
  * In-memory model of the section table of a Word document.
  * <p>
  * The table is read from a plex of section descriptors in the table stream;
  * every plex entry becomes one {@link SEPX} whose property bytes (grpprl)
  * are copied out of the document stream. {@link #writeTo(ByteArrayOutputStream,
  * ByteArrayOutputStream)} performs the reverse operation.
  * <p>
  * Section boundaries are kept exactly as stored in the plex. An earlier
  * {@code CPtoFC(int)} helper that translated them through the text pieces
  * was taken out of use because it did not work with non-contiguous text
  * pieces.
  */
 @Internal
 public class SectionTable
 {

     //arbitrarily selected; may need to increase
     private static final int MAX_RECORD_LENGTH = 100_000;

     private static final Logger LOG = LogManager.getLogger(SectionTable.class);

     /** Per-entry data size handed to {@link PlexOfCps} for the section descriptors. */
     private static final int SED_SIZE = 12;

     /** Descriptor offset (all bits set, i.e. -1) for which no grpprl is read. */
     private static final int NO_SEPX_FC = 0xffffffff;

     /** Sections held by this table; sorted by start once the reading constructor finishes. */
     protected List<SEPX> _sections = new ArrayList<>();

     /** Text pieces of the document; only stored here, not consulted by this class. */
     protected List<TextPiece> _text;

     /**
      * Creates an empty section table.
      */
     public SectionTable()
     {
     }


     /**
      * Reads the section table.
      *
      * @param documentStream bytes the section property data (size-prefixed grpprl) is read from
      * @param tableStream bytes containing the plex of section descriptors
      * @param offset start of the plex inside {@code tableStream}
      * @param size size of the plex, as passed on to {@link PlexOfCps}
      * @param fcMin not used by this constructor
      * @param tpt text piece table; its text pieces are stored in {@link #_text}
      * @param mainLength length that a section end is compared against to detect
      *            documents whose section boundaries look inconsistent
      */
     public SectionTable(
             byte[] documentStream, byte[] tableStream,
             int offset, int size, int fcMin, TextPieceTable tpt, int mainLength)
     {
         PlexOfCps sedPlex = new PlexOfCps(tableStream, offset, size, SED_SIZE);
         this._text = tpt.getTextPieces();

         int length = sedPlex.length();

         for (int x = 0; x < length; x++)
         {
             GenericPropertyNode node = sedPlex.getProperty(x);
             SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);

             int fileOffset = sed.getFc();
             int startAt = node.getStart();
             int endAt = node.getEnd();

             _sections.add(new SEPX(sed, startAt, endAt, readGrpprl(documentStream, fileOffset)));
         }

         // Some files seem to lie about their unicode status, which
         //    is very very pesky. Try to work around these, but this
         //    is getting on for black magic...
         boolean matchAt = false;
         boolean matchHalf = false;
         for (SEPX s : _sections) {
             int end = s.getEnd();
             if (end == mainLength) {
                 matchAt = true;
             } else if (end == mainLength - 1) {
                 // The former "end == mainLength" alternative of this branch was
                 // unreachable: that case is always taken by the branch above.
                 matchHalf = true;
             }
         }
         if (!matchAt && matchHalf) {
             LOG.atWarn().log("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
             // Re-apply the boundaries stored in the plex to every section.
             for (int i = 0; i < _sections.size(); i++) {
                 SEPX s = _sections.get(i);
                 GenericPropertyNode node = sedPlex.getProperty(i);

                 int startAt = node.getStart();
                 int endAt = node.getEnd();
                 s.setStart(startAt);
                 s.setEnd(endAt);
             }
         }

         _sections.sort(PropertyNode.StartComparator);
     }

     /**
      * Copies the grpprl of one section out of the document stream.
      *
      * @param documentStream bytes to read from
      * @param fileOffset offset taken from the section descriptor
      * @return an empty array when {@code fileOffset} is {@link #NO_SEPX_FC},
      *         otherwise the bytes following the two-byte size found at
      *         {@code fileOffset}
      */
     private static byte[] readGrpprl(byte[] documentStream, int fileOffset)
     {
         // check for the optimization
         if (fileOffset == NO_SEPX_FC)
         {
             return new byte[0];
         }

         // The first short at the offset is the size of the grpprl.
         int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
         int dataOffset = fileOffset + LittleEndianConsts.SHORT_SIZE;
         return IOUtils.safelyClone(documentStream, dataOffset, sepxSize, MAX_RECORD_LENGTH);
     }

     /**
      * Accounts for {@code length} units having been inserted into the section
      * at {@code listIndex}: that section's end grows by {@code length}, and
      * every later section in the list has both its start and its end shifted
      * by {@code length}.
      *
      * @param listIndex index into {@link #getSections()} of the section that received the insert
      * @param length amount to add to the affected boundaries
      */
     public void adjustForInsert(int listIndex, int length)
     {
         int size = _sections.size();
         SEPX sepx = _sections.get(listIndex);
         sepx.setEnd(sepx.getEnd() + length);

         for (int x = listIndex + 1; x < size; x++)
         {
             sepx = _sections.get(x);
             sepx.setStart(sepx.getStart() + length);
             sepx.setEnd(sepx.getEnd() + length);
         }
     }

     /**
      * @return the list of sections held by this table (the internal list
      *         itself, not a copy)
      */
     public List<SEPX> getSections()
     {
         return _sections;
     }

     /**
      * Writes the table into the "WordDocument" and "1Table" streams of
      * {@code sys}.
      *
      * @param sys file system providing the two output streams
      * @param fcMin not used
      * @throws IOException if writing to a stream fails
      * @deprecated use {@link #writeTo(ByteArrayOutputStream, ByteArrayOutputStream)}
      */
     @Deprecated
     public void writeTo( HWPFFileSystem sys, int fcMin ) throws IOException
     {
         ByteArrayOutputStream docStream = sys.getStream( "WordDocument" );
         ByteArrayOutputStream tableStream = sys.getStream( "1Table" );

         writeTo( docStream, tableStream );
     }

     /**
      * Writes every section's grpprl, prefixed by its two-byte size, to the
      * end of {@code wordDocumentStream}, records the position of each in the
      * corresponding section descriptor, and finally writes the plex of
      * section descriptors to {@code tableStream}.
      *
      * @param wordDocumentStream stream receiving the size-prefixed grpprl data
      * @param tableStream stream receiving the plex of section descriptors
      * @throws IOException if writing to a stream fails
      */
     public void writeTo(
             ByteArrayOutputStream wordDocumentStream,
             ByteArrayOutputStream tableStream ) throws IOException
     {

         int offset = wordDocumentStream.size();
         PlexOfCps plex = new PlexOfCps(SED_SIZE);

         for (SEPX sepx : _sections) {
             byte[] grpprl = sepx.getGrpprl();

             // write the sepx to the document stream. starts with a 2 byte size
             // followed by the grpprl. The size is narrowed to a short as-is.
             byte[] shortBuf = new byte[2];
             LittleEndian.putShort(shortBuf, 0, (short) grpprl.length);

             wordDocumentStream.write(shortBuf);
             wordDocumentStream.write(grpprl);

             // set the fc in the section descriptor
             SectionDescriptor sed = sepx.getSectionDescriptor();
             sed.setFc(offset);

             // add the section descriptor bytes to the PlexOfCps.
             // The node deliberately uses the SEPX's own start/end: converting
             // them through the text piece table was tried and changed the
             // byte offsets of characters when Bug45743.doc was written out
             // and read back.
             GenericPropertyNode property = new GenericPropertyNode(
                     sepx.getStart(), sepx.getEnd(), sed.toByteArray());

             plex.addProperty(property);

             // the next grpprl starts where the stream currently ends
             offset = wordDocumentStream.size();
         }
         tableStream.write(plex.toByteArray());
     }
 }
	/* ====================================================================
	Licensed to the Apache Software Foundation (ASF) under one or more
	contributor license agreements. See the NOTICE file distributed with
	this work for additional information regarding copyright ownership.
	The ASF licenses this file to You under the Apache License, Version 2.0
	(the "License"); you may not use this file except in compliance with
	the License. You may obtain a copy of the License at

	http://www.apache.org/licenses/LICENSE-2.0

	Unless required by applicable law or agreed to in writing, software
	distributed under the License is distributed on an "AS IS" BASIS,
	WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	See the License for the specific language governing permissions and
	limitations under the License.
	==================================================================== */

	package org.apache.poi.hwpf.model;

	import java.io.ByteArrayOutputStream;
	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.logging.log4j.LogManager;
	import org.apache.logging.log4j.Logger;
	import org.apache.poi.hwpf.model.io.HWPFFileSystem;
	import org.apache.poi.util.IOUtils;
	import org.apache.poi.util.Internal;
	import org.apache.poi.util.LittleEndian;
	import org.apache.poi.util.LittleEndianConsts;

	@Internal
	public class SectionTable
	{

	//arbitrarily selected; may need to increase
	private static final int MAX_RECORD_LENGTH = 100_000;

	private static final Logger LOG = LogManager.getLogger(SectionTable.class);
	private static final int SED_SIZE = 12;

	protected List<SEPX> _sections = new ArrayList<>();
	protected List<TextPiece> _text;

	/** So we can know if things are unicode or not */
	//private TextPieceTable tpt;

	public SectionTable()
	{
	}


	public SectionTable(
	byte[] documentStream, byte[] tableStream,
	int offset, int size, int fcMin, TextPieceTable tpt, int mainLength)
	{
	PlexOfCps sedPlex = new PlexOfCps(tableStream, offset, size, SED_SIZE);
	//this.tpt = tpt;
	this._text = tpt.getTextPieces();

	int length = sedPlex.length();

	for (int x = 0; x < length; x++)
	{
	GenericPropertyNode node = sedPlex.getProperty(x);
	SectionDescriptor sed = new SectionDescriptor(node.getBytes(), 0);

	int fileOffset = sed.getFc();
	// int startAt = CPtoFC(node.getStart());
	// int endAt = CPtoFC(node.getEnd());
	int startAt = node.getStart();
	int endAt = node.getEnd();

	// check for the optimization
	if (fileOffset == 0xffffffff)
	{
	_sections.add(new SEPX(sed, startAt, endAt, new byte[0]));
	}
	else
	{
	// The first short at the offset is the size of the grpprl.
	int sepxSize = LittleEndian.getShort(documentStream, fileOffset);
	fileOffset += LittleEndianConsts.SHORT_SIZE;
	byte[] buf = IOUtils.safelyClone(documentStream, fileOffset, sepxSize, MAX_RECORD_LENGTH);
	_sections.add(new SEPX(sed, startAt, endAt, buf));
	}
	}

	// Some files seem to lie about their unicode status, which
	// is very very pesky. Try to work around these, but this
	// is getting on for black magic...
	boolean matchAt = false;
	boolean matchHalf = false;
	for (SEPX s : _sections) {
	if (s.getEnd() == mainLength) {
	matchAt = true;
	} else if (s.getEnd() == mainLength \|\| s.getEnd() == mainLength - 1) {
	matchHalf = true;
	}
	}
	if(! matchAt && matchHalf) {
	LOG.atWarn().log("Your document seemed to be mostly unicode, but the section definition was in bytes! Trying anyway, but things may well go wrong!");
	for(int i=0; i<_sections.size(); i++) {
	SEPX s = _sections.get(i);
	GenericPropertyNode node = sedPlex.getProperty(i);

	// s.setStart( CPtoFC(node.getStart()) );
	// s.setEnd( CPtoFC(node.getEnd()) );
	int startAt = node.getStart();
	int endAt = node.getEnd();
	s.setStart( startAt );
	s.setEnd( endAt );
	}
	}

	_sections.sort(PropertyNode.StartComparator);
	}

	public void adjustForInsert(int listIndex, int length)
	{
	int size = _sections.size();
	SEPX sepx = _sections.get(listIndex);
	sepx.setEnd(sepx.getEnd() + length);

	for (int x = listIndex + 1; x < size; x++)
	{
	sepx = _sections.get(x);
	sepx.setStart(sepx.getStart() + length);
	sepx.setEnd(sepx.getEnd() + length);
	}
	}

	// goss version of CPtoFC - this takes into account non-contiguous textpieces
	// that we have come across in real world documents. Tests against the example
	// code in HWPFDocument show no variation to Ryan's version of the code in
	// normal use, but this version works with our non-contiguous test case.
	// So far unable to get this test case to be written out as well due to
	// other issues. - piers
	//
	// i'm commenting this out, because it just doesn't work with non-contiguous
	// textpieces :( Usual (as for PAPX and CHPX) call to TextPiecesTable does.
	// private int CPtoFC(int CP)
	// {
	// TextPiece TP = null;
	//
	// for(int i=_text.size()-1; i>-1; i--)
	// {
	// TP = _text.get(i);
	//
	// if(CP >= TP.getCP()) break;
	// }
	// int FC = TP.getPieceDescriptor().getFilePosition();
	// int offset = CP - TP.getCP();
	// if (TP.isUnicode()) {
	// offset = offset*2;
	// }
	// FC = FC+offset;
	// return FC;
	// }

	public List<SEPX> getSections()
	{
	return _sections;
	}

	@Deprecated
	public void writeTo( HWPFFileSystem sys, int fcMin ) throws IOException
	{
	ByteArrayOutputStream docStream = sys.getStream( "WordDocument" );
	ByteArrayOutputStream tableStream = sys.getStream( "1Table" );

	writeTo( docStream, tableStream );
	}

	public void writeTo(
	ByteArrayOutputStream wordDocumentStream,
	ByteArrayOutputStream tableStream ) throws IOException
	{

	int offset = wordDocumentStream.size();
	//int len = _sections.size();
	PlexOfCps plex = new PlexOfCps(SED_SIZE);

	for (SEPX sepx : _sections) {
	byte[] grpprl = sepx.getGrpprl();

	// write the sepx to the document stream. starts with a 2 byte size
	// followed by the grpprl
	byte[] shortBuf = new byte[2];
	LittleEndian.putShort(shortBuf, 0, (short) grpprl.length);

	wordDocumentStream.write(shortBuf);
	wordDocumentStream.write(grpprl);

	// set the fc in the section descriptor
	SectionDescriptor sed = sepx.getSectionDescriptor();
	sed.setFc(offset);

	// add the section descriptor bytes to the PlexOfCps.

	/* original line */
	GenericPropertyNode property = new GenericPropertyNode(
	sepx.getStart(), sepx.getEnd(), sed.toByteArray());
	/*
	* Line using Ryan's FCtoCP() conversion method - unable to observe
	* any effect on our testcases when using this code - piers
	*/
	/*
	* there is an effect on Bug45743.doc actually. writeoutreadback
	* changes byte offset of chars (but preserve string offsets).
	* Changing back to original lines - sergey
	*/
	// GenericPropertyNode property = new GenericPropertyNode(
	// tpt.getCharIndex( sepx.getStartBytes() ),
	// tpt.getCharIndex( sepx.getEndBytes() ), sed.toByteArray() );

	plex.addProperty(property);

	offset = wordDocumentStream.size();
	}
	tableStream.write(plex.toByteArray());
	}
	}