pdfbox/src/main/java/org/apache/pdfbox/multipdf/Splitter.java - pdfbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.pdfbox.multipdf;

 import java.io.IOException;
 import java.util.ArrayList;
 import java.util.List;

 import org.apache.pdfbox.pdmodel.PDDocument;
 import org.apache.pdfbox.pdmodel.PDPage;
 import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
 import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
 import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
 import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;

 /**
  * Split a document into several other documents.
  *
  * @author Mario Ivankovits
  * @author Ben Litchfield
  */
 public class Splitter
 {
     private PDDocument sourceDocument;
     private PDDocument currentDestinationDocument;

     // number of pages each destination document should contain
     private int splitLength = 1;
     // 1-based, inclusive page range to extract; the defaults cover every page
     private int startPage = Integer.MIN_VALUE;
     private int endPage = Integer.MAX_VALUE;
     private List<PDDocument> destinationDocuments;

     // 0-based index of the source page that is currently examined
     private int currentPageNumber = 0;

     /**
      * This will take a document and split into several other documents.
      * The per-run state is reset on every call, so the same instance (with
      * its configured split length, start page and end page) can be used to
      * split several documents one after another.
      *
      * @param document The document to split.
      *
      * @return A list of all the split documents.
      *
      * @throws IOException If there is an IOError
      */
     public List<PDDocument> split(PDDocument document) throws IOException
     {
         // reset the state left over from a previous call, otherwise the page
         // counter would continue where the last split stopped
         currentPageNumber = 0;
         currentDestinationDocument = null;
         destinationDocuments = new ArrayList<PDDocument>();
         sourceDocument = document;
         processPages();
         return destinationDocuments;
     }

     /**
      * This will tell the splitting algorithm where to split the pages.  The default
      * is 1, so every page will become a new document.  If it was two then each document would
      * contain 2 pages.  If the source document had 5 pages it would split into
      * 3 new documents, 2 documents containing 2 pages and 1 document containing one
      * page.
      *
      * @param split The number of pages each split document should contain.
      * @throws IllegalArgumentException if the number of pages is smaller than one.
      */
     public void setSplitAtPage(int split)
     {
         if (split <= 0)
         {
             throw new IllegalArgumentException("Number of pages is smaller than one");
         }
         splitLength = split;
     }

     /**
      * This will set the start page (1-based, inclusive).
      *
      * @param start the start page
      * @throws IllegalArgumentException if the start page is smaller than one.
      */
     public void setStartPage(int start)
     {
         if (start <= 0)
         {
             throw new IllegalArgumentException("Start page is smaller than one");
         }
         startPage = start;
     }

     /**
      * This will set the end page (1-based, inclusive).
      *
      * @param end the end page
      * @throws IllegalArgumentException if the end page is smaller than one.
      */
     public void setEndPage(int end)
     {
         if (end <= 0)
         {
             throw new IllegalArgumentException("End page is smaller than one");
         }
         endPage = end;
     }

     /**
      * Walks over the pages of the source document and hands every page that
      * lies within the configured start/end range to {@link #processPage(PDPage)}.
      *
      * @throws IOException If an IO error occurs.
      */
     private void processPages() throws IOException
     {
         for (int i = 0; i < sourceDocument.getNumberOfPages(); i++)
         {
             // currentPageNumber is 0-based while startPage/endPage are 1-based
             int pageNumber = currentPageNumber + 1;
             if (pageNumber > endPage)
             {
                 // nothing behind the end page has to be looked at
                 break;
             }
             if (pageNumber >= startPage)
             {
                 processPage(sourceDocument.getPage(i));
             }
             currentPageNumber++;
         }
     }

     /**
      * Helper method for creating new documents at the appropriate pages.
      * A new document is also created when no destination document exists yet.
      *
      * @throws IOException If there is an error creating the new document.
      */
     private void createNewDocumentIfNecessary() throws IOException
     {
         if (splitAtPage(currentPageNumber) || currentDestinationDocument == null)
         {
             currentDestinationDocument = createNewDocument();
             destinationDocuments.add(currentDestinationDocument);
         }
     }

     /**
      * Check if it is necessary to create a new document.
      * By default a split occurs every <code>splitLength</code> pages, counted
      * from the start page (or from the first page if no start page was set).
      * If you wanted to split based on some complex logic then you could
      * override this method.  For example.
      * <pre>
      * protected boolean splitAtPage(int pageNumber)
      * {
      *     // will split at pages with prime numbers only
      *     return isPrime(pageNumber);
      * }
      * </pre>
      * @param pageNumber the zero-based index of the page that is about to be
      * processed, to be checked as splitting page
      *
      * @return true If a new document should be created.
      */
     protected boolean splitAtPage(int pageNumber)
     {
         // count relative to the first extracted page, otherwise a start page
         // that is not a multiple of splitLength yields a too short first document
         int firstPage = Math.max(1, startPage);
         return (pageNumber + 1 - firstPage) % splitLength == 0;
     }

     /**
      * Create a new document to write the split contents to.  The version,
      * the document information and the viewer preferences are taken over
      * from the source document.
      *
      * @return the newly created PDDocument.
      * @throws IOException If there is an problem creating the new document.
      */
     protected PDDocument createNewDocument() throws IOException
     {
         PDDocument document = new PDDocument();
         document.getDocument().setVersion(getSourceDocument().getVersion());
         document.setDocumentInformation(getSourceDocument().getDocumentInformation());
         document.getDocumentCatalog().setViewerPreferences(
                 getSourceDocument().getDocumentCatalog().getViewerPreferences());
         return document;
     }

     /**
      * Interface to start processing a new page.  The page is imported into
      * the current destination document, which is created first if necessary.
      *
      * @param page The page that is about to get processed.
      *
      * @throws IOException If there is an error creating the new document.
      */
     protected void processPage(PDPage page) throws IOException
     {
         createNewDocumentIfNecessary();

         PDPage imported = getDestinationDocument().importPage(page);
         imported.setResources(page.getResources());
         // remove page links to avoid copying not needed resources
         processAnnotations(imported);
     }

     /**
      * Clears the page references of all annotations of the imported page.
      * For link annotations the page of a page destination is cleared as well,
      * whether the destination is set directly or through a GoTo action.
      *
      * @param imported the page that was imported into the destination document.
      *
      * @throws IOException If a destination cannot be read.
      */
     private void processAnnotations(PDPage imported) throws IOException
     {
         List<PDAnnotation> annotations = imported.getAnnotations();
         for (PDAnnotation annotation : annotations)
         {
             if (annotation instanceof PDAnnotationLink)
             {
                 PDAnnotationLink link = (PDAnnotationLink)annotation;
                 PDDestination destination = link.getDestination();
                 if (destination == null && link.getAction() != null)
                 {
                     // no direct destination, look for one in a GoTo action
                     PDAction action = link.getAction();
                     if (action instanceof PDActionGoTo)
                     {
                         destination = ((PDActionGoTo)action).getDestination();
                     }
                 }
                 if (destination instanceof PDPageDestination)
                 {
                     // TODO preserve links to pages within the splitted result
                     ((PDPageDestination) destination).setPage(null);
                 }
             }
             // TODO preserve links to pages within the splitted result
             annotation.setPage(null);
         }
     }

     /**
      * The source PDF document.
      *
      * @return the pdf to be splitted
      */
     protected final PDDocument getSourceDocument()
     {
         return sourceDocument;
     }

     /**
      * The destination PDF document that pages are currently imported into.
      *
      * @return current destination pdf
      */
     protected final PDDocument getDestinationDocument()
     {
         return currentDestinationDocument;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.pdfbox.multipdf;

	import java.io.IOException;
	import java.util.ArrayList;
	import java.util.List;

	import org.apache.pdfbox.pdmodel.PDDocument;
	import org.apache.pdfbox.pdmodel.PDPage;
	import org.apache.pdfbox.pdmodel.interactive.action.PDAction;
	import org.apache.pdfbox.pdmodel.interactive.action.PDActionGoTo;
	import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotation;
	import org.apache.pdfbox.pdmodel.interactive.annotation.PDAnnotationLink;
	import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDDestination;
	import org.apache.pdfbox.pdmodel.interactive.documentnavigation.destination.PDPageDestination;

	/**
	* Split a document into several other documents.
	*
	* @author Mario Ivankovits
	* @author Ben Litchfield
	*/
	public class Splitter
	{
	private PDDocument sourceDocument;
	private PDDocument currentDestinationDocument;

	private int splitLength = 1;
	private int startPage = Integer.MIN_VALUE;
	private int endPage = Integer.MAX_VALUE;
	private List<PDDocument> destinationDocuments;

	private int currentPageNumber = 0;

	/**
	* This will take a document and split into several other documents.
	*
	* @param document The document to split.
	*
	* @return A list of all the split documents.
	*
	* @throws IOException If there is an IOError
	*/
	public List<PDDocument> split(PDDocument document) throws IOException
	{
	destinationDocuments = new ArrayList<PDDocument>();
	sourceDocument = document;
	processPages();
	return destinationDocuments;
	}

	/**
	* This will tell the splitting algorithm where to split the pages. The default
	* is 1, so every page will become a new document. If it was two then each document would
	* contain 2 pages. If the source document had 5 pages it would split into
	* 3 new documents, 2 documents containing 2 pages and 1 document containing one
	* page.
	*
	* @param split The number of pages each split document should contain.
	* @throws IllegalArgumentException if the page is smaller than one.
	*/
	public void setSplitAtPage(int split)
	{
	if(split <= 0)
	{
	throw new IllegalArgumentException("Number of pages is smaller than one");
	}
	splitLength = split;
	}

	/**
	* This will set the start page.
	*
	* @param start the start page
	* @throws IllegalArgumentException if the start page is smaller than one.
	*/
	public void setStartPage(int start)
	{
	if(start <= 0)
	{
	throw new IllegalArgumentException("Start page is smaller than one");
	}
	startPage = start;
	}

	/**
	* This will set the end page.
	*
	* @param end the end page
	* @throws IllegalArgumentException if the end page is smaller than one.
	*/
	public void setEndPage(int end)
	{
	if(end <= 0)
	{
	throw new IllegalArgumentException("End page is smaller than one");
	}
	endPage = end;
	}

	/**
	* Interface method to handle the start of the page processing.
	*
	* @throws IOException If an IO error occurs.
	*/
	private void processPages() throws IOException
	{
	for (int i = 0; i < sourceDocument.getNumberOfPages(); i++)
	{
	PDPage page = sourceDocument.getPage(i);
	if (currentPageNumber + 1 >= startPage && currentPageNumber + 1 <= endPage)
	{
	processPage(page);
	currentPageNumber++;
	}
	else
	{
	if (currentPageNumber > endPage)
	{
	break;
	}
	else
	{
	currentPageNumber++;
	}
	}
	}
	}

	/**
	* Helper method for creating new documents at the appropriate pages.
	*
	* @throws IOException If there is an error creating the new document.
	*/
	private void createNewDocumentIfNecessary() throws IOException
	{
	if (splitAtPage(currentPageNumber) \|\| currentDestinationDocument == null)
	{
	currentDestinationDocument = createNewDocument();
	destinationDocuments.add(currentDestinationDocument);
	}
	}

	/**
	* Check if it is necessary to create a new document.
	* By default a split occurs at every page. If you wanted to split
	* based on some complex logic then you could override this method. For example.
	* <code>
	* protected void splitAtPage()
	* {
	* // will split at pages with prime numbers only
	* return isPrime(pageNumber);
	* }
	* </code>
	* @param pageNumber the page number to be checked as splitting page
	*
	* @return true If a new document should be created.
	*/
	protected boolean splitAtPage(int pageNumber)
	{
	return pageNumber % splitLength == 0;
	}

	/**
	* Create a new document to write the split contents to.
	*
	* @return the newly created PDDocument.
	* @throws IOException If there is an problem creating the new document.
	*/
	protected PDDocument createNewDocument() throws IOException
	{
	PDDocument document = new PDDocument();
	document.getDocument().setVersion(getSourceDocument().getVersion());
	document.setDocumentInformation(getSourceDocument().getDocumentInformation());
	document.getDocumentCatalog().setViewerPreferences(
	getSourceDocument().getDocumentCatalog().getViewerPreferences());
	return document;
	}

	/**
	* Interface to start processing a new page.
	*
	* @param page The page that is about to get processed.
	*
	* @throws IOException If there is an error creating the new document.
	*/
	protected void processPage(PDPage page) throws IOException
	{
	createNewDocumentIfNecessary();

	PDPage imported = getDestinationDocument().importPage(page);
	imported.setResources(page.getResources());
	// remove page links to avoid copying not needed resources
	processAnnotations(imported);
	}

	private void processAnnotations(PDPage imported) throws IOException
	{
	List<PDAnnotation> annotations = imported.getAnnotations();
	for (PDAnnotation annotation : annotations)
	{
	if (annotation instanceof PDAnnotationLink)
	{
	PDAnnotationLink link = (PDAnnotationLink)annotation;
	PDDestination destination = link.getDestination();
	if (destination == null && link.getAction() != null)
	{
	PDAction action = link.getAction();
	if (action instanceof PDActionGoTo)
	{
	destination = ((PDActionGoTo)action).getDestination();
	}
	}
	if (destination instanceof PDPageDestination)
	{
	// TODO preserve links to pages within the splitted result
	((PDPageDestination) destination).setPage(null);
	}
	}
	// TODO preserve links to pages within the splitted result
	annotation.setPage(null);
	}
	}
	/**
	* The source PDF document.
	*
	* @return the pdf to be splitted
	*/
	protected final PDDocument getSourceDocument()
	{
	return sourceDocument;
	}

	/**
	* The source PDF document.
	*
	* @return current destination pdf
	*/
	protected final PDDocument getDestinationDocument()
	{
	return currentDestinationDocument;
	}
	}