pdfbox/src/main/java/org/apache/pdfbox/text/PDFMarkedContentExtractor.java - pdfbox - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one or more
  * contributor license agreements.  See the NOTICE file distributed with
  * this work for additional information regarding copyright ownership.
  * The ASF licenses this file to You under the Apache License, Version 2.0
  * (the "License"); you may not use this file except in compliance with
  * the License.  You may obtain a copy of the License at
  *
  *      http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.pdfbox.text;

 import java.io.IOException;
 import java.util.ArrayDeque;
 import java.util.ArrayList;
 import java.util.HashMap;
 import java.util.List;
 import java.util.Map;
 import java.util.Deque;

 import org.apache.pdfbox.cos.COSDictionary;
 import org.apache.pdfbox.cos.COSName;
 import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
 import org.apache.pdfbox.pdmodel.graphics.PDXObject;
 import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
 import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
 import org.apache.pdfbox.contentstream.operator.markedcontent.DrawObject;
 import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;

 /**
  * A stream engine that extracts the marked content of a PDF.
  *
  * Marked-content sequences are collected as a tree: top-level sequences are available through
  * {@link #getMarkedContents()}, and a sequence that begins while another one is still open is
  * attached to that enclosing sequence. Text and XObjects are added to the innermost open
  * sequence; when no sequence is open they are not added to any marked content.
  *
  * @author Johannes Koch
  */
 public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine
 {
     /** Whether text painted again at (almost) the same position is dropped. */
     private boolean suppressDuplicateOverlappingText = true;
     /** Top-level marked-content sequences, in the order they were begun. */
     private final List<PDMarkedContent> markedContents = new ArrayList<>();
     /** Stack of the currently open marked-content sequences, innermost on top. */
     private final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<>();
     /** Text positions already accepted, keyed by their Unicode string (used for duplicate detection). */
     private final Map<String, List<TextPosition>> characterListMapping = new HashMap<>();

     /**
      * Instantiate a new PDFMarkedContentExtractor object.
      *
      * @throws IOException as declared by {@link #PDFMarkedContentExtractor(String)}.
      */
     public PDFMarkedContentExtractor() throws IOException
     {
         this(null);
     }

     /**
      * Constructor. Registers the marked-content and draw-object operators with this engine.
      *
      * @param encoding Not used by this constructor; the parameter is kept so that existing
      * callers continue to compile.
      * @throws IOException part of the declared signature (NOTE: presumably raised by the
      * superclass set-up; none of the statements visible here throws it directly).
      */
     public PDFMarkedContentExtractor(String encoding) throws IOException
     {
         addOperator(new BeginMarkedContentSequenceWithProperties());
         addOperator(new BeginMarkedContentSequence());
         addOperator(new EndMarkedContentSequence());
         addOperator(new DrawObject());
         // todo: MP - Marked Content Point
         // todo: DP - Marked Content Point with Properties
     }

     /**
      * @return the suppressDuplicateOverlappingText setting.
      */
     public boolean isSuppressDuplicateOverlappingText()
     {
         return suppressDuplicateOverlappingText;
     }

     /**
      * By default the class will attempt to remove text that overlaps each other. Word paints the
      * same character several times in order to make it look bold. By setting this to false all text
      * will be extracted, which means that certain sections will be duplicated, but better
      * performance will be noticed.
      *
      * @param suppressDuplicateOverlappingText The suppressDuplicateOverlappingText setting to set.
      */
     public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
     {
         this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
     }

     /**
      * This will determine if two floating point numbers are within a specified variance.
      * Both bounds are exclusive.
      *
      * @param first The first number to compare to.
      * @param second The second number to compare to.
      * @param variance The allowed variance.
      * @return true if {@code second} lies strictly between {@code first - variance} and
      * {@code first + variance}.
      */
     private boolean within( float first, float second, float variance )
     {
         return second > first - variance && second < first + variance;
     }

     /**
      * Opens a new marked-content sequence. The sequence becomes a top-level entry when no other
      * sequence is open, otherwise it is attached as a child of the innermost open sequence.
      *
      * @param tag The tag of the marked-content sequence.
      * @param properties The property list of the sequence, as passed on to
      * {@link PDMarkedContent#create(COSName, COSDictionary)}.
      */
     @Override
     public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
     {
         PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
         PDMarkedContent enclosing = this.currentMarkedContents.peek();
         if (enclosing == null)
         {
             // peek() returns null only for an empty stack: this is a top-level sequence
             this.markedContents.add(markedContent);
         }
         else
         {
             enclosing.addMarkedContent(markedContent);
         }
         this.currentMarkedContents.push(markedContent);
     }

     /**
      * Closes the innermost open marked-content sequence. An end operator without a matching
      * begin is ignored.
      */
     @Override
     public void endMarkedContentSequence()
     {
         if (!this.currentMarkedContents.isEmpty())
         {
             this.currentMarkedContents.pop();
         }
     }

     /**
      * Adds an XObject to the innermost open marked-content sequence. Nothing happens when no
      * sequence is open.
      *
      * @param xobject The XObject to add.
      */
     public void xobject(PDXObject xobject)
     {
         if (!this.currentMarkedContents.isEmpty())
         {
             this.currentMarkedContents.peek().addXObject(xobject);
         }
     }

     /**
      * This will process a TextPosition object and add the text to the innermost open
      * marked-content sequence. When duplicate suppression is enabled, a text position that has
      * the same Unicode string as an earlier one and lies within the tolerance of its position
      * is dropped.
      *
      * @param text The text to process.
      */
     @Override
     protected void processTextPosition( TextPosition text )
     {
         if( this.suppressDuplicateOverlappingText )
         {
             String textCharacter = text.getUnicode();
             float textX = text.getX();
             float textY = text.getY();
             List<TextPosition> sameTextCharacters =
                     this.characterListMapping.computeIfAbsent(textCharacter, k -> new ArrayList<>());

             // The tolerance is one third of the average width per character of this text.
             // An empty Unicode string is treated as one character so that the division cannot
             // produce an infinite or NaN tolerance.
             float tolerance = (text.getWidth() / Math.max(1, textCharacter.length())) / 3.0f;
             for (TextPosition earlier : sameTextCharacters)
             {
                 if( earlier.getUnicode() != null &&
                         within( earlier.getX(), textX, tolerance ) &&
                         within( earlier.getY(), textY, tolerance ) )
                 {
                     // same text at (almost) the same position: drop the duplicate
                     return;
                 }
             }
             // remember the accepted text so that later repaints of it can be detected
             sameTextCharacters.add( text );
         }

         // Note: an earlier version built a throw-away local list here and tried to merge
         // diacritics with its "previous" entry; as the list was always empty that branch could
         // never run, so it has been removed without changing behaviour.
         if (!this.currentMarkedContents.isEmpty())
         {
             this.currentMarkedContents.peek().addText(text);
         }
     }

     /**
      * Returns the top-level marked-content sequences collected so far.
      *
      * @return the internal list itself (not a copy), so later processing is visible through it.
      */
     public List<PDMarkedContent> getMarkedContents()
     {
         return this.markedContents;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed with
	* this work for additional information regarding copyright ownership.
	* The ASF licenses this file to You under the Apache License, Version 2.0
	* (the "License"); you may not use this file except in compliance with
	* the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.pdfbox.text;

	import java.io.IOException;
	import java.util.ArrayDeque;
	import java.util.ArrayList;
	import java.util.HashMap;
	import java.util.List;
	import java.util.Map;
	import java.util.Deque;

	import org.apache.pdfbox.cos.COSDictionary;
	import org.apache.pdfbox.cos.COSName;
	import org.apache.pdfbox.pdmodel.documentinterchange.markedcontent.PDMarkedContent;
	import org.apache.pdfbox.pdmodel.graphics.PDXObject;
	import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequence;
	import org.apache.pdfbox.contentstream.operator.markedcontent.BeginMarkedContentSequenceWithProperties;
	import org.apache.pdfbox.contentstream.operator.markedcontent.DrawObject;
	import org.apache.pdfbox.contentstream.operator.markedcontent.EndMarkedContentSequence;

	/**
	 * A stream engine that extracts the marked content of a PDF.
	 *
	 * Marked-content sequences are collected as a tree: top-level sequences are available through
	 * {@link #getMarkedContents()}, and a sequence that begins while another one is still open is
	 * attached to that enclosing sequence. Text and XObjects are added to the innermost open
	 * sequence; when no sequence is open they are not added to any marked content.
	 *
	 * @author Johannes Koch
	 */
	public class PDFMarkedContentExtractor extends LegacyPDFStreamEngine
	{
	    /** Whether text painted again at (almost) the same position is dropped. */
	    private boolean suppressDuplicateOverlappingText = true;
	    /** Top-level marked-content sequences, in the order they were begun. */
	    private final List<PDMarkedContent> markedContents = new ArrayList<>();
	    /** Stack of the currently open marked-content sequences, innermost on top. */
	    private final Deque<PDMarkedContent> currentMarkedContents = new ArrayDeque<>();
	    /** Text positions already accepted, keyed by their Unicode string (used for duplicate detection). */
	    private final Map<String, List<TextPosition>> characterListMapping = new HashMap<>();

	    /**
	     * Instantiate a new PDFMarkedContentExtractor object.
	     *
	     * @throws IOException as declared by {@link #PDFMarkedContentExtractor(String)}.
	     */
	    public PDFMarkedContentExtractor() throws IOException
	    {
	        this(null);
	    }

	    /**
	     * Constructor. Registers the marked-content and draw-object operators with this engine.
	     *
	     * @param encoding Not used by this constructor; the parameter is kept so that existing
	     * callers continue to compile.
	     * @throws IOException part of the declared signature (NOTE: presumably raised by the
	     * superclass set-up; none of the statements visible here throws it directly).
	     */
	    public PDFMarkedContentExtractor(String encoding) throws IOException
	    {
	        addOperator(new BeginMarkedContentSequenceWithProperties());
	        addOperator(new BeginMarkedContentSequence());
	        addOperator(new EndMarkedContentSequence());
	        addOperator(new DrawObject());
	        // todo: MP - Marked Content Point
	        // todo: DP - Marked Content Point with Properties
	    }

	    /**
	     * @return the suppressDuplicateOverlappingText setting.
	     */
	    public boolean isSuppressDuplicateOverlappingText()
	    {
	        return suppressDuplicateOverlappingText;
	    }

	    /**
	     * By default the class will attempt to remove text that overlaps each other. Word paints the
	     * same character several times in order to make it look bold. By setting this to false all text
	     * will be extracted, which means that certain sections will be duplicated, but better
	     * performance will be noticed.
	     *
	     * @param suppressDuplicateOverlappingText The suppressDuplicateOverlappingText setting to set.
	     */
	    public void setSuppressDuplicateOverlappingText(boolean suppressDuplicateOverlappingText)
	    {
	        this.suppressDuplicateOverlappingText = suppressDuplicateOverlappingText;
	    }

	    /**
	     * This will determine if two floating point numbers are within a specified variance.
	     * Both bounds are exclusive.
	     *
	     * @param first The first number to compare to.
	     * @param second The second number to compare to.
	     * @param variance The allowed variance.
	     * @return true if {@code second} lies strictly between {@code first - variance} and
	     * {@code first + variance}.
	     */
	    private boolean within( float first, float second, float variance )
	    {
	        return second > first - variance && second < first + variance;
	    }

	    /**
	     * Opens a new marked-content sequence. The sequence becomes a top-level entry when no other
	     * sequence is open, otherwise it is attached as a child of the innermost open sequence.
	     *
	     * @param tag The tag of the marked-content sequence.
	     * @param properties The property list of the sequence, as passed on to
	     * {@link PDMarkedContent#create(COSName, COSDictionary)}.
	     */
	    @Override
	    public void beginMarkedContentSequence(COSName tag, COSDictionary properties)
	    {
	        PDMarkedContent markedContent = PDMarkedContent.create(tag, properties);
	        PDMarkedContent enclosing = this.currentMarkedContents.peek();
	        if (enclosing == null)
	        {
	            // peek() returns null only for an empty stack: this is a top-level sequence
	            this.markedContents.add(markedContent);
	        }
	        else
	        {
	            enclosing.addMarkedContent(markedContent);
	        }
	        this.currentMarkedContents.push(markedContent);
	    }

	    /**
	     * Closes the innermost open marked-content sequence. An end operator without a matching
	     * begin is ignored.
	     */
	    @Override
	    public void endMarkedContentSequence()
	    {
	        if (!this.currentMarkedContents.isEmpty())
	        {
	            this.currentMarkedContents.pop();
	        }
	    }

	    /**
	     * Adds an XObject to the innermost open marked-content sequence. Nothing happens when no
	     * sequence is open.
	     *
	     * @param xobject The XObject to add.
	     */
	    public void xobject(PDXObject xobject)
	    {
	        if (!this.currentMarkedContents.isEmpty())
	        {
	            this.currentMarkedContents.peek().addXObject(xobject);
	        }
	    }

	    /**
	     * This will process a TextPosition object and add the text to the innermost open
	     * marked-content sequence. When duplicate suppression is enabled, a text position that has
	     * the same Unicode string as an earlier one and lies within the tolerance of its position
	     * is dropped.
	     *
	     * @param text The text to process.
	     */
	    @Override
	    protected void processTextPosition( TextPosition text )
	    {
	        if( this.suppressDuplicateOverlappingText )
	        {
	            String textCharacter = text.getUnicode();
	            float textX = text.getX();
	            float textY = text.getY();
	            List<TextPosition> sameTextCharacters =
	                    this.characterListMapping.computeIfAbsent(textCharacter, k -> new ArrayList<>());

	            // The tolerance is one third of the average width per character of this text.
	            // An empty Unicode string is treated as one character so that the division cannot
	            // produce an infinite or NaN tolerance.
	            float tolerance = (text.getWidth() / Math.max(1, textCharacter.length())) / 3.0f;
	            for (TextPosition earlier : sameTextCharacters)
	            {
	                if( earlier.getUnicode() != null &&
	                        within( earlier.getX(), textX, tolerance ) &&
	                        within( earlier.getY(), textY, tolerance ) )
	                {
	                    // same text at (almost) the same position: drop the duplicate
	                    return;
	                }
	            }
	            // remember the accepted text so that later repaints of it can be detected
	            sameTextCharacters.add( text );
	        }

	        // Note: an earlier version built a throw-away local list here and tried to merge
	        // diacritics with its "previous" entry; as the list was always empty that branch could
	        // never run, so it has been removed without changing behaviour.
	        if (!this.currentMarkedContents.isEmpty())
	        {
	            this.currentMarkedContents.peek().addText(text);
	        }
	    }

	    /**
	     * Returns the top-level marked-content sequences collected so far.
	     *
	     * @return the internal list itself (not a copy), so later processing is visible through it.
	     */
	    public List<PDMarkedContent> getMarkedContents()
	    {
	        return this.markedContents;
	    }
	}