blob: bd338cac42ed754f25b99bdcb56e0c2d2b5a3d5b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pdfbox.text;
import java.text.Normalizer;
import java.util.Arrays;
import java.util.HashMap;
import java.util.Map;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pdfbox.pdmodel.font.PDFont;
import org.apache.pdfbox.util.Matrix;
/**
* This represents a string and a position on the screen of those characters.
*
* @author Ben Litchfield
*/
public final class TextPosition
{
private static final Log LOG = LogFactory.getLog(TextPosition.class);
private static final Map<Integer, String> DIACRITICS = createDiacritics();
// text matrix for the start of the text object, coordinates are in display units
// and have not been adjusted
private final Matrix textMatrix;
// ending X and Y coordinates in display units
private final float endX;
private final float endY;
private final float maxHeight; // maximum height of text, in display units
private final int rotation; // 0, 90, 180, 270 degrees of page rotation
private final float x;
private final float y;
private final float pageHeight;
private final float pageWidth;
private final float widthOfSpace; // width of a space, in display units
private final int[] charCodes; // internal PDF character codes
private final PDFont font;
private final float fontSize;
private final int fontSizePt;
// mutable
private float[] widths;
private String unicode;
private float direction = -1;
/**
* Constructor.
*
* @param pageRotation rotation of the page that the text is located in
* @param pageWidth width of the page that the text is located in
* @param pageHeight height of the page that the text is located in
* @param textMatrix text rendering matrix for start of text (in display units)
* @param endX x coordinate of the end position
* @param endY y coordinate of the end position
* @param maxHeight Maximum height of text (in display units)
* @param individualWidth The width of the given character/string. (in text units)
* @param spaceWidth The width of the space character. (in display units)
* @param unicode The string of Unicode characters to be displayed.
* @param charCodes An array of the internal PDF character codes for the glyphs in this text.
* @param font The current font for this text position.
* @param fontSize The new font size.
* @param fontSizeInPt The font size in pt units (see {@link #getFontSizeInPt()} for details).
*/
public TextPosition(int pageRotation, float pageWidth, float pageHeight, Matrix textMatrix,
float endX, float endY, float maxHeight, float individualWidth,
float spaceWidth, String unicode, int[] charCodes, PDFont font,
float fontSize, int fontSizeInPt)
{
this.textMatrix = textMatrix;
this.endX = endX;
this.endY = endY;
int rotationAngle = pageRotation;
this.rotation = rotationAngle;
this.maxHeight = maxHeight;
this.pageHeight = pageHeight;
this.pageWidth = pageWidth;
this.widths = new float[] { individualWidth };
this.widthOfSpace = spaceWidth;
this.unicode = unicode;
this.charCodes = charCodes;
this.font = font;
this.fontSize = fontSize;
this.fontSizePt = fontSizeInPt;
x = getXRot(rotationAngle);
if (rotationAngle == 0 || rotationAngle == 180)
{
y = this.pageHeight - getYLowerLeftRot(rotationAngle);
}
else
{
y = this.pageWidth - getYLowerLeftRot(rotationAngle);
}
}
// Adds non-decomposing diacritics to the hash with their related combining character.
// These are values that the unicode spec claims are equivalent but are not mapped in the form
// NFKC normalization method. Determined by going through the Combining Diacritical Marks
// section of the Unicode spec and identifying which characters are not mapped to by the
// normalization.
private static Map<Integer, String> createDiacritics()
{
Map<Integer, String> map = new HashMap<>(31);
map.put(0x0060, "\u0300");
map.put(0x02CB, "\u0300");
map.put(0x0027, "\u0301");
map.put(0x02B9, "\u0301");
map.put(0x02CA, "\u0301");
map.put(0x005e, "\u0302");
map.put(0x02C6, "\u0302");
map.put(0x007E, "\u0303");
map.put(0x02C9, "\u0304");
map.put(0x00B0, "\u030A");
map.put(0x02BA, "\u030B");
map.put(0x02C7, "\u030C");
map.put(0x02C8, "\u030D");
map.put(0x0022, "\u030E");
map.put(0x02BB, "\u0312");
map.put(0x02BC, "\u0313");
map.put(0x0486, "\u0313");
map.put(0x055A, "\u0313");
map.put(0x02BD, "\u0314");
map.put(0x0485, "\u0314");
map.put(0x0559, "\u0314");
map.put(0x02D4, "\u031D");
map.put(0x02D5, "\u031E");
map.put(0x02D6, "\u031F");
map.put(0x02D7, "\u0320");
map.put(0x02B2, "\u0321");
map.put(0x02CC, "\u0329");
map.put(0x02B7, "\u032B");
map.put(0x02CD, "\u0331");
map.put(0x005F, "\u0332");
map.put(0x204E, "\u0359");
return map;
}
/**
* Return the string of characters stored in this object. The length can be different than the
* CharacterCodes length e.g. if ligatures are used ("fi", "fl", "ffl") where one glyph
* represents several unicode characters.
*
* @return The string on the screen.
*/
public String getUnicode()
{
return unicode;
}
/**
* Return the internal PDF character codes of the glyphs in this text.
*
* @return an array of internal PDF character codes
*/
public int[] getCharacterCodes()
{
return charCodes;
}
/**
* The matrix containing the starting text position and scaling. Despite the name, it is not the
* text matrix set by the "Tm" operator, it is really the effective text rendering matrix (which
* is dependent on the current transformation matrix (set by the "cm" operator), the text matrix
* (set by the "Tm" operator), the font size (set by the "Tf" operator) and the page cropbox).
*
* @return The Matrix containing the starting text position
*/
public Matrix getTextMatrix()
{
return textMatrix;
}
/**
* Return the direction/orientation of the string in this object based on its text matrix. Only
* angles of 0, 90, 180, or 270 are supported. To get other angles, use this code:
* <pre>
* TextPosition text = ...
* Matrix m = text.getTextMatrix().clone();
* m.concatenate(text.getFont().getFontMatrix());
* int angle = (int) Math.round(Math.toDegrees(Math.atan2(m.getShearY(), m.getScaleY())));
* </pre>
*
* @return The direction of the text (0, 90, 180, or 270).
*/
public float getDir()
{
if (direction < 0)
{
float a = textMatrix.getScaleY();
float b = textMatrix.getShearY();
float c = textMatrix.getShearX();
float d = textMatrix.getScaleX();
// 12 0 left to right
// 0 12
if (a > 0 && Math.abs(b) < d && Math.abs(c) < a && d > 0)
{
direction = 0;
}
// -12 0 right to left (upside down)
// 0 -12
else if (a < 0 && Math.abs(b) < Math.abs(d) && Math.abs(c) < Math.abs(a) && d < 0)
{
direction = 180;
}
// 0 12 up
// -12 0
else if (Math.abs(a) < Math.abs(c) && b > 0 && c < 0 && Math.abs(d) < b)
{
direction = 90;
}
// 0 -12 down
// 12 0
else if (Math.abs(a) < c && b < 0 && c > 0 && Math.abs(d) < Math.abs(b))
{
direction = 270;
}
else
{
direction = 0;
}
}
return direction;
}
/**
* Return the X starting coordinate of the text, adjusted by the given rotation amount.
* The rotation adjusts where the 0,0 location is relative to the text.
*
* @param rotation Rotation to apply (0, 90, 180, or 270). 0 will perform no adjustments.
* @return X coordinate
*/
private float getXRot(float rotation)
{
if (Float.compare(rotation, 0) == 0)
{
return textMatrix.getTranslateX();
}
else if (Float.compare(rotation, 90) == 0)
{
return textMatrix.getTranslateY();
}
else if (Float.compare(rotation, 180) == 0)
{
return pageWidth - textMatrix.getTranslateX();
}
else if (Float.compare(rotation, 270) == 0)
{
return pageHeight - textMatrix.getTranslateY();
}
return 0;
}
/**
* This will get the page rotation adjusted x position of the character.
* This is adjusted based on page rotation so that the upper left is 0,0 which is
* unlike PDF coordinates, which start at the bottom left. See also
* <a href="https://stackoverflow.com/questions/57067372/">this answer by Michael Klink</a> for
* further details and
* <a href="https://issues.apache.org/jira/browse/PDFBOX-4597">PDFBOX-4597</a> for a sample
* file.
*
* @return The x coordinate of the character.
*/
public float getX()
{
return x;
}
/**
* This will get the text direction adjusted x position of the character.
* This is adjusted based on text direction so that the first character
* in that direction is in the upper left at 0,0.
* This method ignores the page rotation but takes the text rotation (see
* {@link #getDir() getDir()}) and adjusts the coordinates to awt. This is useful when doing
* text extraction, to compare the glyph positions when imagining these to be horizontal. See also
* <a href="https://stackoverflow.com/questions/57067372/">this answer by Michael Klink</a> for
* further details and
* <a href="https://issues.apache.org/jira/browse/PDFBOX-4597">PDFBOX-4597</a> for a sample
* file.
*
* @return The x coordinate of the text.
*/
public float getXDirAdj()
{
return getXRot(getDir());
}
/**
* This will get the y position of the character with 0,0 in lower left.
* This will be adjusted by the given rotation.
*
* @param rotation Rotation to apply to text to adjust the 0,0 location (0,90,180,270)
* @return The y coordinate of the text
*/
private float getYLowerLeftRot(float rotation)
{
if (Float.compare(rotation, 0) == 0)
{
return textMatrix.getTranslateY();
}
else if (Float.compare(rotation, 90) == 0)
{
return pageWidth - textMatrix.getTranslateX();
}
else if (Float.compare(rotation, 180) == 0)
{
return pageHeight - textMatrix.getTranslateY();
}
else if (Float.compare(rotation, 270) == 0)
{
return textMatrix.getTranslateX();
}
return 0;
}
/**
* This will get the page rotation adjusted x position of the character.
* This is adjusted based on page rotation so that the upper left is 0,0 which is
* unlike PDF coordinates, which start at the bottom left. See also
* <a href="https://stackoverflow.com/questions/57067372/">this answer by Michael Klink</a> for
* further details and
* <a href="https://issues.apache.org/jira/browse/PDFBOX-4597">PDFBOX-4597</a> for a sample
* file.
*
* @return The adjusted y coordinate of the character.
*/
public float getY()
{
return y;
}
/**
* This will get the y position of the text, adjusted so that 0,0 is upper left and it is
* adjusted based on the text direction.
* This method ignores the page rotation but takes the
* text rotation and adjusts the coordinates to awt. This is useful when doing text extraction,
* to compare the glyph positions when imagining these to be horizontal. See also
* <a href="https://stackoverflow.com/questions/57067372/">this answer by Michael Klink</a> for
* further details and
* <a href="https://issues.apache.org/jira/browse/PDFBOX-4597">PDFBOX-4597</a> for a sample
* file.
*
* @return The adjusted y coordinate of the character.
*/
public float getYDirAdj()
{
float dir = getDir();
// some PDFBox code assumes that the 0,0 point is in upper left, not lower left
if (Float.compare(dir, 0) == 0 || Float.compare(dir, 180) == 0)
{
return pageHeight - getYLowerLeftRot(dir);
}
else
{
return pageWidth - getYLowerLeftRot(dir);
}
}
/**
* Get the length or width of the text, based on a given rotation.
*
* @param rotation Rotation that was used to determine coordinates (0,90,180,270)
* @return Width of text in display units
*/
private float getWidthRot(float rotation)
{
if (Float.compare(rotation, 90) == 0 || Float.compare(rotation, 270) == 0)
{
return Math.abs(endY - textMatrix.getTranslateY());
}
else
{
return Math.abs(endX - textMatrix.getTranslateX());
}
}
/**
* This will get the width of the string when page rotation adjusted coordinates are used.
*
* @return The width of the text in display units.
*/
public float getWidth()
{
return getWidthRot(rotation);
}
/**
* This will get the width of the string when text direction adjusted coordinates are used.
*
* @return The width of the text in display units.
*/
public float getWidthDirAdj()
{
return getWidthRot(getDir());
}
/**
* This will get the maximum height of all characters in this string.
*
* @return The maximum height of all characters in this string.
*/
public float getHeight()
{
return maxHeight;
}
/**
* This will get the maximum height of all characters in this string.
*
* @return The maximum height of all characters in this string.
*/
public float getHeightDir()
{
// this is not really a rotation-dependent calculation, but this is defined for symmetry
return maxHeight;
}
/**
* This will get the font size that has been set with the "Tf" operator (Set text font and
* size). When the text is rendered, it may appear bigger or smaller depending on the current
* transformation matrix (set by the "cm" operator) and the text matrix (set by the "Tm"
* operator).
*
* @return The font size.
*/
public float getFontSize()
{
return fontSize;
}
/**
* This will get the font size in pt. To get this size we have to multiply the font size from
* {@link #getFontSize() getFontSize()} with the text matrix (set by the "Tm" operator)
* horizontal scaling factor and truncate the result to integer. The actual rendering may appear
* bigger or smaller depending on the current transformation matrix (set by the "cm" operator).
* To get the size in rendering, use {@link #getXScale() getXScale()}.
*
* @return The font size in pt.
*/
public float getFontSizeInPt()
{
return fontSizePt;
}
/**
* This will get the font for the text being drawn.
*
* @return The font size.
*/
public PDFont getFont()
{
return font;
}
/**
* This will get the width of a space character. This is useful for some algorithms such as the
* text stripper, that need to know the width of a space character.
*
* @return The width of a space character.
*/
public float getWidthOfSpace()
{
return widthOfSpace;
}
/**
* This will get the X scaling factor. This is dependent on the current transformation matrix
* (set by the "cm" operator), the text matrix (set by the "Tm" operator) and the font size (set
* by the "Tf" operator).
*
* @return The X scaling factor.
*/
public float getXScale()
{
return textMatrix.getScalingFactorX();
}
/**
* This will get the Y scaling factor. This is dependent on the current transformation matrix
* (set by the "cm" operator), the text matrix (set by the "Tm" operator) and the font size (set
* by the "Tf" operator).
*
* @return The Y scaling factor.
*/
public float getYScale()
{
return textMatrix.getScalingFactorY();
}
/**
* Get the widths of each individual character.
*
* @return An array that has the same length as the CharacterCodes array.
*/
public float[] getIndividualWidths()
{
return widths;
}
/**
* Determine if this TextPosition logically contains another (i.e. they overlap and should be
* rendered on top of each other).
*
* @param tp2 The other TestPosition to compare against
* @return True if tp2 is contained in the bounding box of this text.
*/
public boolean contains(TextPosition tp2)
{
double thisXstart = getXDirAdj();
double thisWidth = getWidthDirAdj();
double thisXend = thisXstart + thisWidth;
double tp2Xstart = tp2.getXDirAdj();
double tp2Xend = tp2Xstart + tp2.getWidthDirAdj();
// no X overlap at all so return as soon as possible
if (tp2Xend <= thisXstart || tp2Xstart >= thisXend)
{
return false;
}
// no Y overlap at all so return as soon as possible. Note: 0.0 is in the upper left and
// y-coordinate is top of TextPosition
double thisYstart = getYDirAdj();
double tp2Ystart = tp2.getYDirAdj();
if (tp2Ystart + tp2.getHeightDir() < thisYstart ||
tp2Ystart > thisYstart + getHeightDir())
{
return false;
}
// we're going to calculate the percentage of overlap, if its less than a 15% x-coordinate
// overlap then we'll return false because its negligible, .15 was determined by trial and
// error in the regression test files
else if (tp2Xstart > thisXstart && tp2Xend > thisXend)
{
double overlap = thisXend - tp2Xstart;
double overlapPercent = overlap/thisWidth;
return overlapPercent > .15;
}
else if (tp2Xstart < thisXstart && tp2Xend < thisXend)
{
double overlap = tp2Xend - thisXstart;
double overlapPercent = overlap/thisWidth;
return overlapPercent > .15;
}
return true;
}
/**
* Merge a single character TextPosition into the current object. This is to be used only for
* cases where we have a diacritic that overlaps an existing TextPosition. In a graphical
* display, we could overlay them, but for text extraction we need to merge them. Use the
* contains() method to test if two objects overlap.
*
* @param diacritic TextPosition to merge into the current TextPosition.
*/
public void mergeDiacritic(TextPosition diacritic)
{
if (diacritic.getUnicode().length() > 1)
{
return;
}
float diacXStart = diacritic.getXDirAdj();
float diacXEnd = diacXStart + diacritic.widths[0];
float currCharXStart = getXDirAdj();
int strLen = unicode.length();
boolean wasAdded = false;
for (int i = 0; i < strLen && !wasAdded; i++)
{
if (i >= widths.length)
{
LOG.info("diacritic " + diacritic.getUnicode() + " on ligature " + unicode +
" is not supported yet and is ignored (PDFBOX-2831)");
break;
}
float currCharXEnd = currCharXStart + widths[i];
// this is the case where there is an overlap of the diacritic character with the
// current character and the previous character. If no previous character, just append
// the diacritic after the current one
if (diacXStart < currCharXStart && diacXEnd <= currCharXEnd)
{
if (i == 0)
{
insertDiacritic(i, diacritic);
}
else
{
float distanceOverlapping1 = diacXEnd - currCharXStart;
float percentage1 = distanceOverlapping1/widths[i];
float distanceOverlapping2 = currCharXStart - diacXStart;
float percentage2 = distanceOverlapping2/widths[i - 1];
if (percentage1 >= percentage2)
{
insertDiacritic(i, diacritic);
}
else
{
insertDiacritic(i - 1, diacritic);
}
}
wasAdded = true;
}
// diacritic completely covers this character and therefore we assume that this is the
// character the diacritic belongs to
else if (diacXStart < currCharXStart)
{
insertDiacritic(i, diacritic);
wasAdded = true;
}
// otherwise, The diacritic modifies this character because its completely
// contained by the character width
else if (diacXEnd <= currCharXEnd)
{
insertDiacritic(i, diacritic);
wasAdded = true;
}
// last character in the TextPosition so we add diacritic to the end
else if (i == strLen - 1)
{
insertDiacritic(i, diacritic);
wasAdded = true;
}
// couldn't find anything useful so we go to the next character in the TextPosition
currCharXStart += widths[i];
}
}
/**
* Inserts the diacritic TextPosition to the str of this TextPosition and updates the widths
* array to include the extra character width.
*
* @param i current character
* @param diacritic The diacritic TextPosition
*/
private void insertDiacritic(int i, TextPosition diacritic)
{
StringBuilder sb = new StringBuilder();
sb.append(unicode.substring(0, i));
float[] widths2 = new float[widths.length + 1];
System.arraycopy(widths, 0, widths2, 0, i);
// Unicode combining diacritics always go after the base character, regardless of whether
// the string is in presentation order or logical order
sb.append(unicode.charAt(i));
widths2[i] = widths[i];
sb.append(combineDiacritic(diacritic.getUnicode()));
widths2[i + 1] = 0;
// get the rest of the string
sb.append(unicode.substring(i + 1, unicode.length()));
System.arraycopy(widths, i + 1, widths2, i + 2, widths.length - i - 1);
unicode = sb.toString();
widths = widths2;
}
/**
* Combine the diacritic, for example, convert non-combining diacritic characters to their
* combining counterparts.
*
* @param str String to normalize
* @return Normalized string
*/
private String combineDiacritic(String str)
{
// Unicode contains special combining forms of the diacritic characters which we want to use
int codePoint = str.codePointAt(0);
// convert the characters not defined in the Unicode spec
if (DIACRITICS.containsKey(codePoint))
{
return DIACRITICS.get(codePoint);
}
else
{
return Normalizer.normalize(str, Normalizer.Form.NFKC).trim();
}
}
/**
* @return True if the current character is a diacritic char.
*/
public boolean isDiacritic()
{
String text = this.getUnicode();
if (text.length() != 1)
{
return false;
}
if ("ー".equals(text))
{
// PDFBOX-3833: ー is not a real diacritic like ¨ or ˆ, it just changes the
// pronunciation of the previous sound, and is printed after the previous glyph
// http://www.japanesewithanime.com/2017/04/prolonged-sound-mark.html
// Ignoring it as diacritic avoids trouble if it slightly overlaps with the next glyph.
return false;
}
int type = Character.getType(text.charAt(0));
return type == Character.NON_SPACING_MARK ||
type == Character.MODIFIER_SYMBOL ||
type == Character.MODIFIER_LETTER;
}
/**
* Show the string data for this text position.
*
* @return A human readable form of this object.
*/
@Override
public String toString()
{
return getUnicode();
}
/**
* This will get the x coordinate of the end position. This is the unadjusted value passed into
* the constructor.
*
* @return The unadjusted x coordinate of the end position
*/
public float getEndX()
{
return endX;
}
/**
* This will get the y coordinate of the end position. This is the unadjusted value passed into
* the constructor.
*
* @return The unadjusted y coordinate of the end position
*/
public float getEndY()
{
return endY;
}
/**
* This will get the rotation of the page that the text is located in. This is the unadjusted
* value passed into the constructor.
*
* @return The unadjusted rotation of the page that the text is located in
*/
public int getRotation()
{
return rotation;
}
/**
* This will get the height of the page that the text is located in. This is the unadjusted
* value passed into the constructor.
*
* @return The unadjusted height of the page that the text is located in
*/
public float getPageHeight()
{
return pageHeight;
}
/**
* This will get the width of the page that the text is located in. This is the unadjusted value
* passed into the constructor.
*
* @return The unadjusted width of the page that the text is located in
*/
public float getPageWidth()
{
return pageWidth;
}
@Override
public boolean equals(Object o)
{
if (this == o)
{
return true;
}
if (!(o instanceof TextPosition))
{
return false;
}
TextPosition that = (TextPosition) o;
if (Float.compare(that.endX, endX) != 0)
{
return false;
}
if (Float.compare(that.endY, endY) != 0)
{
return false;
}
if (Float.compare(that.maxHeight, maxHeight) != 0)
{
return false;
}
if (rotation != that.rotation)
{
return false;
}
if (Float.compare(that.x, x) != 0)
{
return false;
}
if (Float.compare(that.y, y) != 0)
{
return false;
}
if (Float.compare(that.pageHeight, pageHeight) != 0)
{
return false;
}
if (Float.compare(that.pageWidth, pageWidth) != 0)
{
return false;
}
if (Float.compare(that.widthOfSpace, widthOfSpace) != 0)
{
return false;
}
if (Float.compare(that.fontSize, fontSize) != 0)
{
return false;
}
if (fontSizePt != that.fontSizePt)
{
return false;
}
if (textMatrix != null ? !textMatrix.equals(that.textMatrix) : that.textMatrix != null)
{
return false;
}
if (!Arrays.equals(charCodes, that.charCodes))
{
return false;
}
return font != null ? font.equals(that.font) : that.font == null;
// If changing this method, do not compare mutable fields (PDFBOX-4701)
}
@Override
public int hashCode()
{
int result = textMatrix != null ? textMatrix.hashCode() : 0;
result = 31 * result + Float.floatToIntBits(endX);
result = 31 * result + Float.floatToIntBits(endY);
result = 31 * result + Float.floatToIntBits(maxHeight);
result = 31 * result + rotation;
result = 31 * result + Float.floatToIntBits(x);
result = 31 * result + Float.floatToIntBits(y);
result = 31 * result + Float.floatToIntBits(pageHeight);
result = 31 * result + Float.floatToIntBits(pageWidth);
result = 31 * result + Float.floatToIntBits(widthOfSpace);
result = 31 * result + Arrays.hashCode(charCodes);
result = 31 * result + (font != null ? font.hashCode() : 0);
result = 31 * result + Float.floatToIntBits(fontSize);
result = 31 * result + fontSizePt;
return result;
}
}