blob: e5aa66400453f6cd67a84e75fa9338d8e110c9f7 [file] [log] [blame]
<!DOCTYPE html>
<html lang="en">
<!--
Licensed to the Apache Software Foundation (ASF) under one or more
contributor license agreements. See the NOTICE file distributed with
this work for additional information regarding copyright ownership.
The ASF licenses this file to You under the Apache License, Version 2.0
(the "License"); you may not use this file except in compliance with
the License. You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License.
-->
<head>
<meta charset="utf-8">
<meta http-equiv="X-UA-Compatible" content="IE=edge">
<meta name="viewport" content="width=device-width, initial-scale=1">
<meta name="description" content="The Apache PDFBox™ library is an open source Java tool for working with PDF documents. This project allows creation of new PDF documents, manipulation of existing documents and the ability to extract content from documents. Apache PDFBox also includes several command-line utilities. Apache PDFBox is published under the Apache License v2.0.">
<title>Apache PDFBox | Ideas</title>
<link href="/bootstrap/css/bootstrap.min.css" rel="stylesheet">
<link href="/css/prism.css" rel="stylesheet">
<link href="/css/styles.css" rel="stylesheet">
</head>
<body>
<header class="main-header">
<div class="main-header-logo">
<a href="/" aria-label="Navigation to the PDFBox home page">
<svg focusable="false" class="pdfbox-brand-toolbox" viewBox="0 0 744.09448819 1052.3622047" xmlns="http://www.w3.org/2000/svg" aria-labelledby="brandImageTitle brandImageDesc" role="img">
<title id="brandImageTitle">PDFBox Brand Logo</title>
<desc id="brandImageDesc">The PDFBox logo showing a toolbox.</desc>
<g transform="matrix(1.25 0 0 -1.25 -317.14 1018.08)" clip-path="url(#clipPath3375)">
<path d="M821.924 376.535L463.24 122.525l-203.83 76.86c23.89 6.02 46.87 15.197 68.335 27.29 60.063 33.835 105.686 88.46 128.282 153.59 2.634 4.66 8.11 6.92 13.265 5.47 4.667-1.31 8.01-5.41 8.353-10.247l-3.572-188.12 334.99 193.957c2.41 1 5.113 1.028 7.54.075 2.315-.907 4.21-2.64 5.32-4.865zm-1.307 97.91l.13-78.324c-.87-2.72-3.342-4.61-6.197-4.75-3.104-.14-5.924 1.8-6.893 4.75l1.29 79.54c.386 2.92 2.893 5.09 5.835 5.04 3.37-.04 6.022-2.89 5.835-6.25z"/>
<path d="M751.88 651.666c-.237.002-.48-.022-.723-.077l-363.512-25.15c-10.37-.73-19.583-6.76-24.588-15.87-6.943-12.64-4.677-28.26 5.864-37.72 2.85-2.56 6.22-4.49 8.97-7.16 2.19-2.12 3.93-4.65 5.12-7.45 23.7-26.86 30.3-64.76 17.08-98.05-4.75-11.96-11.97-22.77-21.2-31.73l-73.74 16.1 46.69 115.89c2.01 3.07.64 7.21-2.8 8.49-2.8 1.03-5.91-.34-7.03-3.1L286.49 448.8l-11.79 4.32c-2.097.28-4.21-.39-5.76-1.825-1.16-1.077-1.925-2.516-2.16-4.083l-11.53-189.68c1.66-1.83 4.03-2.85 6.5-2.798 2.533.05 4.913 1.22 6.5 3.197 6.22 22.46 11.216 45.23 14.96 68.22 5.137 31.55 7.91 63.43 8.3 95.39l136.152-15.28c2.396-1.09 5.032-1.56 7.66-1.35 2.75.22 5.396 1.16 7.663 2.73 41.62 19.22 83.375 38.15 125.26 56.79 41.615 18.52 83.36 36.75 125.23 54.68 1.935-.16 3.703-1.16 4.843-2.73 1.266-1.74 1.618-3.99.948-6.04-3.8-3.45-7.333-7.19-10.575-11.17-3.5-4.3-6.65-8.88-9.42-13.69.11-.34.22-.68.35-1.01.32-.85.7-1.68 1.25-2.41 2.42-3.16 7.04-3.55 9.96-.84 8.66 9.71 19.21 17.557 31.01 23.05 10.67 4.97 22.17 7.927 33.92 8.717 2.58 1.626 4.23 4.39 4.43 7.437.14 2.06-.41 4.103-1.56 5.815-3.63-.24-7.27.26-10.71 1.47-3.55 1.25-6.81 3.23-9.55 5.815l19.55 78.14c-1.4 1.684-3.13 3.07-5.08 4.06-2.28 1.16-4.81 1.76-7.37 1.746-53.05-4.71-106.21-8.08-159.433-10.11-53.34-2.035-106.73-2.72-160.1-2.056-3.79-.575-6.82 3.092-5.54 6.7.63 1.784 2.31 2.98 4.2 2.996l321.783 26.06c2.094.873 3.99 1.933 5.74 3.18 1.46 1.036 2.987 2.365 2.947 4.34-.034 1.75-1.484 3.09-3.154 3.11zm-23.61-56.02c.952-.32 1.776-.934 2.354-1.755.472-.67.76-1.45.833-2.27l-12.96-35.37c-2.706-3.39-5.87-6.4-9.404-8.92-3.324-2.37-6.945-4.3-10.77-5.73l-260.77-58.65-21.01 80.83 311.727 31.88z"/>
<path d="M786.68 627.94c.393-18.97 2.614-37.734 6.564-56.05 3.873-17.958 9.463-35.75 19.087-51.71 3.54-5.854 7.58-11.385 12.08-16.53 1.07-1.608 1.71-3.462 1.86-5.386.14-1.842-.16-3.69-.9-5.385-54.19-15.56-108.32-31.34-162.39-47.35-54.2-16.05-108.33-32.33-162.39-48.85-.23 1.43.04 2.9.78 4.15 1.07 1.81 2.96 2.99 5.06 3.13l271.54 104.43c-4.43 19.56-7.17 39.46-8.19 59.49-.99 19.68-.32 39.4 2.02 58.97.81 4.06 4.63 6.79 8.74 6.24 2.88-.39 5.27-2.4 6.15-5.16z"/>
</g>
</svg>
<svg focusable="false" class="pdfbox-brand-text" xmlns="http://www.w3.org/2000/svg" style="isolation:isolate" viewBox="0 0 109.81066666651577 30.943999999957384" aria-labelledby="brandTextTitle brandTextDesc" role="img">
<title id="brandTextTitle">PDFBox brand text</title>
<desc id="brandTextDesc">PDFBox, the brand text.</desc>
<path d="M0 .31h6.528q1.792 0 3.157.47 1.366.468 2.475 1.663 1.11 1.194 1.536 2.816.427 1.57.427 4.3 0 2-.256 3.45-.214 1.41-.982 2.64-.896 1.492-2.39 2.345-1.492.81-3.924.81H4.36v11.87H0V.305zm4.352 14.42h2.09q1.323 0 2.05-.383.724-.384 1.065-1.024.342-.683.384-1.622.09-.93.09-2.09 0-1.06-.08-2-.04-.98-.38-1.66-.3-.72-.98-1.11-.68-.43-1.96-.43H4.36v10.32z" fill-rule="evenodd"/>
<path d="M17.333.31h6.443q3.712 0 5.675 2.09 1.97 2.048 1.97 5.76v14.208q0 4.267-2.09 6.315-2.05 2.005-5.93 2.005h-6.06V.308zm4.352 26.282h2.006q1.84 0 2.61-.896.77-.94.77-2.9V8.16q0-1.792-.72-2.773-.72-.982-2.64-.982H21.7v22.187z" fill-rule="evenodd"/>
<path d="M35.583.31h12.97v4.095h-8.618v9.216h7.51v4.1h-7.51v12.97h-4.352V.31z"/>
<path d="M51.417.31h6.357q2.09 0 3.54.64 1.495.64 2.433 1.706.94 1.067 1.323 2.475.427 1.37.427 2.86V9.1q0 1.236-.214 2.09-.17.853-.554 1.493-.39.64-.94 1.152-.56.47-1.28.896 1.53.73 2.26 2.18.72 1.41.72 3.8v1.71q0 4.01-1.97 6.15-1.92 2.13-6.19 2.13H51.4V.31zm4.352 26.026h1.87q1.32 0 2.05-.384.77-.384 1.15-1.067.38-.682.47-1.62.08-.94.08-2.05 0-1.15-.13-2.004-.13-.85-.56-1.4-.386-.6-1.11-.89-.727-.3-1.92-.3h-1.92v9.73zm0-13.568h1.96q2.17 0 2.9-1.067.77-1.1.77-3.2 0-2.04-.86-3.07-.81-1.02-2.99-1.02h-1.79v8.37z" fill-rule="evenodd"/>
<path d="M69.027 16.31q0-1.323.17-2.433.17-1.11.64-1.962.768-1.408 2.22-2.262 1.45-.853 3.455-.853t3.456.853q1.45.854 2.22 2.262.468.853.64 1.962.17 1.11.17 2.432v7.12q0 1.32-.17 2.43-.172 1.11-.64 1.96-.77 1.4-2.22 2.26-1.45.85-3.456.85-2.005 0-3.456-.86-1.45-.854-2.22-2.26-.468-.855-.64-1.964-.17-1.11-.17-2.43V16.3zm4.352 7.807q0 1.238.55 1.878.6.597 1.58.597.98 0 1.53-.597.59-.64.59-1.878v-8.49q0-1.238-.6-1.835-.557-.64-1.538-.64-.98 0-1.58.64-.553.597-.553 1.835v8.49z" fill-rule="evenodd"/>
<path d="M88.316 19.637L83.24 9.057h4.607l2.688 6.143 2.688-6.144h4.608l-5.16 10.58 5.42 11.052h-4.61l-2.94-6.613-2.94 6.613h-4.61l5.34-11.05z"/>
<path d="M102.883 5.28h1.2q.784 0 1.168-.224.4-.24.4-.784 0-.464-.35-.672-.33-.224-.88-.224h-1.53V5.28zm-1.056-2.864h2.56q2.32 0 2.32 1.904 0 .48-.144.816-.128.336-.368.56-.24.224-.56.352-.304.112-.656.16l1.93 2.96h-1.28L103.7 6.24h-.817v2.928h-1.056V2.416zm6.832 3.376q0-.976-.37-1.84-.37-.864-.99-1.504-.63-.64-1.48-1.008-.85-.384-1.81-.384t-1.81.384q-.85.368-1.47 1.008t-1 1.504-.37 1.84q0 .976.364 1.84.37.864.992 1.504t1.47 1.024q.85.368 1.81.368.96 0 1.805-.368.85-.384 1.47-1.024.625-.64.99-1.504.37-.864.37-1.84zm-10.44 0q0-1.2.45-2.256.46-1.056 1.25-1.84t1.84-1.232Q102.82 0 104.02 0t2.255.464q1.056.448 1.84 1.232t1.232 1.84q.464 1.056.464 2.256 0 1.2-.46 2.256-.45 1.056-1.23 1.84t-1.84 1.248q-1.05.448-2.25.448t-2.25-.448q-1.053-.464-1.84-1.248t-1.25-1.84q-.45-1.056-.45-2.256z" fill-rule="evenodd"/>
</svg>
</a>
</div>
<nav class="wrapper">
<input type="checkbox" id="menu-toggle">
<label for="menu-toggle" class="label-toggle"></label>
<ul>
<li><a href="/blog">Blog</a></li>
</ul>
</nav>
</header>
<div class="container">
<div class="row row-offcanvas row-offcanvas-left">
<div class="col-xs-6 col-sm-3 sidebar-offcanvas" id="sidebar">
<ul class="sidebar">
<li class="sidebar-header">Apache PDFBox</li>
<li><a href="/index.html">Overview</a></li>
<li><a href="https://www.apache.org/licenses">License</a></li>
<li><a href="/download.html">Download</a></li>
<li class="sidebar-header">Community</li>
<li><a href="/support.html">Support</a></li>
<li><a href="/mailinglists.html">Mailing Lists</a></li>
<li><a href="https://issues.apache.org/jira/browse/PDFBOX">Issue Tracker</a></li>
<li><a href="/team.html">Project Team</a></li>
<li class="sidebar-header">Documentation</li>
<li class="sidebar-node" id="v4-0">
<a href="#">4.0 (not released)</a>
<ul>
<li><a class="sidebar-node" id="v4-0-migration" href="/4.0/migration.html">Migration Guide</a></li>
</ul>
</li>
<li class="sidebar-node" id="v3-0">
<a href="#">3.0</a>
<ul>
<li><a class="sidebar-node" id="v3-0-migration" href="/3.0/migration.html">Migration Guide</a></li>
<li><a href="/3.0/getting-started.html">Getting Started</a></li>
<li><a href="/3.0/dependencies.html">Dependencies</a></li>
<li><a href="/3.0/commandline.html">Command-Line Tools</a></li>
<li><a href="/3.0/faq.html">FAQ</a></li>
<li><a href="https://javadoc.io/doc/org.apache.pdfbox/pdfbox/3.0.2/index.html">API Docs&emsp;<small>via javadoc.io</small></a></li>
</ul>
</li>
<li class="sidebar-node" id="v2-0">
<a href="#">2.0</a>
<ul>
<li><a href="/2.0/migration.html">Migration Guide</a></li>
<li><a href="/2.0/getting-started.html">Getting Started</a></li>
<li><a href="/2.0/examples.html">Examples</a></li>
<li><a href="/2.0/dependencies.html">Dependencies</a></li>
<li class="sidebar-node" id="v2-0-cookbook">
<a href="#">Cookbook</a>
<ul>
<li><a href="/2.0/cookbook/encryption.html">Document Encryption</a></li>
</ul>
</li>
<li><a href="/2.0/commandline.html">Command-Line Tools</a></li>
<li><a href="/2.0/faq.html">FAQ</a></li>
<li><a href="https://javadoc.io/doc/org.apache.pdfbox/pdfbox/2.0.31/index.html">API Docs&emsp;<small>via javadoc.io</small></a></li>
</ul>
</li>
<li class="sidebar-header">Development</li>
<li><a href="/building.html">Building from Source</a></li>
<li><a href="/codingconventions.html">Coding Conventions</a></li>
<li><a href="/siteupdate.html">Update the Website</a></li>
<li><a href="https://ci-builds.apache.org/job/PDFBox/">Jenkins</a></li>
<li><a href="https://sonarcloud.io/dashboard?id=pdfbox-reactor">SonarCloud</a></li>
<li><a href="https://issues.apache.org/jira/browse/PDFBOX">JIRA</a></li>
<li><a href="/ideas.html">Ideas</a></li>
<li><a href="/references.html">External Links</a></li>
<li class="sidebar-header">Apache Software Foundation</li>
<li><a href="https://www.apache.org/foundation/">About</a></li>
<li><a href="/security.html">Security</a></li>
<li><a href="https://www.apache.org/foundation/sponsorship.html">Sponsorship</a></li>
<li><a href="https://www.apache.org/foundation/thanks.html">Thanks</a></li>
</ul>
<!-- donation page logo/link -->
<a href="https://www.apache.org/foundation/contributing.html"><img width="135" src="/images/SupportApache.jpg" alt="Support Apache"></a>
<!-- ASF events page logo/link -->
<a href="https://www.apache.org/events/current-event.html"><img src="https://www.apache.org/events/current-event-125x125.png" alt="Apache events"></a>
</div>
<div class="col-xs-12 col-sm-9">
<h1 id="ideas" tabindex="-1">Ideas</h1>
<p>There are several ideas to enhance PDFBox. These are outlined below together with
comments and the releases they are planned for as soon as there is agreement to do the
implementation.</p>
<h2 id="enhance-type-safety" tabindex="-1">Enhance type safety</h2>
<p>Enhance the type safety of PDFBox and add more generic collections and code cleanup.</p>
<h2 id="remove-all-deprecated-methods" tabindex="-1">Remove all deprecated methods</h2>
<p>This is an ongoing effort and most/all deprecated methods will be removed in PDFBox 2.0.0.</p>
<h2 id="handle-large-pdf-files" tabindex="-1">Handle large PDF files</h2>
<p>In addition to the PDF parsing, PDFBox does not always handle large PDF files well, as some
of the references are implemented as int instead of long.</p>
<h2 id="switch-to-java-1.6" tabindex="-1"><span class="complete">Switch to Java 1.6</span></h2>
<p><span class="complete">PDFBox 2.0.0 has Java 6 as a minimum requirement.</span></p>
<h2 id="break-pdfbox-into-modules" tabindex="-1"><span class="complete">Break PDFBox into modules</span></h2>
<p><span class="complete">In order to support different use cases and provide a minimal toolset PDFBox 2.0.0 should be
separated into different modules. This goes in line with rearranging some of the code,
e.g. removing AWT from PDDocument.
</span></p>
<h2 id="enhance-the-font-rendering" tabindex="-1"><span class="complete">Enhance the font rendering</span></h2>
<p><span class="complete">PDFBox 2.0.0 will render most of the fonts without using AWT.</span></p>
<h2 id="replace%2Fenhance-pdf-parsing" tabindex="-1">Replace/enhance PDF parsing</h2>
<p><span class="complete">The old &quot;classic&quot; PDF parser in PDFBox is not in line with the PDF specification as it parses
a PDF from top to bottom instead of respecting the XRef information.</span> The NonSequentialParser
enhanced that situation but there is a need to have a cleaner foundation broken into several levels</p>
<ul>
<li>I/O</li>
<li>Tokenization</li>
<li>Parsing according to structure</li>
<li>COS level document</li>
<li>PD level document</li>
<li>Add some self-healing mechanism to process corrupt files</li>
</ul>
<p>In addition, handling documents which are not conforming shouldn't be part of the core parser
but of an extensible approach, e.g. by adding hooks to allow for handling parsing exceptions.</p>
<h2 id="add-the-ability-to-create-pdfs-using-unicode-encoded-text" tabindex="-1"><span class="complete">Add the ability to create PDFs using unicode encoded text</span></h2>
<p><span class="complete">The recent PDFBox version is limited to WinANSI encoded text. 2.0.0 should have unicode support as well.</span></p>
<h2 id="rearchitect-the-cos-level-objects" tabindex="-1">Rearchitect the COS level objects</h2>
<p>The COS level objects need to be refactored to be in line with the new parser. In addition,
method signatures, construction, etc. should be made similar across the COS objects.</p>
<h2 id="parsing-on-demand" tabindex="-1">Parsing on demand</h2>
<p>Instead of always parsing the complete document, PDFs should be parsable on demand, making
objects available only as they are needed to enhance performance and minimize memory footprint.</p>
<p>This might be achieved by providing a layered approach where a base (non caching) parser provides
the on demand parsing and a caching parser built on top caches objects for use cases where
this is beneficial e.g. rendering, debugging ...</p>
<ul>
<li>the lexer would be the low level component delivering tokens to the parser.
A sample implementation exists as part of PDFBOX-1000. The benefit would be a clean low
level handling of tokens. The current implementation needs to be (slightly ?) revised though</li>
<li>the incremental (non caching) parser would allow for page by page processing moving forward
only to support text extraction, merging, splitting … - the benefit would be a lower memory
consumption as well as potentially faster processing</li>
<li>the caching parser would support applications such as PDFDebugger or PDFReader</li>
</ul>
<h2 id="handling-of-pdf-versions" tabindex="-1">Handling of PDF versions</h2>
<p>The current implementation is a mix of PDF 1.4 and some ad hoc additions without a clear
distinction between what is and is not supported. We could add some support for explicitly handling
versions in PDFBox, e.g. by marking certain methods and properties with the PDF version support
level. This could in addition be a good basis for PDF/A and other compliance checks.</p>
</div>
</div>
</div>
<footer class="footer">
<div class="container">
<div class="row">
<div class="span3">
<!-- nothing in here on purpose -->
</div>
<div class="span9">
<p>Copyright © 2009&ndash;2024 <a href="https://www.apache.org/">The Apache Software Foundation</a>. Licensed under the <a href="https://www.apache.org/licenses/LICENSE-2.0">Apache License, Version 2.0</a>.
<br>Apache PDFBox, PDFBox, Apache, the Apache feather logo and the Apache PDFBox project logos are trademarks of The Apache Software Foundation.</p>
</div>
</div>
</div>
</footer>
<script>
/**
 * Marks an element as collapsed by adding the 'collapsed' class to it.
 *
 * @param {Element} el element whose class list receives the 'collapsed' class
 */
function addCollapsed(el) {
  var classes = el.classList;
  classes.add('collapsed');
}
/**
 * Click handler that flips the 'collapsed' class on the parent of the element
 * the handler is bound to, and cancels the event's default action.
 *
 * @param {Event} ev event delivered to the bound element
 */
function toggleCollapsed(ev) {
  // Cancel the default action first so it is suppressed even if the toggle
  // below fails.
  ev.preventDefault();
  // currentTarget is the element the listener was registered on; target may be
  // a nested child of it. Fall back to target for callers passing a bare object.
  var link = ev.currentTarget || ev.target;
  link.parentNode.classList.toggle('collapsed');
}
/**
 * Registers toggleCollapsed as the 'click' listener of the given element.
 *
 * @param {Element} el element that should toggle its parent's collapsed state
 *                     when clicked
 */
function addClickEvent(el) {
  el.addEventListener('click', toggleCollapsed);
}
// Start with every expandable sidebar section collapsed and turn the link
// directly inside each section into a toggle.
document.querySelectorAll('.sidebar-node').forEach(addCollapsed);
document.querySelectorAll('.sidebar-node > a').forEach(addClickEvent);

/**
 * Removes the 'collapsed' class from the element with the given id.
 * Does nothing when no such element exists, so a page without that sidebar
 * section does not abort the script.
 *
 * @param {string} id element id as written in the markup (no leading '#')
 */
function expandSidebarNode(id) {
  var node = document.getElementById(id);
  if (node) {
    node.classList.remove('collapsed');
  }
}

// preserve expand/collapse across page navigation
var path = document.location.pathname;
// A leading "/<major>.<minor>" path segment selects the matching sidebar
// section, e.g. "/3.0/faq.html" -> id "v3-0".
var versionMatch = /^\/(\d+)\.(\d+)(?:\/|$)/.exec(path);
if (versionMatch) {
  var sectionId = 'v' + versionMatch[1] + '-' + versionMatch[2];
  expandSidebarNode(sectionId);
  // Cookbook pages additionally open the nested cookbook section, if present.
  if (path.indexOf('/' + versionMatch[1] + '.' + versionMatch[2] + '/cookbook') === 0) {
    expandSidebarNode(sectionId + '-cookbook');
  }
} else {
  // Unversioned pages keep the previous default: the 2.0 section is shown expanded.
  expandSidebarNode('v2-0');
}
</script>
</body>
</html>