/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *      http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
package org.apache.pdfbox.benchmark; | |
import java.io.File; | |
import java.io.IOException; | |
import java.util.concurrent.TimeUnit; | |
import org.apache.logging.log4j.Level; | |
import org.apache.logging.log4j.LogManager; | |
import org.apache.logging.log4j.core.config.Configurator; | |
import org.apache.pdfbox.Loader; | |
import org.apache.pdfbox.pdmodel.PDDocument; | |
import org.apache.pdfbox.text.PDFTextStripper; | |
import org.openjdk.jmh.annotations.Benchmark; | |
import org.openjdk.jmh.annotations.BenchmarkMode; | |
import org.openjdk.jmh.annotations.Mode; | |
import org.openjdk.jmh.annotations.OutputTimeUnit; | |
import org.openjdk.jmh.infra.Blackhole; | |
public class TextExtraction { | |
static final String PDF32000_2008 = "target/pdfs/PDF32000_2008.pdf"; | |
static { | |
Configurator.setAllLevels(LogManager.getRootLogger().getName(), Level.OFF); | |
java.util.logging.Logger.getLogger("org.apache").setLevel(java.util.logging.Level.OFF); | |
} | |
@Benchmark | |
@OutputTimeUnit(TimeUnit.MILLISECONDS) | |
@BenchmarkMode(Mode.AverageTime) | |
public void extractPDFSpecUnsorted(Blackhole blackhole) throws IOException { | |
try (PDDocument pdf = Loader.loadPDF(new File(PDF32000_2008))) | |
{ | |
PDFTextStripper pdfStripper = new PDFTextStripper(); | |
pdfStripper.setSortByPosition(false); | |
String parsedText = pdfStripper.getText(pdf); | |
blackhole.consume(parsedText); | |
} | |
} | |
@Benchmark | |
@OutputTimeUnit(TimeUnit.MILLISECONDS) | |
@BenchmarkMode(Mode.AverageTime) | |
public void extractPDFSpecSorted(Blackhole blackhole) throws IOException { | |
try (PDDocument pdf = Loader.loadPDF(new File(PDF32000_2008))) | |
{ | |
PDFTextStripper pdfStripper = new PDFTextStripper(); | |
pdfStripper.setSortByPosition(true); | |
String parsedText = pdfStripper.getText(pdf); | |
blackhole.consume(parsedText); | |
} | |
} | |
} |