blob: 37897355fe2fb3f247e9ec6d8d2dee39e42363bf [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.store.pdf;
import com.fasterxml.jackson.annotation.JsonIgnore;
import com.fasterxml.jackson.annotation.JsonInclude;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.fasterxml.jackson.annotation.JsonTypeName;
import com.fasterxml.jackson.databind.annotation.JsonDeserialize;
import com.fasterxml.jackson.databind.annotation.JsonPOJOBuilder;
import org.apache.commons.lang3.StringUtils;
import org.apache.drill.common.PlanStringBuilder;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.logical.FormatPluginConfig;
import com.google.common.collect.ImmutableList;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import technology.tabula.extractors.BasicExtractionAlgorithm;
import technology.tabula.extractors.ExtractionAlgorithm;
import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
import java.util.Collections;
import java.util.List;
import java.util.Objects;
@JsonInclude(JsonInclude.Include.NON_DEFAULT)
@JsonDeserialize(builder = PdfFormatConfig.PdfFormatConfigBuilder.class)
@JsonTypeName(PdfFormatPlugin.DEFAULT_NAME)
public class PdfFormatConfig implements FormatPluginConfig {
private static final Logger logger = LoggerFactory.getLogger(PdfFormatConfig.class);
@JsonProperty
private final List<String> extensions;
@JsonProperty
private final boolean combinePages;
@JsonProperty
private final boolean extractHeaders;
@JsonProperty
private final String extractionAlgorithm;
@JsonProperty
private final String password;
@JsonProperty
private final int defaultTableIndex;
private PdfFormatConfig(PdfFormatConfig.PdfFormatConfigBuilder builder) {
this.extensions = builder.extensions == null ? Collections.singletonList("pdf") : ImmutableList.copyOf(builder.extensions);
this.combinePages = builder.combinePages;
this.extractHeaders = builder.extractHeaders;
this.defaultTableIndex = builder.defaultTableIndex;
this.extractionAlgorithm = builder.extractionAlgorithm;
this.password = builder.password;
}
public static PdfFormatConfigBuilder builder() {
return new PdfFormatConfigBuilder();
}
@JsonIgnore
public PdfBatchReader.PdfReaderConfig getReaderConfig(PdfFormatPlugin plugin) {
return new PdfBatchReader.PdfReaderConfig(plugin);
}
@JsonIgnore
public ExtractionAlgorithm getAlgorithm() {
if (StringUtils.isEmpty(this.extractionAlgorithm) || this.extractionAlgorithm.equalsIgnoreCase("basic")) {
return new BasicExtractionAlgorithm();
} else if (this.extractionAlgorithm.equalsIgnoreCase("spreadsheet")) {
return new SpreadsheetExtractionAlgorithm();
} else {
throw UserException.validationError()
.message(extractionAlgorithm + " is not a valid extraction algorithm. The available choices are basic or spreadsheet.")
.build(logger);
}
}
public List<String> extensions() {
return this.extensions;
}
public boolean combinePages() {
return this.combinePages;
}
public boolean extractHeaders() {
return this.extractHeaders;
}
public String extractionAlgorithm() {
return this.extractionAlgorithm;
}
public String password() {
return this.password;
}
public int defaultTableIndex() {
return this.defaultTableIndex;
}
@Override
public boolean equals(Object o) {
if (this == o) {
return true;
}
if (o == null || getClass() != o.getClass()) {
return false;
}
PdfFormatConfig that = (PdfFormatConfig) o;
return combinePages == that.combinePages
&& extractHeaders == that.extractHeaders
&& defaultTableIndex == that.defaultTableIndex
&& Objects.equals(extensions, that.extensions)
&& Objects.equals(extractionAlgorithm, that.extractionAlgorithm)
&& Objects.equals(password, that.password);
}
@Override
public int hashCode() {
return Objects.hash(extensions, combinePages, extractHeaders, extractionAlgorithm,
password, defaultTableIndex);
}
@Override
public String toString() {
return new PlanStringBuilder(this)
.field("extensions", extensions)
.field("combinePages", combinePages)
.field("extractHeaders", extractHeaders)
.field("extractionAlgorithm", extractionAlgorithm)
.field("password", password)
.field("defaultTableIndex", defaultTableIndex)
.toString();
}
@JsonPOJOBuilder(withPrefix = "")
public static class PdfFormatConfigBuilder {
private List<String> extensions;
private boolean combinePages;
private boolean extractHeaders;
private String extractionAlgorithm;
private String password;
private int defaultTableIndex;
public PdfFormatConfig build() {
return new PdfFormatConfig(this);
}
public PdfFormatConfigBuilder extensions(List<String> extensions) {
this.extensions = extensions;
return this;
}
public PdfFormatConfigBuilder combinePages(boolean combinePages) {
this.combinePages = combinePages;
return this;
}
public PdfFormatConfigBuilder extractHeaders(boolean extractHeaders) {
this.extractHeaders = extractHeaders;
return this;
}
public PdfFormatConfigBuilder extractionAlgorithm(String extractionAlgorithm) {
this.extractionAlgorithm = extractionAlgorithm;
return this;
}
public PdfFormatConfigBuilder password(String password) {
this.password = password;
return this;
}
public PdfFormatConfigBuilder defaultTableIndex(int defaultTableIndex) {
this.defaultTableIndex = defaultTableIndex;
return this;
}
}
}