DRILL-8390: Minor Improvements to PDF Reader (#2742)

commit: d89578c5d60a6594b698974ae03a46ad11719a9c [log] [tgz]
author: Charles S. Givre <cgivre@apache.org> Thu Jan 19 08:42:04 2023 -0500
committer: GitHub <noreply@github.com> Thu Jan 19 08:42:04 2023 -0500
tree: 4afeffbc4f64a859d9b5e225516a26158a20aa5b
parent: 18854ff79e1c560002f8c7837a3472877623f33c [diff]
diff --git a/contrib/format-pdf/README.md b/contrib/format-pdf/README.md
index 6dbc2a5..bba351f 100644
--- a/contrib/format-pdf/README.md
+++ b/contrib/format-pdf/README.md

@@ -1,4 +1,4 @@
-# Format Plugin for PDF Table Reader
+# Format Plugin for PDF Tables
 One of the most annoying tasks is when you are working on a data science project and you get data that is in a PDF file. This plugin endeavours to enable you to query data in PDF tables using Drill's SQL interface.  
 
 ## Data Model
@@ -31,7 +31,7 @@
 * `extractionAlgorithm`:  Allows you to choose the extraction algorithm used for extracting data from the PDF file.  Choices are `spreadsheet` and `basic`.  Depending on your data, one may work better than the other.
 
 ## Accessing Document Metadata Fields
-PDF files have a considerable amount of metadata which can be useful for analysis.  Drill will extract the following fields from every PDF file.  Note that these fields are not projected in star queries and must be selected explicitly.  The document's creator populates these fields and some or all may be empty. With the exception of `_page_count` which is an `INT` and the two date fields, all the other fields are `VARCHAR` fields.
+PDF files have a considerable amount of metadata which can be useful for analysis.  Drill will extract the following fields from every PDF file.  Note that these fields are not projected in star queries and must be selected explicitly.  The document's creator populates these fields and some or all may be empty. With the exception of `_page_count`, `_table_count`, and `_table_index`, which are `INT` fields, and the two date fields, all other fields are `VARCHAR` fields.
  
  The fields are:
  * `_page_count`
@@ -44,6 +44,7 @@
  * `_modification_date`
  * `_trapped`
  * `_table_count`
+ * `_table_index`
  
  The query below will access a document's metadata:
  

diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
index fd6cec9..26d3a94 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfBatchReader.java

@@ -74,6 +74,7 @@
   private PdfRowIterator rowIterator;
   private final FileSchemaNegotiator negotiator;
   private int unregisteredColumnCount;
+  private List<RectangularTextContainer> rawFirstRow;
 
   // Tables
   private List<Table> tables;
@@ -99,15 +100,20 @@
     builder = new SchemaBuilder();
 
     openFile();
-    metadataReader = new PdfMetadataReader(document);
-
+    tables = PdfUtils.extractTablesFromPDF(document, config.plugin.getConfig().getAlgorithm());
+    metadataReader = new PdfMetadataReader(document, tables.size());
     // Get the tables if the user set the combine pages to true
     if (config.plugin.getConfig().combinePages() ) {
-      tables = PdfUtils.extractTablesFromPDF(document, config.plugin.getConfig().getAlgorithm());
       currentTable = tables.get(0);
+      rowIterator = new PdfRowIterator(currentTable);
     } else {
-      currentTable = PdfUtils.getSpecificTable(document, startingTableIndex, config.plugin.getConfig().getAlgorithm());
-      tables = Collections.singletonList(currentTable);
+      if (tables.size() > 0) {
+        currentTable = tables.get(startingTableIndex);
+        tables = Collections.singletonList(currentTable);
+        rowIterator = new PdfRowIterator(currentTable);
+      } else {
+        rowIterator = new PdfRowIterator();
+      }
 
       // If the user specifies a table index, and that table does not exist, throw an exception.
       if (currentTable == null && startingTableIndex != 0) {
@@ -119,9 +125,9 @@
     }
 
     // Get the row iterator and grab the first row to build the schema
-    rowIterator = new PdfRowIterator(currentTable);
     if (rowIterator.hasNext()) {
-      firstRow = PdfUtils.convertRowToStringArray(rowIterator.next());
+      rawFirstRow = rowIterator.next();
+      firstRow = PdfUtils.convertRowToStringArray(rawFirstRow);
     }
 
     // Support provided schema
@@ -156,6 +162,7 @@
         // Get the next table
         currentTableIndex++;
         currentTable = tables.get(currentTableIndex);
+        metadataReader.setTableIndex(currentTableIndex);
 
         // Update the row iterator
         rowIterator = new PdfRowIterator(currentTable);
@@ -173,12 +180,28 @@
         return false;
       }
 
+      // Edge case: If the document is not set to extract headers, we still need to process the first row which
+      // was used to build the schema.
+      if (! config.plugin.getConfig().extractHeaders()) {
+        processFirstRow();
+      }
+
       // Process the row
       processRow(rowIterator.next());
     }
     return true;
   }
 
+  private void processFirstRow() { // writes the row that was consumed up front to build the schema
+    if (rawFirstRow == null) { // nothing buffered: no first row was read, or it was already written
+      return;
+    }
+    processRow(rawFirstRow);
+
+    // Now clear out the rawFirstRow variable so that we don't accidentally read it again.
+    rawFirstRow = null;
+  }
+
   private void processRow(List<RectangularTextContainer> row) {
     if (row == null || row.size() == 0) {
       rowWriter.start();

diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
index 297fac9..a910d6b 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfMetadataReader.java

@@ -36,12 +36,14 @@
   private final Map<String, Object> metadata;
   private final List<PdfBatchReader.PdfColumnWriter> writers;
   private RowSetLoader rowWriter;
+  private int tableIndex;
 
 
-  public PdfMetadataReader(PDDocument document) {
+  public PdfMetadataReader(PDDocument document, int tableCount) {
     this.writers = new ArrayList<>();
     // We are using a LinkedHashMap to preserve the order
     this.metadata = new LinkedHashMap<>();
+    this.tableIndex = 1;
     PDDocumentInformation info = document.getDocumentInformation();
     metadata.put("pageCount", document.getNumberOfPages());
     metadata.put("title",info.getTitle());
@@ -53,11 +55,17 @@
     metadata.put("creationDate", info.getCreationDate());
     metadata.put("modificationDate", info.getModificationDate());
     metadata.put("trapped", info.getTrapped());
+    metadata.put("tableCount", tableCount);
+    metadata.put("tableIndex", tableIndex);
   }
 
   public void setRowWriter(RowSetLoader rowWriter) {
     this.rowWriter = rowWriter;
   }
+  public void setTableIndex(int tableIndex) { // NOTE(review): ctor seeds 1, PdfBatchReader passes its tables.get() list index - confirm base
+    this.tableIndex = tableIndex;
+    metadata.put("tableIndex", tableIndex); // overwrite the map entry so it stays in sync with the field
+  }
 
   public void addImplicitColumnsToSchema() {
     // Add to schema
@@ -71,6 +79,8 @@
     addMetadataColumnToSchema("_creation_date", MinorType.TIMESTAMP);
     addMetadataColumnToSchema("_modification_date", MinorType.TIMESTAMP);
     addMetadataColumnToSchema("_trapped", MinorType.VARCHAR);
+    addMetadataColumnToSchema("_table_count", MinorType.INT);
+    addMetadataColumnToSchema("_table_index", MinorType.INT);
   }
 
   public void writeMetadata() {

diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
index 4e90d6b..f891d92 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfRowIterator.java

@@ -33,6 +33,12 @@
     this.rowCounter = 0;
   }
 
+  public PdfRowIterator() { // builds an iterator with no backing table
+    this.table = null; // hasNext() has an explicit branch for a null table
+    this.rowCounter = 0;
+  }
+
+
   @Override
   public boolean hasNext() {
     if (table == null) {

diff --git a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
index ec72b86..370ad56 100644
--- a/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java
+++ b/contrib/format-pdf/src/main/java/org/apache/drill/exec/store/pdf/PdfUtils.java

@@ -34,6 +34,7 @@
 import technology.tabula.extractors.SpreadsheetExtractionAlgorithm;
 
 import java.util.ArrayList;
+import java.util.Collections;
 import java.util.List;
 
 public class PdfUtils {
@@ -206,6 +207,10 @@
     if (table == null) {
       return values;
     }
-    return table.getRows().get(rowIndex);
+    if (table.getRowCount() > 0) {
+      return table.getRows().get(rowIndex);
+    } else {
+      return Collections.emptyList();
+    }
   }
 }

diff --git a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
index 4985605..32b8734 100644
--- a/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java
+++ b/contrib/format-pdf/src/test/java/org/apache/drill/exec/store/pdf/TestPdfFormat.java

@@ -107,7 +107,7 @@
       "(type => 'pdf', combinePages => false, extractHeaders => false))";
 
     RowSet results = client.queryBuilder().sql(sql).rowSet();
-    assertEquals(31, results.rowCount());
+    assertEquals(32, results.rowCount());
     results.clear();
 
     sql = "SELECT * " +
@@ -182,7 +182,8 @@
       "_producer," +
       "_creation_date, " +
       "_modification_date, " +
-      "_trapped " +
+      "_trapped, " +
+      "_table_count " +
       "FROM cp.`pdf/20.pdf` " +
       "LIMIT 1";
 
@@ -200,6 +201,7 @@
       .addNullable("_creation_date", MinorType.TIMESTAMP)
       .addNullable("_modification_date", MinorType.TIMESTAMP)
       .addNullable("_trapped", MinorType.VARCHAR)
+      .addNullable("_table_count", MinorType.INT)
       .buildSchema();
 
     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -210,7 +212,7 @@
         "Acrobat Distiller 7.0.5 (Windows)",
         857403000000L,
         1230835135000L,
-        null)
+        null, 1)
       .build();
 
     new RowSetComparison(expected).verifyAndClearAll(results);
@@ -270,7 +272,7 @@
       "_producer," +
       "_creation_date, " +
       "_modification_date, " +
-      "_trapped " +
+      "_trapped, _table_count " +
       "FROM table(cp.`pdf/labor.pdf` (type => 'pdf', extractionAlgorithm => 'spreadsheet')) LIMIT 1";
 
     RowSet results = client.queryBuilder().sql(sql).rowSet();
@@ -286,13 +288,14 @@
       .addNullable("_creation_date", MinorType.TIMESTAMP)
       .addNullable("_modification_date", MinorType.TIMESTAMP)
       .addNullable("_trapped", MinorType.VARCHAR)
+      .addNullable("_table_count", MinorType.INT)
       .buildSchema();
 
     RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
       .addRow(1, null, null, null, null, "pdftk 2.01 - www.pdftk.com",
         "itext-paulo-155 (itextpdf.sf.net-lowagie.com)",
         QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"),
-        QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"), null)
+        QueryTestUtil.ConvertDateToLong("2015-04-25T23:09:47Z"), null, 0)
     .build();
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
commit	d89578c5d60a6594b698974ae03a46ad11719a9c	[log] [tgz]
author	Charles S. Givre <cgivre@apache.org>	Thu Jan 19 08:42:04 2023 -0500
committer	GitHub <noreply@github.com>	Thu Jan 19 08:42:04 2023 -0500
tree	4afeffbc4f64a859d9b5e225516a26158a20aa5b
parent	18854ff79e1c560002f8c7837a3472877623f33c [diff]