DRILL-7495: Excel Reader Not Parsing Dates Correctly in First Column

commit: e0d1bdd38cd9e08d170d9ed0314bc3a5cca64412 [log] [tgz]
author: Charles Givre <cgivre@apache.org> Tue Jan 07 15:27:33 2020 -0500
committer: Charles S. Givre <cgivre@apache.org> Thu Jan 09 09:04:44 2020 -0500
tree: 4f0b9147945a41e03d9abab0eed560f105169b8d
parent: 51bf4b72eb7a6cab71f977040a7cbf406cb50334 [diff]
diff --git a/contrib/format-excel/README.md b/contrib/format-excel/README.md
index 27a118a..ac08c66 100644
--- a/contrib/format-excel/README.md
+++ b/contrib/format-excel/README.md

@@ -25,7 +25,7 @@
  column.
 * `lastColumn`: To define a region within a spreadsheet, this is the right-most column index. This is indexed from one. If set to `0` Drill will read all available columns. This
  is not inclusive, so if you ask for columns 2-5 you will get columns 2,3 and 4. 
-* `readAllColumnsAsVarChar`: When set to `true`, Drill will not attempt to infer column data types and will read everything as `VARCHAR`. Defaults to `false`;
+* `allTextMode`: When set to `true`, Drill will not attempt to infer column data types and will read everything as `VARCHAR`. Defaults to `false`;
 
 ## Usage
 You can specify the configuration at runtime via the `table()` function or in the storage plugin configuration. For instance, if you just want to query an Excel file, you could
@@ -50,7 +50,7 @@
 
 ### Known Limitations:
 At present, Drill requires that all columns be of the same data type. If they are not, Drill will throw an exception upon trying to read a column of mixed data type. If you are
- trying to query data with heterogenoeus columns, it will be necessary to set `readAllColumnsAsVarChar` to `true`. 
+ trying to query data with heterogenoeus columns, it will be necessary to set `allTextMode` to `true`. 
 
 An additional limitation is that Drill infers the column data type from the first row of data. If a column is `null` in the first row, Drill will default to a datatype of
  `VARCHAR`. However if in fact the column is `NUMERIC` this will cause errors. 

diff --git a/contrib/format-excel/pom.xml b/contrib/format-excel/pom.xml
index 95f611a..c66a912 100644
--- a/contrib/format-excel/pom.xml
+++ b/contrib/format-excel/pom.xml

@@ -31,7 +31,7 @@
   <name>contrib/format-excel</name>
 
   <properties>
-    <poi.version>3.17</poi.version>
+    <poi.version>4.1.1</poi.version>
   </properties>
   <dependencies>
     <dependency>

diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 7bd3efd..848fa75 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java

@@ -45,9 +45,11 @@
 import org.slf4j.Logger;
 
 import java.io.InputStream;
+import java.util.Date;
 import java.util.Iterator;
 import java.io.IOException;
 import java.util.ArrayList;
+import java.util.TimeZone;
 
 public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
 
@@ -216,7 +218,7 @@
         Cell cell = cellIterator.next();
 
         CellValue cellValue = evaluator.evaluate(cell);
-        switch (cellValue.getCellTypeEnum()) {
+        switch (cellValue.getCellType()) {
           case STRING:
             tempColumnName = cell.getStringCellValue()
               .replace(PARSER_WILDCARD, SAFE_WILDCARD)
@@ -265,7 +267,7 @@
   /**
    * Returns the column count.  There are a few gotchas here in that we have to know the header row and count the physical number of cells
    * in that row.  Since the user can define the header row,
-   * @return
+   * @return The number of actual columns
    */
   private int getColumnCount() {
     int columnCount;
@@ -301,7 +303,7 @@
     }
 
     int lastRow = readerConfig.lastRow;
-    while (recordCount < lastRow && rowIterator.hasNext()) {
+    if (recordCount < lastRow && rowIterator.hasNext()) {
       lineCount++;
 
       Row row = rowIterator.next();
@@ -337,15 +339,18 @@
       }
       rowWriter.save();
       recordCount++;
+      return true;
+    } else {
+      return false;
     }
-    return true;
+
   }
 
   /**
    * Function to populate the column array
-   * @param cell
-   * @param cellValue
-   * @param colPosition
+   * @param cell The input cell object
+   * @param cellValue The cell value
+   * @param colPosition The index of the column
    */
   private void populateColumnArray(Cell cell, CellValue cellValue, int colPosition) {
     if (!firstLine) {
@@ -355,7 +360,7 @@
     if (cellValue == null) {
       addColumnToArray(rowWriter, excelFieldNames.get(colPosition), MinorType.VARCHAR);
     } else {
-      CellType cellType = cellValue.getCellTypeEnum();
+      CellType cellType = cellValue.getCellType();
       if (cellType == CellType.STRING || readerConfig.allTextMode) {
         addColumnToArray(rowWriter, excelFieldNames.get(colPosition), MinorType.VARCHAR);
       } else if (cellType == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
@@ -412,7 +417,7 @@
 
   /**
    * Returns the index of the first row of actual data.  This function is to be used to find the header row as the POI skips blank rows.
-   * @return:  The headerRow index, corrected for blank rows
+   * @return The headerRow index, corrected for blank rows
    */
   private int getFirstHeaderRow() {
     int firstRow = sheet.getFirstRowNum();
@@ -481,7 +486,6 @@
     }
 
     public void load(Cell cell) {
-      CellValue cellValue = evaluator.evaluate(cell);
       String fieldValue = String.valueOf(cell.getNumericCellValue());
 
       if (fieldValue == null) {
@@ -519,7 +523,10 @@
       if (cellValue == null) {
         columnWriter.setNull();
       } else {
-        Instant timeStamp = new Instant(cellValue.getNumberValue());
+        logger.debug("Cell value: {}", cellValue.getNumberValue());
+
+        Date dt = DateUtil.getJavaDate(cellValue.getNumberValue(), TimeZone.getTimeZone("UTC"));
+        Instant timeStamp = new Instant(dt.toInstant().getEpochSecond() * 1000);
         columnWriter.setTimestamp(timeStamp);
       }
     }

diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index 50c4837..6a85c25 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java

@@ -382,6 +382,25 @@
     new RowSetComparison(expected).verifyAndClearAll(results);
   }
 
+  @Test
+  public void testFileWithDoubleDates() throws Exception {
+    String sql = "SELECT `Close Date`, `Type` FROM table(cp.`excel/test_data.xlsx` (type=> 'excel', sheetName => 'comps')) WHERE style='Contemporary'";
+
+    RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+    TupleMetadata expectedSchema = new SchemaBuilder()
+      .addNullable("Close Date", TypeProtos.MinorType.TIMESTAMP)
+      .addNullable("Type", TypeProtos.MinorType.VARCHAR)
+      .buildSchema();
+
+    RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+      .addRow(1412294400000L, "Hi Rise")
+      .addRow(1417737600000L, "Hi Rise")
+      .build();
+
+    new RowSetComparison(expected).verifyAndClearAll(results);
+  }
+
   private void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
     FileSystem fs = ExecTest.getLocalFileSystem();
     Configuration conf = fs.getConf();

diff --git a/contrib/format-excel/src/test/resources/excel/test_data.xlsx b/contrib/format-excel/src/test/resources/excel/test_data.xlsx
index 1226f57..a04a6bd 100644
--- a/contrib/format-excel/src/test/resources/excel/test_data.xlsx
+++ b/contrib/format-excel/src/test/resources/excel/test_data.xlsx
Binary files differ
commit	e0d1bdd38cd9e08d170d9ed0314bc3a5cca64412	[log] [tgz]
author	Charles Givre <cgivre@apache.org>	Tue Jan 07 15:27:33 2020 -0500
committer	Charles S. Givre <cgivre@apache.org>	Thu Jan 09 09:04:44 2020 -0500
tree	4f0b9147945a41e03d9abab0eed560f105169b8d
parent	51bf4b72eb7a6cab71f977040a7cbf406cb50334 [diff]