DRILL-7495: Excel Reader Not Parsing Dates Correctly in First Column
diff --git a/contrib/format-excel/README.md b/contrib/format-excel/README.md
index 27a118a..ac08c66 100644
--- a/contrib/format-excel/README.md
+++ b/contrib/format-excel/README.md
@@ -25,7 +25,7 @@
column.
* `lastColumn`: To define a region within a spreadsheet, this is the right-most column index. This is indexed from one. If set to `0` Drill will read all available columns. This
is not inclusive, so if you ask for columns 2-5 you will get columns 2,3 and 4.
-* `readAllColumnsAsVarChar`: When set to `true`, Drill will not attempt to infer column data types and will read everything as `VARCHAR`. Defaults to `false`;
+* `allTextMode`: When set to `true`, Drill will not attempt to infer column data types and will read everything as `VARCHAR`. Defaults to `false`;
## Usage
You can specify the configuration at runtime via the `table()` function or in the storage plugin configuration. For instance, if you just want to query an Excel file, you could
@@ -50,7 +50,7 @@
### Known Limitations:
At present, Drill requires that all columns be of the same data type. If they are not, Drill will throw an exception upon trying to read a column of mixed data type. If you are
- trying to query data with heterogenoeus columns, it will be necessary to set `readAllColumnsAsVarChar` to `true`.
+ trying to query data with heterogenoeus columns, it will be necessary to set `allTextMode` to `true`.
An additional limitation is that Drill infers the column data type from the first row of data. If a column is `null` in the first row, Drill will default to a datatype of
`VARCHAR`. However if in fact the column is `NUMERIC` this will cause errors.
diff --git a/contrib/format-excel/pom.xml b/contrib/format-excel/pom.xml
index 95f611a..c66a912 100644
--- a/contrib/format-excel/pom.xml
+++ b/contrib/format-excel/pom.xml
@@ -31,7 +31,7 @@
<name>contrib/format-excel</name>
<properties>
- <poi.version>3.17</poi.version>
+ <poi.version>4.1.1</poi.version>
</properties>
<dependencies>
<dependency>
diff --git a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
index 7bd3efd..848fa75 100644
--- a/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
+++ b/contrib/format-excel/src/main/java/org/apache/drill/exec/store/excel/ExcelBatchReader.java
@@ -45,9 +45,11 @@
import org.slf4j.Logger;
import java.io.InputStream;
+import java.util.Date;
import java.util.Iterator;
import java.io.IOException;
import java.util.ArrayList;
+import java.util.TimeZone;
public class ExcelBatchReader implements ManagedReader<FileSchemaNegotiator> {
@@ -216,7 +218,7 @@
Cell cell = cellIterator.next();
CellValue cellValue = evaluator.evaluate(cell);
- switch (cellValue.getCellTypeEnum()) {
+ switch (cellValue.getCellType()) {
case STRING:
tempColumnName = cell.getStringCellValue()
.replace(PARSER_WILDCARD, SAFE_WILDCARD)
@@ -265,7 +267,7 @@
/**
* Returns the column count. There are a few gotchas here in that we have to know the header row and count the physical number of cells
* in that row. Since the user can define the header row,
- * @return
+ * @return The number of actual columns
*/
private int getColumnCount() {
int columnCount;
@@ -301,7 +303,7 @@
}
int lastRow = readerConfig.lastRow;
- while (recordCount < lastRow && rowIterator.hasNext()) {
+ if (recordCount < lastRow && rowIterator.hasNext()) {
lineCount++;
Row row = rowIterator.next();
@@ -337,15 +339,18 @@
}
rowWriter.save();
recordCount++;
+ return true;
+ } else {
+ return false;
}
- return true;
+
}
/**
* Function to populate the column array
- * @param cell
- * @param cellValue
- * @param colPosition
+ * @param cell The input cell object
+ * @param cellValue The cell value
+ * @param colPosition The index of the column
*/
private void populateColumnArray(Cell cell, CellValue cellValue, int colPosition) {
if (!firstLine) {
@@ -355,7 +360,7 @@
if (cellValue == null) {
addColumnToArray(rowWriter, excelFieldNames.get(colPosition), MinorType.VARCHAR);
} else {
- CellType cellType = cellValue.getCellTypeEnum();
+ CellType cellType = cellValue.getCellType();
if (cellType == CellType.STRING || readerConfig.allTextMode) {
addColumnToArray(rowWriter, excelFieldNames.get(colPosition), MinorType.VARCHAR);
} else if (cellType == CellType.NUMERIC && DateUtil.isCellDateFormatted(cell)) {
@@ -412,7 +417,7 @@
/**
* Returns the index of the first row of actual data. This function is to be used to find the header row as the POI skips blank rows.
- * @return: The headerRow index, corrected for blank rows
+ * @return The headerRow index, corrected for blank rows
*/
private int getFirstHeaderRow() {
int firstRow = sheet.getFirstRowNum();
@@ -481,7 +486,6 @@
}
public void load(Cell cell) {
- CellValue cellValue = evaluator.evaluate(cell);
String fieldValue = String.valueOf(cell.getNumericCellValue());
if (fieldValue == null) {
@@ -519,7 +523,10 @@
if (cellValue == null) {
columnWriter.setNull();
} else {
- Instant timeStamp = new Instant(cellValue.getNumberValue());
+ logger.debug("Cell value: {}", cellValue.getNumberValue());
+
+ Date dt = DateUtil.getJavaDate(cellValue.getNumberValue(), TimeZone.getTimeZone("UTC"));
+ Instant timeStamp = new Instant(dt.toInstant().getEpochSecond() * 1000);
columnWriter.setTimestamp(timeStamp);
}
}
diff --git a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
index 50c4837..6a85c25 100644
--- a/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
+++ b/contrib/format-excel/src/test/java/org/apache/drill/exec/store/excel/TestExcelFormat.java
@@ -382,6 +382,25 @@
new RowSetComparison(expected).verifyAndClearAll(results);
}
+ @Test
+ public void testFileWithDoubleDates() throws Exception {
+ String sql = "SELECT `Close Date`, `Type` FROM table(cp.`excel/test_data.xlsx` (type=> 'excel', sheetName => 'comps')) WHERE style='Contemporary'";
+
+ RowSet results = client.queryBuilder().sql(sql).rowSet();
+
+ TupleMetadata expectedSchema = new SchemaBuilder()
+ .addNullable("Close Date", TypeProtos.MinorType.TIMESTAMP)
+ .addNullable("Type", TypeProtos.MinorType.VARCHAR)
+ .buildSchema();
+
+ RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
+ .addRow(1412294400000L, "Hi Rise")
+ .addRow(1417737600000L, "Hi Rise")
+ .build();
+
+ new RowSetComparison(expected).verifyAndClearAll(results);
+ }
+
private void generateCompressedFile(String fileName, String codecName, String outFileName) throws IOException {
FileSystem fs = ExecTest.getLocalFileSystem();
Configuration conf = fs.getConf();
diff --git a/contrib/format-excel/src/test/resources/excel/test_data.xlsx b/contrib/format-excel/src/test/resources/excel/test_data.xlsx
index 1226f57..a04a6bd 100644
--- a/contrib/format-excel/src/test/resources/excel/test_data.xlsx
+++ b/contrib/format-excel/src/test/resources/excel/test_data.xlsx
Binary files differ