DRILL-7858: Drill Crashes with Certain HDF5 Files
diff --git a/contrib/format-hdf5/README.md b/contrib/format-hdf5/README.md
index d80c836..11c4072 100644
--- a/contrib/format-hdf5/README.md
+++ b/contrib/format-hdf5/README.md
@@ -13,7 +13,7 @@
### Example Configuration
For most uses, the configuration below will suffice to enable Drill to query HDF5 files.
-```
+```json
"hdf5": {
"type": "hdf5",
"extensions": [
@@ -34,7 +34,7 @@
```
The actual data in this file is mapped to a column called `int_data`. In order to effectively access the data, you should use Drill's `FLATTEN()` function on the `int_data` column, which produces the following result.
-```
+```bash
apache drill> select flatten(int_data) as int_data from dfs.test.`dset.h5`;
+---------------------+
| int_data |
@@ -47,7 +47,7 @@
```
Once the data is in this form, you can access it similarly to how you might access nested data in JSON or other files.
-```
+```bash
apache drill> SELECT int_data[0] as col_0,
. .semicolon> int_data[1] as col_1,
. .semicolon> int_data[2] as col_2
@@ -73,13 +73,13 @@
You can set the `defaultPath` variable either in the plugin configuration or at query time using the `table()` function, as shown in the example below:
- ```
+ ```sql
SELECT *
FROM table(dfs.test.`dset.h5` (type => 'hdf5', defaultPath => '/dset'))
```
This query will return the result below:
- ```
+ ```bash
apache drill> SELECT * FROM table(dfs.test.`dset.h5` (type => 'hdf5', defaultPath => '/dset'));
+-----------+-----------+-----------+-----------+-----------+-----------+
| int_col_0 | int_col_1 | int_col_2 | int_col_3 | int_col_4 | int_col_5 |
@@ -98,7 +98,7 @@
### Attributes
Occasionally, HDF5 paths will contain attributes. Drill will map these to a map data structure called `attributes`, as shown in the query below.
-```
+```bash
apache drill> SELECT attributes FROM dfs.test.`browsing.h5`;
+----------------------------------------------------------------------------------+
| attributes |
@@ -115,7 +115,7 @@
8 rows selected (0.292 seconds)
```
You can access the individual fields within the `attributes` map by using the structure `table.map.key`. Note that you will have to give the table an alias for this to work properly.
-```
+```bash
apache drill> SELECT path, data_type, file_name
FROM dfs.test.`browsing.h5` AS t1 WHERE t1.attributes.important = false;
+---------+-----------+-------------+
@@ -132,7 +132,7 @@
* HDF5 files can contain nested data sets of up to `n` dimensions. Since Drill works best with two-dimensional data, datasets with more than two dimensions are reduced to two
dimensions.
* HDF5 has a `COMPOUND` data type. At present, Drill supports reading `COMPOUND` data types that contain multiple datasets. However, Drill does not support `COMPOUND` fields
- with multidimesnional columns. Drill will ignore multidimensional columns within `COMPOUND` fields.
+ with multidimensional columns. Drill will ignore multidimensional columns within `COMPOUND` fields.
[1]: https://en.wikipedia.org/wiki/Hierarchical_Data_Format
[2]: https://www.hdfgroup.org
diff --git a/contrib/format-hdf5/pom.xml b/contrib/format-hdf5/pom.xml
index 7ef9cdc..3eff34f 100644
--- a/contrib/format-hdf5/pom.xml
+++ b/contrib/format-hdf5/pom.xml
@@ -37,9 +37,9 @@
<version>${project.version}</version>
</dependency>
<dependency>
- <groupId>cisd</groupId>
- <artifactId>jhdf5</artifactId>
- <version>14.12.6</version>
+ <groupId>io.jhdf</groupId>
+ <artifactId>jhdf</artifactId>
+ <version>0.6.1</version>
</dependency>
<!-- Test dependencies -->
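For reference, the dependency change above replaces the JNI-backed `cisd:jhdf5` wrapper with the pure-Java `io.jhdf:jhdf` library. Below is a minimal sketch of the jhdf calls the updated reader relies on; the file path, dataset path, and class name are hypothetical, and this is an illustration rather than part of the patch.
```java
import io.jhdf.HdfFile;
import io.jhdf.api.Dataset;

import java.nio.file.Paths;
import java.util.Arrays;

public class JhdfSketch {
  public static void main(String[] args) {
    // Hypothetical file and dataset paths, for illustration only.
    try (HdfFile hdfFile = new HdfFile(Paths.get("/tmp/dset.h5"))) {
      Dataset dataset = hdfFile.getDatasetByPath("/dset");
      // jhdf reports dimensions as int[] (the old cisd library used long[]).
      int[] dimensions = dataset.getDimensions();
      // getData() returns a Java array whose component type matches getJavaType(),
      // e.g. int[] for a 1D integer dataset or int[][] for a 2D one.
      Object data = dataset.getData();
      System.out.println(data.getClass().getSimpleName() + " " + Arrays.toString(dimensions));
    }
  }
}
```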
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Attribute.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Attribute.java
index 570a9ef..7df118c 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Attribute.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Attribute.java
@@ -17,7 +17,6 @@
*/
package org.apache.drill.exec.store.hdf5;
-import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.TypeProtos.MinorType;
/**
@@ -27,11 +26,20 @@
private final MinorType dataType;
private final String key;
private final Object value;
+ private final boolean isCompound;
- public HDF5Attribute(TypeProtos.MinorType type, String key, Object value) {
+ public HDF5Attribute(MinorType type, String key, Object value) {
this.dataType = type;
this.key = key;
this.value = value;
+ this.isCompound = false;
+ }
+
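+  // isCompound marks array-valued attributes; the batch reader skips these when
+  // projecting attribute metadata (see the isScalar() check in HDF5BatchReader).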
+ public HDF5Attribute(MinorType type, String key, Object value, boolean isCompound) {
+ this.dataType = type;
+ this.key = key;
+ this.value = value;
+ this.isCompound = isCompound;
}
public MinorType getDataType(){ return dataType; }
@@ -42,6 +50,8 @@
return value;
}
+ public boolean isCompound() { return isCompound; }
+
@Override
public String toString() {
return String.format("%s: %s type: %s", getKey(), getValue(), getDataType());
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java
index c560660..9d2a7ef 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5BatchReader.java
@@ -18,15 +18,20 @@
package org.apache.drill.exec.store.hdf5;
-import ch.systemsx.cisd.hdf5.HDF5CompoundMemberInformation;
-import ch.systemsx.cisd.hdf5.HDF5DataSetInformation;
-import ch.systemsx.cisd.hdf5.HDF5FactoryProvider;
-import ch.systemsx.cisd.hdf5.HDF5LinkInformation;
-import ch.systemsx.cisd.hdf5.IHDF5Factory;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-import org.apache.commons.io.IOUtils;
+import io.jhdf.HdfFile;
+import io.jhdf.api.Attribute;
+import io.jhdf.api.Dataset;
+import io.jhdf.api.Group;
+import io.jhdf.api.Node;
+import io.jhdf.exceptions.HdfException;
+import io.jhdf.object.datatype.CompoundDataType;
+import io.jhdf.object.datatype.CompoundDataType.CompoundDataMember;
+import org.apache.drill.common.AutoCloseables;
+import org.apache.drill.common.exceptions.CustomErrorContext;
import org.apache.drill.common.exceptions.UserException;
import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileSchemaNegotiator;
import org.apache.drill.exec.physical.impl.scan.framework.ManagedReader;
import org.apache.drill.exec.physical.resultSet.ResultSetLoader;
@@ -38,7 +43,6 @@
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.store.hdf5.writers.HDF5DataWriter;
import org.apache.drill.exec.store.hdf5.writers.HDF5DoubleDataWriter;
-import org.apache.drill.exec.store.hdf5.writers.HDF5EnumDataWriter;
import org.apache.drill.exec.store.hdf5.writers.HDF5FloatDataWriter;
import org.apache.drill.exec.store.hdf5.writers.HDF5IntDataWriter;
import org.apache.drill.exec.store.hdf5.writers.HDF5LongDataWriter;
@@ -56,17 +60,15 @@
import org.slf4j.LoggerFactory;
import java.io.BufferedReader;
-import java.io.File;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
-import java.nio.file.StandardCopyOption;
import java.time.Instant;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.HashMap;
-import java.util.BitSet;
import java.util.Iterator;
+import java.util.LinkedHashMap;
import java.util.List;
import java.util.Map;
@@ -113,11 +115,11 @@
private final int maxRecords;
+ private String fileName;
+
private FileSplit split;
- private IHDF5Reader hdf5Reader;
-
- private File inFile;
+ private HdfFile hdfFile;
private BufferedReader reader;
@@ -141,7 +143,9 @@
private ScalarWriter dimensionsWriter;
- private long[] dimensions;
+ private CustomErrorContext errorContext;
+
+ private int[] dimensions;
public static class HDF5ReaderConfig {
final HDF5FormatPlugin plugin;
@@ -150,13 +154,10 @@
final HDF5FormatConfig formatConfig;
- final File tempDirectory;
-
public HDF5ReaderConfig(HDF5FormatPlugin plugin, HDF5FormatConfig formatConfig) {
this.plugin = plugin;
this.formatConfig = formatConfig;
defaultPath = formatConfig.getDefaultPath();
- tempDirectory = plugin.getTmpDir();
}
}
@@ -169,20 +170,24 @@
@Override
public boolean open(FileSchemaNegotiator negotiator) {
split = negotiator.split();
+ errorContext = negotiator.parentErrorContext();
+    // Since the HDF5 reader consumes the file through an input stream, the file name it
+    // reports is a temp file, so capture the real name from the file split here.
+ fileName = split.getPath().getName();
try {
openFile(negotiator);
} catch (IOException e) {
throw UserException
.dataReadError(e)
.addContext("Failed to close input file: %s", split.getPath())
- .addContext(negotiator.parentErrorContext())
+ .addContext(errorContext)
.build(logger);
}
ResultSetLoader loader;
if (readerConfig.defaultPath == null) {
// Get file metadata
- List<HDF5DrillMetadata> metadata = getFileMetadata(hdf5Reader.object().getGroupMemberInformation("/", true), new ArrayList<>());
+ List<HDF5DrillMetadata> metadata = getFileMetadata(hdfFile, new ArrayList<>());
metadataIterator = metadata.iterator();
// Schema for Metadata query
@@ -198,27 +203,27 @@
negotiator.tableSchema(builder.buildSchema(), false);
loader = negotiator.build();
- dimensions = new long[0];
+ dimensions = new int[0];
rowWriter = loader.writer();
} else {
// This is the case when the default path is specified. Since the user is explicitly asking for a dataset
// Drill can obtain the schema by getting the datatypes below and ultimately mapping that schema to columns
- HDF5DataSetInformation dsInfo = hdf5Reader.object().getDataSetInformation(readerConfig.defaultPath);
- dimensions = dsInfo.getDimensions();
+ Dataset dataSet = hdfFile.getDatasetByPath(readerConfig.defaultPath);
+ dimensions = dataSet.getDimensions();
loader = negotiator.build();
rowWriter = loader.writer();
writerSpec = new WriterSpec(rowWriter, negotiator.providedSchema(),
negotiator.parentErrorContext());
if (dimensions.length <= 1) {
- buildSchemaFor1DimensionalDataset(dsInfo);
+ buildSchemaFor1DimensionalDataset(dataSet);
} else if (dimensions.length == 2) {
- buildSchemaFor2DimensionalDataset(dsInfo);
+ buildSchemaFor2DimensionalDataset(dataSet);
} else {
// Case for datasets of greater than 2D
// These are automatically flattened
- buildSchemaFor2DimensionalDataset(dsInfo);
+ buildSchemaFor2DimensionalDataset(dataSet);
}
}
if (readerConfig.defaultPath == null) {
@@ -237,37 +242,37 @@
* This function is called when the default path is set and the data set is a single dimension.
* This function will create an array of one dataWriter of the
* correct datatype
- * @param dsInfo The HDF5 dataset information
+ * @param dataset The HDF5 dataset
*/
- private void buildSchemaFor1DimensionalDataset(HDF5DataSetInformation dsInfo) {
- TypeProtos.MinorType currentDataType = HDF5Utils.getDataType(dsInfo);
+ private void buildSchemaFor1DimensionalDataset(Dataset dataset) {
+ MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
// Case for null or unknown data types:
if (currentDataType == null) {
- logger.warn("Couldn't add {}", dsInfo.getTypeInformation().tryGetJavaType().toGenericString());
+ logger.warn("Couldn't add {}", dataset.getJavaType().getName());
return;
}
dataWriters.add(buildWriter(currentDataType));
}
- private HDF5DataWriter buildWriter(TypeProtos.MinorType dataType) {
+ private HDF5DataWriter buildWriter(MinorType dataType) {
switch (dataType) {
- case GENERIC_OBJECT:
- return new HDF5EnumDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+      // Enum support was dropped in the move to jhdf; HDF5EnumDataWriter has been removed.
+      /* case GENERIC_OBJECT:
+        return new HDF5EnumDataWriter(hdfFile, writerSpec, readerConfig.defaultPath); */
case VARCHAR:
- return new HDF5StringDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5StringDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
case TIMESTAMP:
- return new HDF5TimestampDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5TimestampDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
case INT:
- return new HDF5IntDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5IntDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
case BIGINT:
- return new HDF5LongDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5LongDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
case FLOAT8:
- return new HDF5DoubleDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5DoubleDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
case FLOAT4:
- return new HDF5FloatDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5FloatDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
case MAP:
- return new HDF5MapDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath);
+ return new HDF5MapDataWriter(hdfFile, writerSpec, readerConfig.defaultPath);
default:
throw new UnsupportedOperationException(dataType.name());
}
@@ -276,17 +281,17 @@
/**
* Builds a Drill schema from a dataset with 2 or more dimensions. HDF5 only
* supports INT, LONG, DOUBLE and FLOAT for >2 data types so this function is
- * not as inclusinve as the 1D function. This function will build the schema
+ * not as inclusive as the 1D function. This function will build the schema
* by adding DataWriters to the dataWriters array.
*
- * @param dsInfo
+ * @param dataset
* The dataset which Drill will use to build a schema
*/
- private void buildSchemaFor2DimensionalDataset(HDF5DataSetInformation dsInfo) {
- TypeProtos.MinorType currentDataType = HDF5Utils.getDataType(dsInfo);
+ private void buildSchemaFor2DimensionalDataset(Dataset dataset) {
+ MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
// Case for null or unknown data types:
if (currentDataType == null) {
- logger.warn("Couldn't add {}", dsInfo.getTypeInformation().tryGetJavaType().toGenericString());
+ logger.warn("Couldn't add {}",dataset.getJavaType().getName());
return;
}
long cols = dimensions[1];
@@ -296,19 +301,19 @@
switch (currentDataType) {
case INT:
tempFieldName = INT_COLUMN_PREFIX + i;
- dataWriters.add(new HDF5IntDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath, tempFieldName, i));
+ dataWriters.add(new HDF5IntDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, tempFieldName, i));
break;
case BIGINT:
tempFieldName = LONG_COLUMN_PREFIX + i;
- dataWriters.add(new HDF5LongDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath, tempFieldName, i));
+ dataWriters.add(new HDF5LongDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, tempFieldName, i));
break;
case FLOAT8:
tempFieldName = DOUBLE_COLUMN_PREFIX + i;
- dataWriters.add(new HDF5DoubleDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath, tempFieldName, i));
+ dataWriters.add(new HDF5DoubleDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, tempFieldName, i));
break;
case FLOAT4:
tempFieldName = FLOAT_COLUMN_PREFIX + i;
- dataWriters.add(new HDF5FloatDataWriter(hdf5Reader, writerSpec, readerConfig.defaultPath, tempFieldName, i));
+ dataWriters.add(new HDF5FloatDataWriter(hdfFile, writerSpec, readerConfig.defaultPath, tempFieldName, i));
break;
default:
throw new UnsupportedOperationException(currentDataType.name());
@@ -322,10 +327,15 @@
private void openFile(FileSchemaNegotiator negotiator) throws IOException {
InputStream in = null;
try {
+      /*
+       * As a possible future improvement, the jhdf reader can also read HDF5 files from a
+       * byte array or byte buffer. That approach would avoid creating a temporary file that
+       * must be deleted later, but it could cause memory issues with very large files.
+       */
+
in = negotiator.fileSystem().openPossiblyCompressedStream(split.getPath());
- IHDF5Factory factory = HDF5FactoryProvider.get();
- inFile = convertInputStreamToFile(in);
- hdf5Reader = factory.openForReading(inFile);
+ hdfFile = HdfFile.fromInputStream(in);
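+      // HdfFile.fromInputStream() materializes the stream as a temp file under the hood;
+      // close() later deletes that file via hdfFile.getFile().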
} catch (Exception e) {
if (in != null) {
in.close();
@@ -333,43 +343,12 @@
throw UserException
.dataReadError(e)
.message("Failed to open input file: %s", split.getPath())
+ .addContext(errorContext)
.build(logger);
}
reader = new BufferedReader(new InputStreamReader(in));
}
- /**
- * Converts the Drill InputStream into a File object for the HDF5 library. This function
- * exists due to a known limitation in the HDF5 library which cannot parse HDF5 directly from an input stream. A future
- * release of the library will support this.
- *
- * @param stream The input stream to be converted to a File
- * @return File The file which was converted from an InputStream
- */
- private File convertInputStreamToFile(InputStream stream) {
- File tmpDir = readerConfig.tempDirectory;
- String tempFileName = tmpDir.getPath() + "/~" + split.getPath().getName();
- File targetFile = new File(tempFileName);
-
- try {
- java.nio.file.Files.copy(stream, targetFile.toPath(), StandardCopyOption.REPLACE_EXISTING);
- } catch (Exception e) {
- if (targetFile.exists()) {
- if (!targetFile.delete()) {
- logger.warn("{} not deleted.", targetFile.getName());
- }
- }
- throw UserException
- .dataWriteError(e)
- .message("Failed to create temp HDF5 file: %s", split.getPath())
- .addContext(e.getMessage())
- .build(logger);
- }
-
- IOUtils.closeQuietly(stream);
- return targetFile;
- }
-
@Override
public boolean next() {
@@ -423,13 +402,12 @@
* @param rowWriter The input rowWriter object
*/
private void projectMetadataRow(RowSetLoader rowWriter) {
- String realFileName = inFile.getName().replace("~", "");
HDF5DrillMetadata metadataRow = metadataIterator.next();
rowWriter.start();
pathWriter.setString(metadataRow.getPath());
dataTypeWriter.setString(metadataRow.getDataType());
- fileNameWriter.setString(realFileName);
+ fileNameWriter.setString(fileName);
//Write attributes if present
if (metadataRow.getAttributes().size() > 0) {
@@ -437,14 +415,12 @@
}
if (metadataRow.getDataType().equalsIgnoreCase("DATASET")) {
-
- HDF5DataSetInformation dsInfo = hdf5Reader.object().getDataSetInformation(metadataRow.getPath());
-
+ Dataset dataset = hdfFile.getDatasetByPath(metadataRow.getPath());
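+      // In jhdf, getSizeInBytes() is the dataset's storage size while getSize() is its element count.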
// Project Dataset Metadata
- dataSizeWriter.setLong(dsInfo.getSize());
- elementCountWriter.setLong(dsInfo.getNumberOfElements());
- datasetTypeWriter.setString(dsInfo.getTypeInformation().getDataClass().name());
- dimensionsWriter.setString(Arrays.toString(dsInfo.getDimensions()));
+ dataSizeWriter.setLong(dataset.getSizeInBytes());
+ elementCountWriter.setLong(dataset.getSize());
+ datasetTypeWriter.setString(dataset.getJavaType().getName());
+ dimensionsWriter.setString(Arrays.toString(dataset.getDimensions()));
projectDataset(rowWriter, metadataRow.getPath());
}
@@ -455,35 +431,43 @@
* Gets the file metadata from a given HDF5 file. It extracts the file
* name and path, and adds any other information to the metadata list.
*
- * @param members
- * A list of paths from which the metadata will be extracted
- * @param metadata
- * the HDF5 metadata object from which the metadata will be extracted
+   * @param group The HDF5 group from which the metadata will be extracted
+   * @param metadata The list to which the extracted metadata will be added
* @return A list of metadata from the given file paths
*/
- private List<HDF5DrillMetadata> getFileMetadata(List<HDF5LinkInformation> members, List<HDF5DrillMetadata> metadata) {
- for (HDF5LinkInformation info : members) {
+ private List<HDF5DrillMetadata> getFileMetadata(Group group, List<HDF5DrillMetadata> metadata) {
+ Map<String, HDF5Attribute> attribs;
+
+    // The jhdf library seems to be unable to read certain nodes. This try/catch block verifies
+ // that the group can be read.
+ try {
+ group.getChildren();
+ } catch (HdfException e) {
+ logger.warn(e.getMessage());
+ return metadata;
+ }
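+    // HdfFile and Group are Iterable<Node> in jhdf, so the children can be iterated directly.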
+ for (Node node : group) {
HDF5DrillMetadata metadataRow = new HDF5DrillMetadata();
+ metadataRow.setPath(node.getPath());
+ metadataRow.setDataType(node.getType().name());
- metadataRow.setPath(info.getPath());
- metadataRow.setDataType(info.getType().toString());
-
- switch (info.getType()) {
+ switch (node.getType()) {
case DATASET:
- metadataRow.setAttributes(getAttributes(info.getPath()));
- metadata.add(metadataRow);
- break;
- case SOFT_LINK:
- // Soft links cannot have attributes
+ attribs = getAttributes(node.getPath());
+ metadataRow.setAttributes(attribs);
metadata.add(metadataRow);
break;
case GROUP:
- metadataRow.setAttributes(getAttributes(info.getPath()));
+ attribs = getAttributes(node.getPath());
+ metadataRow.setAttributes(attribs);
metadata.add(metadataRow);
- metadata = getFileMetadata(hdf5Reader.object().getGroupMemberInformation(info.getPath(), true), metadata);
+ if (!node.isLink()) {
+ // Links don't have metadata
+ getFileMetadata((Group) node, metadata);
+ }
break;
default:
- logger.warn("Unknown data type: {}", info.getType());
+ logger.warn("Unknown data type: {}", node.getType());
}
}
return metadata;
@@ -496,21 +480,34 @@
* @return Map The attributes for the given path. Empty Map if no attributes present
*/
private Map<String, HDF5Attribute> getAttributes(String path) {
- Map<String, HDF5Attribute> attributes = new HashMap<>();
- long attrCount = hdf5Reader.object().getObjectInformation(path).getNumberOfAttributes();
- if (attrCount > 0) {
- List<String> attrNames = hdf5Reader.object().getAllAttributeNames(path);
- for (String name : attrNames) {
- try {
- HDF5Attribute attribute = HDF5Utils.getAttribute(path, name, hdf5Reader);
- if (attribute == null) {
- continue;
- }
- attributes.put(attribute.getKey(), attribute);
- } catch (Exception e) {
- logger.warn("Couldn't add attribute: {} - {}", path, name);
- }
+ // Remove trailing slashes
+ if (path.endsWith("/")) {
+ path = path.substring(0, path.length() - 1);
+ }
+
+ logger.debug("Getting attributes for {}", path);
+ Map<String, HDF5Attribute> attributes = new HashMap<>();
+ Node theNode;
+ try {
+ theNode = hdfFile.getByPath(path);
+ } catch (Exception e) {
+ // Couldn't find node
+ logger.debug("Couldn't get attributes for path: {}", path);
+ logger.debug("Error: {}", e.getMessage());
+ return attributes;
+ }
+
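+    // jhdf exposes a node's attributes as a map of attribute name to Attribute.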
+ Map<String, Attribute> attributeList = theNode.getAttributes();
+
+ logger.debug("Found {} attribtutes for {}", attributeList.size(), path);
+ for (Map.Entry<String, Attribute> attributeEntry : attributeList.entrySet()) {
+ HDF5Attribute attribute = HDF5Utils.getAttribute(path, attributeEntry.getKey(), hdfFile);
+
+ // Ignore compound attributes.
+ if (attribute != null && attributeEntry.getValue().isScalar()) {
+ logger.debug("Adding {} to attribute list for {}", attribute.getKey(), path);
+ attributes.put(attribute.getKey(), attribute);
}
}
return attributes;
@@ -528,124 +525,129 @@
* @param datapath
* The datapath from which the data will be read
*/
private void projectDataset(RowSetLoader rowWriter, String datapath) {
String fieldName = HDF5Utils.getNameFromPath(datapath);
- IHDF5Reader reader = hdf5Reader;
- HDF5DataSetInformation dsInfo = reader.object().getDataSetInformation(datapath);
+    Dataset dataset = hdfFile.getDatasetByPath(datapath);
// If the dataset is larger than 16MB, truncate it in the metadata view instead of projecting it in full
- if (dsInfo.getSize() > MAX_DATASET_SIZE) {
+ if (dataset.getSizeInBytes() > MAX_DATASET_SIZE) {
logger.warn("Dataset {} is greater than 16MB. Data will be truncated in Metadata view.", datapath);
}
- long[] dimensions = dsInfo.getDimensions();
+ int[] dimensions = dataset.getDimensions();
//Case for single dimensional data
- if (dimensions.length <= 1) {
- TypeProtos.MinorType currentDataType = HDF5Utils.getDataType(dsInfo);
+ if (dimensions.length == 1) {
+ MinorType currentDataType = HDF5Utils.getDataType(dataset.getDataType());
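+      // jhdf's Dataset.getData() returns a Java array whose component type matches
+      // getJavaType(), so each case below casts directly to the expected array type.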
+ Object data;
+ try {
+ data = dataset.getData();
+ } catch (Exception e) {
+ logger.debug("Error reading {}", datapath);
+ return;
+ }
assert currentDataType != null;
switch (currentDataType) {
case GENERIC_OBJECT:
logger.warn("Couldn't read {}", datapath );
break;
case VARCHAR:
- String[] data = hdf5Reader.readStringArray(datapath);
- writeStringListColumn(rowWriter, fieldName, data);
+ String[] stringData = (String[])data;
+ writeStringListColumn(rowWriter, fieldName, stringData);
break;
case TIMESTAMP:
- long[] longList = hdf5Reader.time().readTimeStampArray(datapath);
+ long[] longList = (long[])data;
writeTimestampListColumn(rowWriter, fieldName, longList);
break;
case INT:
- if (!dsInfo.getTypeInformation().isSigned()) {
- if (dsInfo.getTypeInformation().getElementSize() > 4) {
- longList = hdf5Reader.uint64().readArray(datapath);
- writeLongListColumn(rowWriter, fieldName, longList);
- }
- } else {
- int[] intList = hdf5Reader.readIntArray(datapath);
- writeIntListColumn(rowWriter, fieldName, intList);
- }
+ int[] intList = (int[])data;
+ writeIntListColumn(rowWriter, fieldName, intList);
+ break;
+ case SMALLINT:
+ short[] shortList = (short[])data;
+ writeSmallIntColumn(rowWriter, fieldName, shortList);
+ break;
+ case TINYINT:
+ byte[] byteList = (byte[])data;
+ writeByteColumn(rowWriter, fieldName, byteList);
break;
case FLOAT4:
- float[] tempFloatList = hdf5Reader.readFloatArray(datapath);
+ float[] tempFloatList = (float[])data;
writeFloat4ListColumn(rowWriter, fieldName, tempFloatList);
break;
case FLOAT8:
- double[] tempDoubleList = hdf5Reader.readDoubleArray(datapath);
+ double[] tempDoubleList = (double[])data;
writeFloat8ListColumn(rowWriter, fieldName, tempDoubleList);
break;
case BIGINT:
- if (!dsInfo.getTypeInformation().isSigned()) {
- logger.warn("Drill does not support unsigned 64bit integers.");
- break;
- }
- long[] tempBigIntList = hdf5Reader.readLongArray(datapath);
+ long[] tempBigIntList = (long[])data;
writeLongListColumn(rowWriter, fieldName, tempBigIntList);
break;
case MAP:
try {
- getAndMapCompoundData(datapath, new ArrayList<>(), hdf5Reader, rowWriter);
+ getAndMapCompoundData(datapath, hdfFile, rowWriter);
} catch (Exception e) {
throw UserException
.dataReadError()
- .message("Error writing Compound Field: ")
- .addContext(e.getMessage())
+ .message("Error writing Compound Field: {}", e.getMessage())
+ .addContext(errorContext)
.build(logger);
}
break;
-
default:
// Case for data types that cannot be read
- logger.warn("{} not implemented yet.", dsInfo.getTypeInformation().tryGetJavaType());
+ logger.warn("{} not implemented.", currentDataType.name());
}
} else if (dimensions.length == 2) {
// Case for 2D data sets. These are projected as lists of lists or maps of maps
- long cols = dimensions[1];
- long rows = dimensions[0];
- switch (HDF5Utils.getDataType(dsInfo)) {
+ int cols = dimensions[1];
+ int rows = dimensions[0];
+
+ // TODO Add Boolean, Small and TinyInt data types
+ switch (HDF5Utils.getDataType(dataset.getDataType())) {
case INT:
- int[][] colData = hdf5Reader.readIntMatrix(datapath);
- mapIntMatrixField(colData, (int) cols, (int) rows, rowWriter);
+ int[][] colData = (int[][])dataset.getData();
+ mapIntMatrixField(colData, cols, rows, rowWriter);
break;
case FLOAT4:
- float[][] floatData = hdf5Reader.readFloatMatrix(datapath);
- mapFloatMatrixField(floatData, (int) cols, (int) rows, rowWriter);
+ float[][] floatData = (float[][])dataset.getData();
+ mapFloatMatrixField(floatData, cols, rows, rowWriter);
break;
case FLOAT8:
- double[][] doubleData = hdf5Reader.readDoubleMatrix(datapath);
- mapDoubleMatrixField(doubleData, (int) cols, (int) rows, rowWriter);
+ double[][] doubleData = (double[][])dataset.getData();
+ mapDoubleMatrixField(doubleData, cols, rows, rowWriter);
break;
case BIGINT:
- long[][] longData = hdf5Reader.readLongMatrix(datapath);
- mapBigIntMatrixField(longData, (int) cols, (int) rows, rowWriter);
+ long[][] longData = (long[][])dataset.getData();
+ mapBigIntMatrixField(longData, cols, rows, rowWriter);
break;
default:
- logger.warn("{} not implemented.", HDF5Utils.getDataType(dsInfo));
+ logger.warn("{} not implemented.", HDF5Utils.getDataType(dataset.getDataType()));
}
- } else {
+    } else if (dimensions.length > 2) {
// Case for data sets with dimensions > 2
- long cols = dimensions[1];
- long rows = dimensions[0];
- switch (HDF5Utils.getDataType(dsInfo)) {
+ int cols = dimensions[1];
+ int rows = dimensions[0];
+ switch (HDF5Utils.getDataType(dataset.getDataType())) {
case INT:
- int[][] colData = hdf5Reader.int32().readMDArray(datapath).toMatrix();
- mapIntMatrixField(colData, (int) cols, (int) rows, rowWriter);
+ int[][] intMatrix = HDF5Utils.toIntMatrix((Object[]) dataset.getData());
+ mapIntMatrixField(intMatrix, cols, rows, rowWriter);
break;
case FLOAT4:
- float[][] floatData = hdf5Reader.float32().readMDArray(datapath).toMatrix();
- mapFloatMatrixField(floatData, (int) cols, (int) rows, rowWriter);
+ float[][] floatData = HDF5Utils.toFloatMatrix((Object[]) dataset.getData());
+ mapFloatMatrixField(floatData, cols, rows, rowWriter);
break;
case FLOAT8:
- double[][] doubleData = hdf5Reader.float64().readMDArray(datapath).toMatrix();
- mapDoubleMatrixField(doubleData, (int) cols, (int) rows, rowWriter);
+ double[][] doubleData = HDF5Utils.toDoubleMatrix((Object[]) dataset.getData());
+ mapDoubleMatrixField(doubleData, cols, rows, rowWriter);
break;
case BIGINT:
- long[][] longData = hdf5Reader.int64().readMDArray(datapath).toMatrix();
- mapBigIntMatrixField(longData, (int) cols, (int) rows, rowWriter);
+ long[][] longData = HDF5Utils.toLongMatrix((Object[]) dataset.getData());
+ mapBigIntMatrixField(longData, cols, rows, rowWriter);
break;
default:
- logger.warn("{} not implemented.", HDF5Utils.getDataType(dsInfo));
+ logger.warn("{} not implemented.", HDF5Utils.getDataType(dataset.getDataType()));
}
}
}
@@ -674,6 +676,70 @@
}
/**
+ * Helper function to write a 1D short column
+ *
+ * @param rowWriter The row to which the data will be written
+ * @param name The column name
+ * @param value The value to be written
+ */
+ private void writeSmallIntColumn(TupleWriter rowWriter, String name, short value) {
+ ScalarWriter colWriter = getColWriter(rowWriter, name, MinorType.SMALLINT);
+ colWriter.setInt(value);
+ }
+
+ /**
+   * Helper function to write a short array as a repeated SMALLINT column
+   * @param rowWriter the row to which the data will be written
+   * @param name the name of the column
+   * @param list the array of data
+ */
+ private void writeSmallIntColumn(TupleWriter rowWriter, String name, short[] list) {
+ int index = rowWriter.tupleSchema().index(name);
+ if (index == -1) {
+ ColumnMetadata colSchema = MetadataUtils.newScalar(name, MinorType.SMALLINT, TypeProtos.DataMode.REPEATED);
+ index = rowWriter.addColumn(colSchema);
+ }
+
+ ScalarWriter arrayWriter = rowWriter.column(index).array().scalar();
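+    // Cap the number of elements written so metadata previews stay bounded.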
+ int maxElements = Math.min(list.length, PREVIEW_ROW_LIMIT);
+ for (int i = 0; i < maxElements; i++) {
+ arrayWriter.setInt(list[i]);
+ }
+ }
+
+ /**
+ * Helper function to write a 1D byte column
+ *
+ * @param rowWriter The row to which the data will be written
+ * @param name The column name
+ * @param value The value to be written
+ */
+ private void writeByteColumn(TupleWriter rowWriter, String name, byte value) {
+ ScalarWriter colWriter = getColWriter(rowWriter, name, MinorType.TINYINT);
+ colWriter.setInt(value);
+ }
+
+ /**
+   * Helper function to write a byte array as a repeated TINYINT column
+   * @param rowWriter the row to which the data will be written
+   * @param name the name of the column
+   * @param list the array of data
+ */
+ private void writeByteColumn(TupleWriter rowWriter, String name, byte[] list) {
+ int index = rowWriter.tupleSchema().index(name);
+ if (index == -1) {
+ ColumnMetadata colSchema = MetadataUtils.newScalar(name, MinorType.TINYINT, TypeProtos.DataMode.REPEATED);
+ index = rowWriter.addColumn(colSchema);
+ }
+
+ ScalarWriter arrayWriter = rowWriter.column(index).array().scalar();
+ int maxElements = Math.min(list.length, PREVIEW_ROW_LIMIT);
+ for (int i = 0; i < maxElements; i++) {
+ arrayWriter.setInt(list[i]);
+ }
+ }
+
+ /**
* Helper function to write a 1D int column
*
* @param rowWriter The row to which the data will be written
@@ -1024,18 +1090,19 @@
HDF5Attribute attrib = entry.getValue();
switch (attrib.getDataType()) {
case BIT:
- boolean value = (Boolean) attrib.getValue();
- writeBooleanColumn(mapWriter, key, value);
+ writeBooleanColumn(mapWriter, key, (Boolean) attrib.getValue());
break;
case BIGINT:
writeLongColumn(mapWriter, key, (Long) attrib.getValue());
break;
case INT:
- //try {
- writeIntColumn(mapWriter, key, (Integer) attrib.getValue());
- //} catch (Exception e) {
- // logger.warn("{} {}", key, attrib);
- //}
+ writeIntColumn(mapWriter, key, (Integer) attrib.getValue());
+ break;
+ case SMALLINT:
+ writeSmallIntColumn(mapWriter, key, (Short) attrib.getValue());
+ break;
+ case TINYINT:
+ writeByteColumn(mapWriter, key, (Byte) attrib.getValue());
break;
case FLOAT8:
writeFloat8Column(mapWriter, key, (Double) attrib.getValue());
@@ -1048,6 +1115,7 @@
break;
case TIMESTAMP:
writeTimestampColumn(mapWriter, key, (Long) attrib.getValue());
+ break;
case GENERIC_OBJECT:
//This is the case for HDF5 enums
String enumText = attrib.getValue().toString();
@@ -1064,130 +1132,102 @@
* It automatically flattens anything greater than 2 dimensions.
*
* @param path the HDF5 path to the compound data
- * @param fieldNames The field names to be mapped
* @param reader the HDF5 reader for the data file
* @param rowWriter the rowWriter to write the data
*/
- private void getAndMapCompoundData(String path, List<String> fieldNames, IHDF5Reader reader, RowSetLoader rowWriter) {
+ private void getAndMapCompoundData(String path, HdfFile reader, RowSetLoader rowWriter) {
final String COMPOUND_DATA_FIELD_NAME = "compound_data";
- String resolvedPath = HDF5Utils.resolvePath(path, reader);
- Class<?> dataClass = HDF5Utils.getDatasetClass(resolvedPath, reader);
- if (dataClass == Map.class) {
- Object[][] values = reader.compound().readArray(resolvedPath, Object[].class);
- String currentFieldName;
+ List<CompoundDataMember> data = ((CompoundDataType) reader.getDatasetByPath(path).getDataType()).getMembers();
+ int index;
- if (fieldNames != null) {
- HDF5CompoundMemberInformation[] infos = reader.compound().getDataSetInfo(resolvedPath);
- for (HDF5CompoundMemberInformation info : infos) {
- fieldNames.add(info.getName());
- }
+ // Add map to schema
+ SchemaBuilder innerSchema = new SchemaBuilder();
+ MapBuilder mapBuilder = innerSchema.addMap(COMPOUND_DATA_FIELD_NAME);
+
+ // Loop to build schema
+ for (CompoundDataMember dataMember : data) {
+ String dataType = dataMember.getDataType().getJavaType().getName();
+ String fieldName = dataMember.getName();
+
+ switch (dataType) {
+ case "byte":
+ mapBuilder.add(fieldName, MinorType.TINYINT, DataMode.REPEATED);
+ break;
+ case "short":
+ mapBuilder.add(fieldName, MinorType.SMALLINT, DataMode.REPEATED);
+ break;
+ case "int":
+ mapBuilder.add(fieldName, MinorType.INT, DataMode.REPEATED);
+ break;
+ case "double":
+ mapBuilder.add(fieldName, MinorType.FLOAT8, DataMode.REPEATED);
+ break;
+ case "float":
+ mapBuilder.add(fieldName, MinorType.FLOAT4, DataMode.REPEATED);
+ break;
+ case "long":
+ mapBuilder.add(fieldName, MinorType.BIGINT, DataMode.REPEATED);
+ break;
+ case "boolean":
+ mapBuilder.add(fieldName, MinorType.BIT, DataMode.REPEATED);
+ break;
+ case "java.lang.String":
+ mapBuilder.add(fieldName, MinorType.VARCHAR, DataMode.REPEATED);
+ break;
+ default:
+ logger.warn("Drill cannot process data type {} in compound fields.", dataType);
+ break;
}
+ }
+ TupleMetadata finalInnerSchema = mapBuilder.resumeSchema().buildSchema();
+ index = rowWriter.tupleSchema().index(COMPOUND_DATA_FIELD_NAME);
+ if (index == -1) {
+ index = rowWriter.addColumn(finalInnerSchema.column(COMPOUND_DATA_FIELD_NAME));
+ }
+ TupleWriter listWriter = rowWriter.column(index).tuple();
- // Case for auto-flatten
- if (readerConfig.defaultPath != null) {
- for (int row = 0; row < values.length; row++) {
- rowWriter.start();
- for (int col = 0; col < values[row].length; col++) {
+ for (CompoundDataMember dataMember : data) {
+ String dataType = dataMember.getDataType().getJavaType().getName();
+ String fieldName = dataMember.getName();
+ int[] dataLength = reader.getDatasetByPath(path).getDimensions();
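+      // For compound datasets, jhdf's getData() returns a LinkedHashMap keyed by member
+      // name, each value holding that member's values as a primitive or String array.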
+ Object rawData = ((LinkedHashMap<String, ?>)reader.getDatasetByPath(path).getData()).get(fieldName);
- assert fieldNames != null;
- currentFieldName = fieldNames.get(col);
-
- if (values[row][col] instanceof Integer) {
- writeIntColumn(rowWriter, currentFieldName, (Integer) values[row][col]);
- } else if (values[row][col] instanceof Short) {
- writeIntColumn(rowWriter, currentFieldName, ((Short) values[row][col]).intValue());
- } else if (values[row][col] instanceof Byte) {
- writeIntColumn(rowWriter, currentFieldName, ((Byte) values[row][col]).intValue());
- } else if (values[row][col] instanceof Long) {
- writeLongColumn(rowWriter, currentFieldName, (Long) values[row][col]);
- } else if (values[row][col] instanceof Float) {
- writeFloat4Column(rowWriter, currentFieldName, (Float) values[row][col]);
- } else if (values[row][col] instanceof Double) {
- writeFloat8Column(rowWriter, currentFieldName, (Double) values[row][col]);
- } else if (values[row][col] instanceof BitSet || values[row][col] instanceof Boolean) {
- assert values[row][col] instanceof Integer;
- writeBooleanColumn(rowWriter, currentFieldName, (Integer) values[row][col]);
- } else if (values[row][col] instanceof String) {
- String stringValue = (String) values[row][col];
- writeStringColumn(rowWriter, currentFieldName, stringValue);
+ ArrayWriter innerWriter = listWriter.array(fieldName);
+ for (int i = 0; i < dataLength[0]; i++) {
+ switch (dataType) {
+ case "byte":
+ innerWriter.scalar().setInt(((byte[])rawData)[i]);
+ break;
+ case "short":
+ innerWriter.scalar().setInt(((short[])rawData)[i]);
+ break;
+ case "int":
+ innerWriter.scalar().setInt(((int[])rawData)[i]);
+ break;
+ case "double":
+ innerWriter.scalar().setDouble(((double[])rawData)[i]);
+ break;
+ case "float":
+ innerWriter.scalar().setFloat(((float[])rawData)[i]);
+ break;
+ case "long":
+ innerWriter.scalar().setLong(((long[])rawData)[i]);
+ break;
+ case "boolean":
+ innerWriter.scalar().setBoolean(((boolean[])rawData)[i]);
+ break;
+ case "java.lang.String":
+ if ((((String[])rawData)[i]) != null) {
+ innerWriter.scalar().setString(((String[]) rawData)[i]);
+ } else {
+ innerWriter.scalar().setNull();
}
- }
- rowWriter.save();
- }
- } else {
- int index = 0;
- if (fieldNames != null) {
- HDF5CompoundMemberInformation[] infos = reader.compound().getDataSetInfo(resolvedPath);
-
- SchemaBuilder innerSchema = new SchemaBuilder();
- MapBuilder mapBuilder = innerSchema.addMap(COMPOUND_DATA_FIELD_NAME);
- for (HDF5CompoundMemberInformation info : infos) {
- fieldNames.add(info.getName());
- String compoundColumnDataType = info.getType().tryGetJavaType().getSimpleName();
-
- switch (compoundColumnDataType) {
- case "int":
- mapBuilder.add(info.getName(), TypeProtos.MinorType.INT, TypeProtos.DataMode.REPEATED);
- break;
- case "double":
- mapBuilder.add(info.getName(), TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.REPEATED);
- break;
- case "float":
- mapBuilder.add(info.getName(), TypeProtos.MinorType.FLOAT4, TypeProtos.DataMode.REPEATED);
- break;
- case "long":
- mapBuilder.add(info.getName(), TypeProtos.MinorType.BIGINT, TypeProtos.DataMode.REPEATED);
- break;
- case "String":
- mapBuilder.add(info.getName(), TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.REPEATED);
- break;
- }
- }
- TupleMetadata finalInnerSchema = mapBuilder.resumeSchema().buildSchema();
-
- index = rowWriter.tupleSchema().index(COMPOUND_DATA_FIELD_NAME);
- if (index == -1) {
- index = rowWriter.addColumn(finalInnerSchema.column(COMPOUND_DATA_FIELD_NAME));
- }
- }
- TupleWriter listWriter = rowWriter.column(index).tuple();
-
- //Note: The [][] returns an array of [rows][cols]
- for (int row = 0; row < values.length; row++) {
- // Iterate over rows
- for (int col = 0; col < values[row].length; col++) {
- assert fieldNames != null;
- currentFieldName = fieldNames.get(col);
- try {
- ArrayWriter innerWriter = listWriter.array(currentFieldName);
- if (values[row][col] instanceof Integer) {
- innerWriter.scalar().setInt((Integer) values[row][col]);
- } else if (values[row][col] instanceof Short) {
- innerWriter.scalar().setInt((Short) values[row][col]);
- } else if (values[row][col] instanceof Byte) {
- innerWriter.scalar().setInt((Byte) values[row][col]);
- } else if (values[row][col] instanceof Long) {
- innerWriter.scalar().setLong((Long) values[row][col]);
- } else if (values[row][col] instanceof Float) {
- innerWriter.scalar().setDouble((Float) values[row][col]);
- } else if (values[row][col] instanceof Double) {
- innerWriter.scalar().setDouble((Double) values[row][col]);
- } else if (values[row][col] instanceof BitSet || values[row][col] instanceof Boolean) {
- innerWriter.scalar().setBoolean((Boolean) values[row][col]);
- } else if (values[row][col] instanceof String) {
- innerWriter.scalar().setString((String) values[row][col]);
- } else {
- logger.warn("Skipping {}/{} due to unsupported data type.", resolvedPath, currentFieldName);
- }
- if (col == values[row].length) {
- innerWriter.save();
- }
- } catch (TupleWriter.UndefinedColumnException e) {
- logger.warn("Drill does not support maps and lists in HDF5 Compound fields. Skipping: {}/{}", resolvedPath, currentFieldName);
- }
- }
+ break;
+ default:
+ logger.warn("Drill cannot process data type {} in compound fields.", dataType);
+ break;
}
}
}
@@ -1195,23 +1235,18 @@
@Override
public void close() {
- if (hdf5Reader != null) {
- hdf5Reader.close();
- hdf5Reader = null;
+ AutoCloseables.closeSilently(hdfFile);
+  /*
+   * The current implementation of the HDF5 reader creates a temp file which needs to be removed
+   * when the batch reader is closed. A possible future improvement would be to read the file
+   * from a byte array or byte buffer instead, which would avoid the temp file entirely.
+   */
+ boolean result = hdfFile.getFile().delete();
+ if (!result) {
+ logger.warn("Failed to delete HDF5 temp file {}", hdfFile.getFile().getName());
}
- if (reader != null) {
- try {
- reader.close();
- } catch (IOException e) {
- logger.debug("Failed to close HDF5 Reader.");
- }
- reader = null;
- }
- if (inFile != null) {
- if (!inFile.delete()) {
- logger.warn("{} file not deleted.", inFile.getName());
- }
- inFile = null;
- }
+ hdfFile = null;
+
+ AutoCloseables.closeSilently(reader);
+ reader = null;
}
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5DrillMetadata.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5DrillMetadata.java
index 2ed43d3..557214b 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5DrillMetadata.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5DrillMetadata.java
@@ -37,6 +37,9 @@
}
public void setPath(String path) {
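+    // jhdf reports group paths with a trailing slash; strip it so paths match later lookups.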
+ if (path.endsWith("/")) {
+ path = path.substring(0, path.length() - 1);
+ }
this.path = path;
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5FormatPlugin.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5FormatPlugin.java
index 07e0763..e26ed67 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5FormatPlugin.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5FormatPlugin.java
@@ -18,11 +18,9 @@
package org.apache.drill.exec.store.hdf5;
-import org.apache.drill.common.config.DrillConfig;
import org.apache.drill.common.logical.StoragePluginConfig;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.common.types.Types;
-import org.apache.drill.exec.ExecConstants;
import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework;
import org.apache.drill.exec.physical.impl.scan.file.FileScanFramework.FileScanBuilder;
@@ -30,26 +28,22 @@
import org.apache.drill.exec.server.DrillbitContext;
import org.apache.drill.exec.server.options.OptionManager;
import org.apache.drill.exec.store.dfs.easy.EasySubScan;
-import org.apache.drill.shaded.guava.com.google.common.io.Files;
import org.apache.hadoop.conf.Configuration;
import org.apache.drill.exec.store.dfs.easy.EasyFormatPlugin;
import org.apache.drill.exec.store.hdf5.HDF5BatchReader.HDF5ReaderConfig;
-import java.io.File;
public class HDF5FormatPlugin extends EasyFormatPlugin<HDF5FormatConfig> {
public static final String DEFAULT_NAME = "hdf5";
- private final DrillbitContext context;
public HDF5FormatPlugin(String name, DrillbitContext context,
Configuration fsConf,
StoragePluginConfig storageConfig,
HDF5FormatConfig formatConfig) {
super(name, easyConfig(fsConf, formatConfig), context, storageConfig, formatConfig);
- this.context = context;
}
private static EasyFormatConfig easyConfig(Configuration fsConf, HDF5FormatConfig pluginConfig) {
@@ -92,27 +86,4 @@
return new HDF5BatchReader(readerConfig, maxRecords);
}
}
-
- /**
- * First tries to get drill temporary directory value from from config ${drill.tmp-dir},
- * then checks environmental variable $DRILL_TMP_DIR.
- * If value is still missing, generates directory using {@link Files#createTempDir()}.
- *
- * @return drill temporary directory path
- */
- protected File getTmpDir() {
- DrillConfig config = context.getConfig();
- String drillTempDir;
- if (config.hasPath(ExecConstants.DRILL_TMP_DIR)) {
- drillTempDir = config.getString(ExecConstants.DRILL_TMP_DIR);
- } else {
- drillTempDir = System.getenv("DRILL_TMP_DIR");
- }
-
- if (drillTempDir == null) {
- return Files.createTempDir();
- }
-
- return new File(drillTempDir);
- }
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Utils.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Utils.java
index 3401ce8..8d54692 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Utils.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/HDF5Utils.java
@@ -18,27 +18,25 @@
package org.apache.drill.exec.store.hdf5;
-import ch.systemsx.cisd.hdf5.HDF5DataClass;
-import ch.systemsx.cisd.hdf5.HDF5EnumerationValue;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-import ch.systemsx.cisd.hdf5.HDF5DataSetInformation;
-import ch.systemsx.cisd.hdf5.HDF5DataTypeInformation;
+import io.jhdf.HdfFile;
+import io.jhdf.api.Attribute;
+import io.jhdf.object.datatype.DataType;
import org.apache.commons.lang3.ArrayUtils;
import org.apache.drill.common.types.TypeProtos.MinorType;
+
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
+import java.util.Arrays;
import java.util.BitSet;
-import java.util.List;
import java.util.Map;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
+import java.util.stream.Stream;
public class HDF5Utils {
private static final Logger logger = LoggerFactory.getLogger(HDF5Utils.class);
- private static final boolean CASE_SENSITIVE = false;
-
/*
* This regex is used to extract the final part of an HDF5 path, which is the name of the data field or column.
* While these look like file paths, they are fully contained within HDF5. This regex would extract part3 from:
@@ -46,132 +44,93 @@
*/
private static final Pattern PATH_PATTERN = Pattern.compile("/*.*/(.+?)$");
+
/**
* This function returns an HDF5Attribute object for use when Drill maps the attributes.
*
* @param pathName The path to retrieve attributes from
* @param key The key for the specific attribute you are retrieving
- * @param reader The IHDF5 reader object for the file you are querying
+   * @param hdf5File The jhdf HdfFile object for the file you are querying
* @return HDF5Attribute The attribute from the path with the key that was requested.
*/
- public static HDF5Attribute getAttribute(String pathName, String key, IHDF5Reader reader) {
+ public static HDF5Attribute getAttribute(String pathName, String key, HdfFile hdf5File) {
if (pathName.equals("")) {
pathName = "/";
}
-
- if (!reader.exists(pathName)) {
+ if (hdf5File.getByPath(pathName) == null) {
return null;
}
if (key.equals("dimensions")) {
- HDF5DataSetInformation datasetInfo = reader.object().getDataSetInformation(pathName);
- long[] dimensions = datasetInfo.getDimensions();
+ int[] dimensions = hdf5File.getDatasetByPath(pathName).getDimensions();
ArrayUtils.reverse(dimensions);
return new HDF5Attribute(MinorType.LIST, "dimensions", dimensions);
}
if (key.equals("dataType")) {
- HDF5DataSetInformation datasetInfo = reader.object().getDataSetInformation(pathName);
- return new HDF5Attribute(getDataType(datasetInfo), "DataType", datasetInfo.getTypeInformation().getDataClass());
+ String typeName = hdf5File.getDatasetByPath(pathName).getDataType().getJavaType().getName();
+ return new HDF5Attribute(getDataType(hdf5File.getDatasetByPath(pathName).getDataType()), "DataType", typeName);
}
-
- if (!reader.object().hasAttribute(pathName, key)) {
+ if (hdf5File.getByPath(pathName).getAttribute(key) == null) {
return null;
}
- HDF5DataTypeInformation attributeInfo = reader.object().getAttributeInformation(pathName, key);
- Class<?> type = attributeInfo.tryGetJavaType();
+    Attribute attribute = hdf5File.getByPath(pathName).getAttribute(key);
+    Class<?> type = attribute.getJavaType();
+
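+    // Array-valued attributes are flagged as compound (the trailing 'true' argument) so
+    // the batch reader can skip them; only scalar attribute values are projected.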
if (type.isAssignableFrom(long[].class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.BIGINT, key, reader.int64().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.BIGINT, key, reader.uint64().getAttr(pathName, key));
- }
+ return new HDF5Attribute(MinorType.BIGINT, key, attribute.getData(), true);
} else if (type.isAssignableFrom(int[].class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.INT, key, reader.int32().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.INT, key, reader.uint32().getAttr(pathName, key));
- }
+ return new HDF5Attribute(MinorType.INT, key, attribute.getData(), true);
} else if (type.isAssignableFrom(short[].class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.INT, key, reader.int16().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.INT, key, reader.uint16().getAttr(pathName, key));
- }
+ return new HDF5Attribute(MinorType.INT, key, attribute.getData(), true);
} else if (type.isAssignableFrom(byte[].class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.INT, key, reader.int8().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.INT, key, reader.uint8().getAttr(pathName, key));
- }
+ return new HDF5Attribute(MinorType.INT, key, attribute.getData(), true);
} else if (type.isAssignableFrom(double[].class)) {
- return new HDF5Attribute(MinorType.FLOAT8, key, reader.float64().getAttr(pathName, key));
+ return new HDF5Attribute(MinorType.FLOAT8, key, attribute.getData(), true);
} else if (type.isAssignableFrom(float[].class)) {
- return new HDF5Attribute(MinorType.FLOAT8, key, reader.float32().getAttr(pathName, key));
+ return new HDF5Attribute(MinorType.FLOAT8, key, attribute.getData(), true);
} else if (type.isAssignableFrom(String[].class)) {
- return new HDF5Attribute(MinorType.VARCHAR, key, reader.string().getAttr(pathName, key));
- } else if (type.isAssignableFrom(long.class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.BIGINT, key, reader.int64().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.BIGINT, key, reader.uint64().getAttr(pathName, key));
- }
- } else if (type.isAssignableFrom(int.class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.INT, key, reader.int32().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.INT, key, reader.uint32().getAttr(pathName, key));
- }
- } else if (type.isAssignableFrom(short.class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.INT, key, reader.int16().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.INT, key, reader.uint16().getAttr(pathName, key));
- }
- } else if (type.isAssignableFrom(byte.class)) {
- if (attributeInfo.isSigned()) {
- return new HDF5Attribute(MinorType.INT, key, reader.int8().getAttr(pathName, key));
- } else {
- return new HDF5Attribute(MinorType.INT, key, reader.uint8().getAttr(pathName, key));
- }
- } else if (type.isAssignableFrom(double.class)) {
- return new HDF5Attribute(MinorType.FLOAT8, key, reader.float64().getAttr(pathName, key));
+ return new HDF5Attribute(MinorType.VARCHAR, key, attribute.getData(), true);
+ } else if (type.isAssignableFrom(java.lang.Long.class)) {
+ return new HDF5Attribute(MinorType.BIGINT, key, attribute.getData());
+ } else if (type.isAssignableFrom(java.lang.Integer.class)) {
+ return new HDF5Attribute(MinorType.INT, key, attribute.getData());
+ } else if (type.isAssignableFrom(java.lang.Short.class)) {
+ return new HDF5Attribute(MinorType.INT, key, attribute.getData());
+ } else if (type.isAssignableFrom(java.lang.Byte.class)) {
+ return new HDF5Attribute(MinorType.INT, key, attribute.getData());
+ } else if (type.isAssignableFrom(java.lang.Double.class)) {
+ return new HDF5Attribute(MinorType.FLOAT8, key, attribute.getData());
} else if (type.isAssignableFrom(float.class)) {
- return new HDF5Attribute(MinorType.FLOAT4, key, reader.float32().getAttr(pathName, key));
+ return new HDF5Attribute(MinorType.FLOAT4, key, attribute.getData());
} else if (type.isAssignableFrom(String.class)) {
- return new HDF5Attribute(MinorType.VARCHAR, key, reader.string().getAttr(pathName, key));
+ return new HDF5Attribute(MinorType.VARCHAR, key, attribute.getData());
} else if (type.isAssignableFrom(boolean.class)) {
- return new HDF5Attribute(MinorType.BIT, key, reader.bool().getAttr(pathName, key));
- } else if (type.isAssignableFrom(HDF5EnumerationValue.class)) {
+ return new HDF5Attribute(MinorType.BIT, key, attribute.getData());
+    } /* else if (type.isAssignableFrom(HDF5EnumerationValue.class)) {
// Convert HDF5 Enum to String
- return new HDF5Attribute(MinorType.GENERIC_OBJECT, key, reader.enumeration().getAttr(pathName, key));
- } else if (type.isAssignableFrom(BitSet.class)) {
- return new HDF5Attribute(MinorType.BIT, key, reader.bool().getAttr(pathName, key));
+ return new HDF5Attribute(MinorType.GENERIC_OBJECT, key, attribute.getData());
+    } */ else if (type.isAssignableFrom(BitSet.class)) {
+ return new HDF5Attribute(MinorType.BIT, key, attribute.getData());
}
- logger.warn("Reading attributes of type {} not yet implemented.", attributeInfo);
+ logger.warn("Reading attributes of type {} not yet implemented.", attribute.getJavaType());
return null;
}
/**
* This function returns the Drill data type of a given HDF5 dataset.
- * @param datasetInfo The input data set.
+   * @param dataType The input data type.
* @return MinorType The Drill data type of the dataset in question
*/
- public static MinorType getDataType(HDF5DataSetInformation datasetInfo) {
+ public static MinorType getDataType(DataType dataType) {
- HDF5DataTypeInformation typeInfo = datasetInfo.getTypeInformation();
- Class<?> type = typeInfo.tryGetJavaType();
- String name = typeInfo.getDataClass().name();
+ Class<?> type = dataType.getJavaType();
if (type == null) {
- logger.warn("Datasets of type {} not implemented.", typeInfo);
+ logger.warn("Datasets of type {} not implemented.", dataType.getDataClass());
//Fall back to string
- if(name.equalsIgnoreCase("OTHER")) {
- return MinorType.GENERIC_OBJECT;
- } else {
- return MinorType.VARCHAR;
- }
+ return MinorType.VARCHAR;
} else if (type.isAssignableFrom(long.class)) {
return MinorType.BIGINT;
} else if (type.isAssignableFrom(short.class)) {
@@ -180,11 +139,13 @@
return MinorType.TINYINT;
} else if (type.isAssignableFrom(int.class)) {
return MinorType.INT;
- } else if (type.isAssignableFrom(double.class) || type.isAssignableFrom(float.class)) {
+ } else if (type.isAssignableFrom(float.class)) {
+ return MinorType.FLOAT4;
+ } else if (type.isAssignableFrom(double.class)) {
return MinorType.FLOAT8;
} else if (type.isAssignableFrom(String.class)) {
return MinorType.VARCHAR;
- } else if (type.isAssignableFrom(java.util.Date.class)) {
+ } else if (type.isAssignableFrom(java.util.Date.class) || type.isAssignableFrom(java.lang.Long.class)) {
return MinorType.TIMESTAMP;
} else if (type.isAssignableFrom(boolean.class) || type.isAssignableFrom(BitSet.class)) {
return MinorType.BIT;
@@ -202,49 +163,8 @@
* @param reader The HDF5 reader
* @return The data type
*/
- public static Class<?> getDatasetClass(String path, IHDF5Reader reader) {
- HDF5DataSetInformation info = reader.getDataSetInformation(resolvePath(path, reader));
- return info.getTypeInformation().tryGetJavaType();
- }
-
- /**
- * This function resolves path references
- * @param path The path for the possible reference
- * @param reader The HDF5 reader object for the file
- * @return the string for the relative path
- */
- public static String resolvePath(String path, IHDF5Reader reader) {
- if (reader.exists(path)) {
- // Resolve references, if any.
- if (reader.object().isDataSet(path)) {
- HDF5DataClass dataClass = reader.getDataSetInformation(path).getTypeInformation().getDataClass();
- if (dataClass.toString().equals("REFERENCE")) {
- return reader.reference().read(path);
- }
- }
- return path;
- } else if (!CASE_SENSITIVE) {
- // Look for a match with a different case.
- String[] pathParts = path.split("/");
- String resolvedPath = "/";
- for (int i = 1; i < pathParts.length; i++) {
- String testPath = (resolvedPath.endsWith("/")) ? resolvedPath + pathParts[i] : resolvedPath + "/" + pathParts[i];
- if (reader.exists(testPath)) {
- resolvedPath = testPath;
- } else if (reader.isGroup(resolvedPath)) {
- List<String> children = reader.getGroupMembers(resolvedPath);
- for (String child : children) {
- if (child.equalsIgnoreCase(pathParts[i])) {
- resolvedPath = resolvedPath + "/" + child;
- }
- }
- }
- }
- if (path.equalsIgnoreCase(resolvedPath)) {
- return resolvedPath;
- }
- }
- return null;
+ public static Class<?> getDatasetClass(String path, HdfFile reader) {
+ return reader.getDatasetByPath(path).getJavaType();
}
/**
@@ -265,4 +185,126 @@
return "";
}
}
+
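+ /**
+ * Flattens a nested Object[] (as jhdf produces for n-dimensional data)
+ * into a flat Object[] whose elements are the innermost array rows.
+ */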
+ public static Object[] toMatrix(Object[] inputArray) {
+ return flatten(inputArray).toArray();
+ }
+
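+ // The toXMatrix helpers below all share one pattern: flatten the nested
+ // input, then copy the rows into a [cols][rows] array (the transpose of
+ // the flattened data) so that a single column can be pulled out by index.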
+ public static boolean[][] toBooleanMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((boolean[])input[0]).length;
+
+ boolean[][] result = new boolean[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ boolean[] row = (boolean[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
+ public static byte[][] toByteMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((byte[])input[0]).length;
+
+ byte[][] result = new byte[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ byte[] row = (byte[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
+ public static short[][] toShortMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((short[])input[0]).length;
+
+ short[][] result = new short[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ short[] row = (short[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
+ public static int[][] toIntMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((int[])input[0]).length;
+
+ int[][] result = new int[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ int[] row = (int[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
+ public static long[][] toLongMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((long[])input[0]).length;
+
+ long[][] result = new long[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ long[] row = (long[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
+ public static float[][] toFloatMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((float[])input[0]).length;
+
+ float[][] result = new float[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ float[] row = (float[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
+ public static double[][] toDoubleMatrix(Object[] inputArray) {
+ Object[] input = flatten(inputArray).toArray();
+ int rows = input.length;
+ int cols = ((double[])input[0]).length;
+
+ double[][] result = new double[cols][rows];
+
+ for (int i = 0; i < rows; i++) {
+ double[] row = (double[])input[i];
+ for (int j = 0; j < cols; j++) {
+ result[j][i] = row[j];
+ }
+ }
+ return result;
+ }
+
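+ /**
+ * Recursively flattens nested Object[] layers into a single Stream.
+ * Recursion stops at values that are not Object[], such as primitive arrays.
+ */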
+ public static Stream<Object> flatten(Object[] array) {
+ return Arrays.stream(array)
+ .flatMap(o -> o instanceof Object[] ? flatten((Object[]) o) : Stream.of(o));
+ }
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5ByteDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5ByteDataWriter.java
new file mode 100644
index 0000000..dabc6ab
--- /dev/null
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5ByteDataWriter.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.hdf5.writers;
+
+import io.jhdf.HdfFile;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.store.hdf5.HDF5Utils;
+import org.apache.drill.exec.vector.accessor.ValueWriter;
+
+public class HDF5ByteDataWriter extends HDF5DataWriter {
+
+ private final byte[] data;
+
+ private final ValueWriter colWriter;
+
+ // This constructor is used when the data is a 1D column. The column is inferred from the datapath
+ public HDF5ByteDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
+ super(reader, datapath);
+ data = (byte[]) reader.getDatasetByPath(datapath).getData();
+ fieldName = HDF5Utils.getNameFromPath(datapath);
+ colWriter = writerSpec.makeWriter(fieldName, MinorType.TINYINT, DataMode.OPTIONAL);
+ }
+
+ // This constructor is used when the data is part of a 2D array. In this case the column name is provided in the constructor
+ public HDF5ByteDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
+ super(reader, datapath, fieldName, currentColumn);
+ // Get dimensions
+ int[] dimensions = reader.getDatasetByPath(datapath).getDimensions();
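+ // jhdf returns 2D datasets as byte[][]; higher-rank datasets arrive as nested Object[] and are reduced to 2D via toByteMatrix.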
+ byte[][] tempData = new byte[0][];
+ if (dimensions.length == 2) {
+ tempData = transpose((byte[][]) reader.getDatasetByPath(datapath).getData());
+ } else {
+ tempData = transpose(HDF5Utils.toByteMatrix((Object[])reader.getDatasetByPath(datapath).getData()));
+ }
+ data = tempData[currentColumn];
+
+ colWriter = writerSpec.makeWriter(fieldName, MinorType.TINYINT, DataMode.OPTIONAL);
+ }
+
+ // This constructor is used for compound data types.
+ public HDF5ByteDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, byte[] tempListData) {
+ super(reader, null);
+ this.fieldName = fieldName;
+ data = tempListData;
+
+ colWriter = writerSpec.makeWriter(fieldName, MinorType.TINYINT, DataMode.OPTIONAL);
+ }
+
+ @Override
+ public boolean write() {
+ if (counter >= data.length) {
+ return false;
+ } else {
+ colWriter.setInt(data[counter++]);
+ return true;
+ }
+ }
+
+ @Override
+ public boolean hasNext() {
+ return counter < data.length;
+ }
+
+ /**
+ * Transposes the input matrix by flipping it over its diagonal, swapping the row and column
+ * indices to produce a new matrix.
+ * @param matrix The input matrix to be transposed
+ * @return The transposed matrix.
+ */
+ private byte[][] transpose(byte[][] matrix) {
+ if (matrix == null || matrix.length == 0) {
+ return matrix;
+ }
+ int width = matrix.length;
+ int height = matrix[0].length;
+
+ byte[][] transposedMatrix = new byte[height][width];
+
+ for (int x = 0; x < width; x++) {
+ for (int y = 0; y < height; y++) {
+ transposedMatrix[y][x] = matrix[x][y];
+ }
+ }
+ return transposedMatrix;
+ }
+
+ @Override
+ public int getDataSize() {
+ return data.length;
+ }
+}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DataWriter.java
index 79865e5..fc03bf9 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DataWriter.java
@@ -18,13 +18,11 @@
package org.apache.drill.exec.store.hdf5.writers;
-import java.util.ArrayList;
-import java.util.List;
-
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
+import java.util.LinkedHashMap;
+import io.jhdf.HdfFile;
public abstract class HDF5DataWriter {
- protected final IHDF5Reader reader;
+ protected final HdfFile reader;
protected final String datapath;
@@ -34,14 +32,14 @@
protected int counter;
- protected Object[][] compoundData;
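+ // With jhdf, compound datasets are read as a map from member name to an array holding that member's whole column.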
+ protected LinkedHashMap<String, ?> compoundData;
- public HDF5DataWriter(IHDF5Reader reader, String datapath) {
+ public HDF5DataWriter(HdfFile reader, String datapath) {
this.reader = reader;
this.datapath = datapath;
}
- public HDF5DataWriter(IHDF5Reader reader, String datapath, String fieldName, int colCount) {
+ public HDF5DataWriter(HdfFile reader, String datapath, String fieldName, int colCount) {
this(reader, datapath);
this.fieldName = fieldName;
this.colCount = colCount;
@@ -59,15 +57,6 @@
return counter;
}
- @SuppressWarnings("unchecked")
- public <T> List<T> getColumn(int columnIndex) {
- List<T> result = new ArrayList<>();
- for (Object[] compoundDatum : compoundData) {
- result.add((T) compoundDatum[columnIndex]);
- }
- return result;
- }
-
public abstract int getDataSize();
public boolean isCompound() {
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DoubleDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DoubleDataWriter.java
index 0143854..b7c1c90 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DoubleDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5DoubleDataWriter.java
@@ -18,14 +18,11 @@
package org.apache.drill.exec.store.hdf5.writers;
-import java.util.List;
-
+import io.jhdf.HdfFile;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.exec.store.hdf5.HDF5Utils;
import org.apache.drill.exec.vector.accessor.ValueWriter;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-
public class HDF5DoubleDataWriter extends HDF5DataWriter {
private final double[] data;
@@ -33,36 +30,34 @@
private final ValueWriter colWriter;
// This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5DoubleDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5DoubleDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- data = reader.readDoubleArray(datapath);
+ data = (double[]) reader.getDatasetByPath(datapath).getData();
fieldName = HDF5Utils.getNameFromPath(datapath);
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL);
}
// This constructor is used when the data is part of a 2D array. In this case the column name is provided in the constructor
- public HDF5DoubleDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
+ public HDF5DoubleDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
super(reader, datapath, fieldName, currentColumn);
// Get dimensions
- long[] dimensions = reader.object().getDataSetInformation(datapath).getDimensions();
- double[][] tempData;
+ int[] dimensions = reader.getDatasetByPath(datapath).getDimensions();
+ double[][] tempData = new double[0][];
if (dimensions.length == 2) {
- tempData = transpose(reader.readDoubleMatrix(datapath));
+ tempData = transpose((double[][]) reader.getDatasetByPath(datapath).getData());
} else {
- tempData = transpose(reader.float64().readMDArray(datapath).toMatrix());
+ tempData = transpose(HDF5Utils.toDoubleMatrix((Object[])reader.getDatasetByPath(datapath).getData()));
}
data = tempData[currentColumn];
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL);
}
- public HDF5DoubleDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String fieldName, List<Double> tempListData) {
+ public HDF5DoubleDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, double[] tempListData) {
super(reader, null);
this.fieldName = fieldName;
- data = new double[tempListData.size()];
- for (int i = 0; i < tempListData.size(); i++) {
- data[i] = tempListData.get(i);
- }
+ data = tempListData;
+
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL);
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5EnumDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5EnumDataWriter.java
deleted file mode 100644
index 136f454..0000000
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5EnumDataWriter.java
+++ /dev/null
@@ -1,59 +0,0 @@
-/*
- * Licensed to the Apache Software Foundation (ASF) under one
- * or more contributor license agreements. See the NOTICE file
- * distributed with this work for additional information
- * regarding copyright ownership. The ASF licenses this file
- * to you under the Apache License, Version 2.0 (the
- * "License"); you may not use this file except in compliance
- * with the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.drill.exec.store.hdf5.writers;
-
-import org.apache.drill.common.types.TypeProtos;
-import org.apache.drill.exec.store.hdf5.HDF5Utils;
-import org.apache.drill.exec.vector.accessor.ValueWriter;
-
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-
-public class HDF5EnumDataWriter extends HDF5DataWriter {
-
- private final String[] data;
-
- private final ValueWriter colWriter;
-
- // This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5EnumDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
- super(reader, datapath);
- data = reader.readEnumArrayAsString(datapath);
-
- fieldName = HDF5Utils.getNameFromPath(datapath);
- colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
- }
-
- @Override
- public boolean write() {
- if (counter > data.length) {
- return false;
- } else {
- colWriter.setString(data[counter++]);
- return true;
- }
- }
-
- @Override
- public int getDataSize() { return data.length; }
-
- @Override
- public boolean hasNext() {
- return counter < data.length;
- }
-}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5FloatDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5FloatDataWriter.java
index 5cd20e2..335c39d 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5FloatDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5FloatDataWriter.java
@@ -18,14 +18,11 @@
package org.apache.drill.exec.store.hdf5.writers;
-import java.util.List;
-
+import io.jhdf.HdfFile;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.exec.store.hdf5.HDF5Utils;
import org.apache.drill.exec.vector.accessor.ValueWriter;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-
public class HDF5FloatDataWriter extends HDF5DataWriter {
private final float[] data;
@@ -33,35 +30,33 @@
private final ValueWriter colWriter;
// This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5FloatDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5FloatDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- data = reader.readFloatArray(datapath);
+ data = (float[]) reader.getDatasetByPath(datapath).getData();
fieldName = HDF5Utils.getNameFromPath(datapath);
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL);
}
// This constructor is used when the data is part of a 2D array. In this case the column name is provided in the constructor
- public HDF5FloatDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
+ public HDF5FloatDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
super(reader, datapath, fieldName, currentColumn);
// Get dimensions
- long[] dimensions = reader.object().getDataSetInformation(datapath).getDimensions();
- float[][] tempData;
+ int[] dimensions = reader.getDatasetByPath(datapath).getDimensions();
+ float[][] tempData = new float[0][];
if (dimensions.length == 2) {
- tempData = transpose(reader.readFloatMatrix(datapath));
+ tempData = transpose((float[][]) reader.getDatasetByPath(datapath).getData());
} else {
- tempData = transpose(reader.float32().readMDArray(datapath).toMatrix());
+ tempData = transpose(HDF5Utils.toFloatMatrix((Object[])reader.getDatasetByPath(datapath).getData()));
}
data = tempData[currentColumn];
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL);
}
- public HDF5FloatDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String fieldName, List<Float> tempListData) {
+ public HDF5FloatDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, float[] tempListData) {
super(reader, null);
this.fieldName = fieldName;
- data = new float[tempListData.size()];
- for (int i = 0; i < tempListData.size(); i++) {
- data[i] = tempListData.get(i);
- }
+ data = tempListData;
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL);
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5IntDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5IntDataWriter.java
index e2cea35..ba49a3c 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5IntDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5IntDataWriter.java
@@ -18,14 +18,11 @@
package org.apache.drill.exec.store.hdf5.writers;
-import java.util.List;
-
+import io.jhdf.HdfFile;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.exec.store.hdf5.HDF5Utils;
import org.apache.drill.exec.vector.accessor.ValueWriter;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-
public class HDF5IntDataWriter extends HDF5DataWriter {
private final int[] data;
@@ -33,25 +30,23 @@
private final ValueWriter colWriter;
// This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5IntDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5IntDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- data = reader.readIntArray(datapath);
-
+ data = (int[]) reader.getDatasetByPath(datapath).getData();
fieldName = HDF5Utils.getNameFromPath(datapath);
-
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL);
}
// This constructor is used when the data is part of a 2D array. In this case the column name is provided in the constructor
- public HDF5IntDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
+ public HDF5IntDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
super(reader, datapath, fieldName, currentColumn);
// Get dimensions
- long[] dimensions = reader.object().getDataSetInformation(datapath).getDimensions();
- int[][] tempData;
+ int[] dimensions = reader.getDatasetByPath(datapath).getDimensions();
+ int[][] tempData = new int[0][];
if (dimensions.length == 2) {
- tempData = transpose(reader.readIntMatrix(datapath));
+ tempData = transpose((int[][]) reader.getDatasetByPath(datapath).getData());
} else {
- tempData = transpose(reader.int32().readMDArray(datapath).toMatrix());
+ tempData = transpose(HDF5Utils.toIntMatrix((Object[])reader.getDatasetByPath(datapath).getData()));
}
data = tempData[currentColumn];
@@ -59,13 +54,10 @@
}
// This constructor is used for compound data types.
- public HDF5IntDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String fieldName, List<Integer> tempListData) {
+ public HDF5IntDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, int[] tempListData) {
super(reader, null);
this.fieldName = fieldName;
- data = new int[tempListData.size()];
- for (int i = 0; i < tempListData.size(); i++) {
- data[i] = tempListData.get(i);
- }
+ data = tempListData;
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL);
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5LongDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5LongDataWriter.java
index b9f63b4..fadb290 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5LongDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5LongDataWriter.java
@@ -18,13 +18,12 @@
package org.apache.drill.exec.store.hdf5.writers;
-import java.util.List;
+import io.jhdf.HdfFile;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.exec.store.hdf5.HDF5Utils;
import org.apache.drill.exec.vector.accessor.ValueWriter;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
public class HDF5LongDataWriter extends HDF5DataWriter {
@@ -33,36 +32,32 @@
private final ValueWriter colWriter;
// This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5LongDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5LongDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- data = reader.readLongArray(datapath);
-
+ data = (long[]) reader.getDatasetByPath(datapath).getData();
fieldName = HDF5Utils.getNameFromPath(datapath);
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.BIGINT, TypeProtos.DataMode.OPTIONAL);
}
// This constructor is used when the data is part of a 2D array. In this case the column name is provided in the constructor
- public HDF5LongDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
+ public HDF5LongDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
super(reader, datapath, fieldName, currentColumn);
// Get dimensions
- long[] dimensions = reader.object().getDataSetInformation(datapath).getDimensions();
- long[][] tempData;
+ int[] dimensions = reader.getDatasetByPath(datapath).getDimensions();
+ long[][] tempData = new long[0][];
if (dimensions.length == 2) {
- tempData = transpose(reader.readLongMatrix(datapath));
+ tempData = transpose((long[][]) reader.getDatasetByPath(datapath).getData());
} else {
- tempData = transpose(reader.int64().readMDArray(datapath).toMatrix());
+ tempData = transpose(HDF5Utils.toLongMatrix((Object[])reader.getDatasetByPath(datapath).getData()));
}
data = tempData[currentColumn];
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.BIGINT, TypeProtos.DataMode.OPTIONAL);
}
- public HDF5LongDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String fieldName, List<Long> tempListData) {
+ public HDF5LongDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, long[] tempListData) {
super(reader, null);
this.fieldName = fieldName;
- data = new long[tempListData.size()];
- for (int i = 0; i < tempListData.size(); i++) {
- data[i] = tempListData.get(i);
- }
+ data = tempListData;
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.BIGINT, TypeProtos.DataMode.OPTIONAL);
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5MapDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5MapDataWriter.java
index a0c19e2..5894221 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5MapDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5MapDataWriter.java
@@ -18,33 +18,33 @@
package org.apache.drill.exec.store.hdf5.writers;
-import ch.systemsx.cisd.hdf5.HDF5CompoundMemberInformation;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
+import io.jhdf.HdfFile;
+import io.jhdf.object.datatype.CompoundDataType;
+import io.jhdf.object.datatype.CompoundDataType.CompoundDataMember;
import org.apache.drill.common.exceptions.UserException;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import java.util.ArrayList;
+import java.util.LinkedHashMap;
import java.util.List;
+
public class HDF5MapDataWriter extends HDF5DataWriter {
+
private static final Logger logger = LoggerFactory.getLogger(HDF5MapDataWriter.class);
-
private static final String UNSAFE_SPACE_SEPARATOR = " ";
-
private static final String SAFE_SPACE_SEPARATOR = "_";
-
private final List<HDF5DataWriter> dataWriters;
+ private final List<CompoundDataMember> data;
- private List<String> fieldNames;
-
- public HDF5MapDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5MapDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- fieldNames = new ArrayList<>();
-
- compoundData = reader.compound().readArray(datapath, Object[].class);
+ // Get the members of the dataset
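+ // jhdf returns compound data as a LinkedHashMap keyed by member name, with each value being that member's full column of values.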
+ compoundData = (LinkedHashMap<String, ?>)reader.getDatasetByPath(datapath).getData();
+ data = ((CompoundDataType) reader.getDatasetByPath(datapath).getDataType()).getMembers();
dataWriters = new ArrayList<>();
- fieldNames = getFieldNames();
+
try {
getDataWriters(writerSpec);
} catch (Exception e) {
@@ -75,16 +75,6 @@
return counter < dataWriters.get(0).getDataSize();
}
- private List<String> getFieldNames() {
- List<String> names = new ArrayList<>();
-
- HDF5CompoundMemberInformation[] infos = reader.compound().getDataSetInfo(datapath);
- for (HDF5CompoundMemberInformation info : infos) {
- names.add(info.getName().replace(UNSAFE_SPACE_SEPARATOR, SAFE_SPACE_SEPARATOR));
- }
- return names;
- }
-
/**
* Populates the ArrayList of DataWriters. Since HDF5 Maps contain homogeneous
* columns, it is fine to get the first row, and iterate through the columns
@@ -92,27 +82,30 @@
*/
private void getDataWriters(WriterSpec writerSpec) {
- for (int col = 0; col < compoundData[0].length; col++) {
- Object currentColumn = compoundData[0][col];
- String dataType = currentColumn.getClass().getSimpleName();
-
+ for (CompoundDataMember dataMember : data) {
+ String dataType = dataMember.getDataType().getJavaType().getName();
+ String fieldName = dataMember.getName();
+ String safeFieldName = fieldName.replace(UNSAFE_SPACE_SEPARATOR, SAFE_SPACE_SEPARATOR);
switch (dataType) {
- case "Byte":
- case "Short":
- case "Integer":
- dataWriters.add(new HDF5IntDataWriter(reader, writerSpec, fieldNames.get(col), getColumn(col)));
+ case "byte":
+ dataWriters.add(new HDF5ByteDataWriter(reader, writerSpec, safeFieldName, (byte[]) compoundData.get(fieldName)));
break;
- case "Long":
- dataWriters.add(new HDF5LongDataWriter(reader, writerSpec, fieldNames.get(col), getColumn(col)));
+ case "short":
+ dataWriters.add(new HDF5SmallIntDataWriter(reader, writerSpec, safeFieldName, (short[]) compoundData.get(fieldName)));
break;
- case "Double":
- dataWriters.add(new HDF5DoubleDataWriter(reader, writerSpec, fieldNames.get(col), getColumn(col)));
+ case "int":
+ dataWriters.add(new HDF5IntDataWriter(reader, writerSpec, safeFieldName, (int[]) compoundData.get(fieldName)));
break;
- case "Float":
- dataWriters.add(new HDF5FloatDataWriter(reader, writerSpec, fieldNames.get(col), getColumn(col)));
+ case "long":
+ dataWriters.add(new HDF5LongDataWriter(reader, writerSpec, safeFieldName, (long[]) compoundData.get(fieldName)));
break;
- case "String":
- dataWriters.add(new HDF5StringDataWriter(reader, writerSpec, fieldNames.get(col), getColumn(col)));
+ case "double":
+ dataWriters.add(new HDF5DoubleDataWriter(reader, writerSpec, safeFieldName, (double[]) compoundData.get(fieldName)));
+ break;
+ case "float":
+ dataWriters.add(new HDF5FloatDataWriter(reader, writerSpec, safeFieldName, (float[]) compoundData.get(fieldName)));
+ break;
+ case "java.lang.String":
+ dataWriters.add(new HDF5StringDataWriter(reader, writerSpec, safeFieldName, (String[]) compoundData.get(fieldName)));
break;
default:
// Log unknown data type
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5SmallIntDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5SmallIntDataWriter.java
new file mode 100644
index 0000000..fbda008
--- /dev/null
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5SmallIntDataWriter.java
@@ -0,0 +1,108 @@
+/*
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.drill.exec.store.hdf5.writers;
+
+import io.jhdf.HdfFile;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
+import org.apache.drill.exec.store.hdf5.HDF5Utils;
+import org.apache.drill.exec.vector.accessor.ValueWriter;
+
+public class HDF5SmallIntDataWriter extends HDF5DataWriter {
+
+ private final short[] data;
+
+ private final ValueWriter colWriter;
+
+ // This constructor is used when the data is a 1D column. The column is inferred from the datapath
+ public HDF5SmallIntDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
+ super(reader, datapath);
+ data = (short[]) reader.getDatasetByPath(datapath).getData();
+ fieldName = HDF5Utils.getNameFromPath(datapath);
+ colWriter = writerSpec.makeWriter(fieldName, MinorType.SMALLINT, DataMode.OPTIONAL);
+ }
+
+ // This constructor is used when the data is part of a 2D array. In this case the column name is provided in the constructor
+ public HDF5SmallIntDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath, String fieldName, int currentColumn) {
+ super(reader, datapath, fieldName, currentColumn);
+ // Get dimensions
+ int[] dimensions = reader.getDatasetByPath(datapath).getDimensions();
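+ // jhdf returns 2D datasets as short[][]; higher-rank datasets arrive as nested Object[] and are reduced to 2D via toShortMatrix.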
+ short[][] tempData = new short[0][];
+ if (dimensions.length == 2) {
+ tempData = transpose((short[][]) reader.getDatasetByPath(datapath).getData());
+ } else {
+ tempData = transpose(HDF5Utils.toShortMatrix((Object[])reader.getDatasetByPath(datapath).getData()));
+ }
+ data = tempData[currentColumn];
+
+ colWriter = writerSpec.makeWriter(fieldName, MinorType.SMALLINT, DataMode.OPTIONAL);
+ }
+
+ // This constructor is used for compound data types.
+ public HDF5SmallIntDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, short[] tempListData) {
+ super(reader, null);
+ this.fieldName = fieldName;
+ data = tempListData;
+
+ colWriter = writerSpec.makeWriter(fieldName, MinorType.SMALLINT, DataMode.OPTIONAL);
+ }
+
+ @Override
+ public boolean write() {
+ if (counter >= data.length) {
+ return false;
+ } else {
+ colWriter.setInt(data[counter++]);
+ return true;
+ }
+ }
+
+ @Override
+ public boolean hasNext() {
+ return counter < data.length;
+ }
+
+ /**
+ * Transposes the input matrix by flipping it over its diagonal, swapping the row and column
+ * indices to produce a new matrix.
+ * @param matrix The input matrix to be transposed
+ * @return The transposed matrix.
+ */
+ private short[][] transpose(short[][] matrix) {
+ if (matrix == null || matrix.length == 0) {
+ return matrix;
+ }
+ int width = matrix.length;
+ int height = matrix[0].length;
+
+ short[][] transposedMatrix = new short[height][width];
+
+ for (int x = 0; x < width; x++) {
+ for (int y = 0; y < height; y++) {
+ transposedMatrix[y][x] = matrix[x][y];
+ }
+ }
+ return transposedMatrix;
+ }
+
+ @Override
+ public int getDataSize() {
+ return data.length;
+ }
+}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5StringDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5StringDataWriter.java
index db7ea32..bbd1854 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5StringDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5StringDataWriter.java
@@ -21,12 +21,11 @@
import java.util.Arrays;
import java.util.List;
+import io.jhdf.HdfFile;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.exec.store.hdf5.HDF5Utils;
import org.apache.drill.exec.vector.accessor.ValueWriter;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-
public class HDF5StringDataWriter extends HDF5DataWriter {
private String[] data;
@@ -36,18 +35,18 @@
private final ValueWriter colWriter;
// This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5StringDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5StringDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- data = reader.readStringArray(datapath);
+ data = (String[]) reader.getDatasetByPath(datapath).getData();
listData = Arrays.asList(data);
fieldName = HDF5Utils.getNameFromPath(datapath);
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
}
- public HDF5StringDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String fieldName, List<String> data) {
+ public HDF5StringDataWriter(HdfFile reader, WriterSpec writerSpec, String fieldName, String[] data) {
super(reader, null);
this.fieldName = fieldName;
- this.listData = data;
+ this.listData = Arrays.asList(data);
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL);
}
diff --git a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5TimestampDataWriter.java b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5TimestampDataWriter.java
index 881a762..9517028 100644
--- a/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5TimestampDataWriter.java
+++ b/contrib/format-hdf5/src/main/java/org/apache/drill/exec/store/hdf5/writers/HDF5TimestampDataWriter.java
@@ -20,12 +20,11 @@
import java.time.Instant;
+import io.jhdf.HdfFile;
import org.apache.drill.common.types.TypeProtos;
import org.apache.drill.exec.store.hdf5.HDF5Utils;
import org.apache.drill.exec.vector.accessor.ValueWriter;
-import ch.systemsx.cisd.hdf5.IHDF5Reader;
-
public class HDF5TimestampDataWriter extends HDF5DataWriter {
private final long[] data;
@@ -33,9 +32,9 @@
private final ValueWriter colWriter;
// This constructor is used when the data is a 1D column. The column is inferred from the datapath
- public HDF5TimestampDataWriter(IHDF5Reader reader, WriterSpec writerSpec, String datapath) {
+ public HDF5TimestampDataWriter(HdfFile reader, WriterSpec writerSpec, String datapath) {
super(reader, datapath);
- data = reader.time().readTimeStampArray(datapath);
+ data = (long[]) reader.getDatasetByPath(datapath).getData();
fieldName = HDF5Utils.getNameFromPath(datapath);
colWriter = writerSpec.makeWriter(fieldName, TypeProtos.MinorType.TIMESTAMP, TypeProtos.DataMode.OPTIONAL);
diff --git a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
index 03ec0a9..c6e8e52 100644
--- a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
+++ b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Format.java
@@ -20,6 +20,8 @@
import org.apache.drill.categories.RowSetTests;
import org.apache.drill.common.types.TypeProtos;
+import org.apache.drill.common.types.TypeProtos.DataMode;
+import org.apache.drill.common.types.TypeProtos.MinorType;
import org.apache.drill.exec.record.metadata.TupleMetadata;
import org.apache.drill.exec.rpc.RpcException;
import org.apache.drill.test.ClusterFixtureBuilder;
@@ -85,7 +87,7 @@
.sqlQuery("SELECT * FROM dfs.`hdf5/dset.h5`")
.unOrdered()
.baselineColumns("path", "data_type", "file_name", "data_size", "element_count", "dataset_data_type", "dimensions", "int_data")
- .baselineValues("/dset", "DATASET", "dset.h5", 96L, 24L, "INTEGER", "[4, 6]", finalList)
+ .baselineValues("/dset", "DATASET", "dset.h5", 96L, 24L, "int", "[4, 6]", finalList)
.go();
}
@@ -121,9 +123,9 @@
")";
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("col1", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
- .add("col2", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
- .add("col3", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
+ .add("col1", MinorType.FLOAT8, DataMode.OPTIONAL)
+ .add("col2", MinorType.FLOAT8, DataMode.OPTIONAL)
+ .add("col3", MinorType.FLOAT8, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
.addRow(1.1, 2.2, 3.3)
@@ -139,7 +141,7 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("path", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+ .add("path", MinorType.VARCHAR, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -159,7 +161,7 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("float_col", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.REQUIRED)
+ .add("float_col", TypeProtos.MinorType.FLOAT4, TypeProtos.DataMode.REQUIRED)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -493,7 +495,7 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("1D", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("1D", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -567,18 +569,17 @@
"WHERE path='/nd/3D'\n" +
") AS t1";
-
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("col1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("col2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("col1", MinorType.INT, DataMode.OPTIONAL)
+ .add("col2", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
- .addRow(-2147483648, 1)
- .addRow(2, 3)
- .addRow(4, 5)
- .addRow(6, 7)
+ .addRow(-2147483648, 5)
+ .addRow(1, 6)
+ .addRow(2, 7)
+ .addRow(3, 8)
.build();
new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
@@ -590,15 +591,15 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("int_col_0", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_1", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
- .addRow(-2147483648, 1)
- .addRow(2, 3)
- .addRow(4, 5)
- .addRow(6, 7)
+ .addRow(-2147483648, 5)
+ .addRow(1, 6)
+ .addRow(2, 7)
+ .addRow(3, 8)
.build();
new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
@@ -617,13 +618,13 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("col1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("col2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("col1", MinorType.INT, DataMode.OPTIONAL)
+ .add("col2", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
- .addRow(-2147483648, 1)
- .addRow(2, 3)
+ .addRow(-2147483648, 5)
+ .addRow(1, 6)
.build();
new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
@@ -635,13 +636,13 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("int_col_0", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_1", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
- .addRow(-2147483648, 1)
- .addRow(2, 3)
+ .addRow(-2147483648, 5)
+ .addRow(1, 6)
.build();
new RowSetComparison(expected).unorderedVerifyAndClearAll(results);
@@ -674,7 +675,7 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("field_2", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.REQUIRED)
+ .add("field_2", MinorType.FLOAT8, DataMode.REQUIRED)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -714,10 +715,12 @@
String sql = "SELECT path, file_name\n" +
"FROM dfs.`hdf5/browsing.h5` AS t1 WHERE t1.attributes.`important` = false";
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("path", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
- .add("file_name", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+ .add("path", MinorType.VARCHAR, DataMode.OPTIONAL)
+ .add("file_name", MinorType.VARCHAR, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -735,12 +738,12 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_3", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_4", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_5", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("int_col_0", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_1", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_2", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_3", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_4", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_5", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -761,11 +764,11 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("int_col_0", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_2", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_3", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("int_col_4", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
+ .add("int_col_0", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_1", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_2", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_3", MinorType.INT, DataMode.OPTIONAL)
+ .add("int_col_4", MinorType.INT, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -785,9 +788,9 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("field_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("field_2", TypeProtos.MinorType.FLOAT8, TypeProtos.DataMode.OPTIONAL)
- .add("field_3", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+ .add("field_1", MinorType.INT, DataMode.OPTIONAL)
+ .add("field_2", MinorType.FLOAT8, DataMode.OPTIONAL)
+ .add("field_3", MinorType.VARCHAR, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
@@ -808,8 +811,8 @@
RowSet results = client.queryBuilder().sql(sql).rowSet();
TupleMetadata expectedSchema = new SchemaBuilder()
- .add("field_1", TypeProtos.MinorType.INT, TypeProtos.DataMode.OPTIONAL)
- .add("field_3", TypeProtos.MinorType.VARCHAR, TypeProtos.DataMode.OPTIONAL)
+ .add("field_1", MinorType.INT, DataMode.OPTIONAL)
+ .add("field_3", MinorType.VARCHAR, DataMode.OPTIONAL)
.buildSchema();
RowSet expected = new RowSetBuilder(client.allocator(), expectedSchema)
diff --git a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Utils.java b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Utils.java
index 6e09b28..3ace1e7 100644
--- a/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Utils.java
+++ b/contrib/format-hdf5/src/test/java/org/apache/drill/exec/store/hdf5/TestHDF5Utils.java
@@ -20,7 +20,6 @@
import org.apache.drill.test.BaseTest;
import org.junit.Test;
-
import static org.junit.Assert.assertEquals;
import static org.junit.Assert.assertNull;
diff --git a/pom.xml b/pom.xml
index e16bfe1..2319f5d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -207,12 +207,6 @@
</repository>
<repository>
- <id>jhdf5-repo</id>
- <name>ImageJ Repository</name>
- <url>https://maven.scijava.org/content/repositories/public/</url>
- </repository>
-
- <repository>
<id>jitpack.io</id>
<url>https://jitpack.io</url>
</repository>