// blob: ca55766dc9213d1628d6ac8a55ffb26db0e22b9c [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.runtime.io;
import java.io.EOFException;
import java.io.IOException;
import java.io.InputStream;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.sysds.common.Types.ValueType;
import org.apache.sysds.hops.OptimizerUtils;
import org.apache.sysds.runtime.DMLRuntimeException;
import org.apache.sysds.runtime.matrix.data.FrameBlock;
import org.apache.sysds.runtime.util.HDFSTool;
import org.apache.sysds.runtime.util.UtilFunctions;
/**
 * Common parent of all format-specific frame readers. Subclasses implement the two full-signature read methods (one
 * for a file name, one for an input stream); the shorter overloads declared here only fill in a default schema and/or
 * default column names and then delegate to those abstract methods. Non-default parameters (e.g., CSV read
 * properties) should be passed into custom subclass constructors. There is also a factory for creating
 * format-specific readers.
 */
public abstract class FrameReader {
    /** Logger shared with subclasses. */
    protected static final Log LOG = LogFactory.getLog(FrameReader.class.getName());

    /**
     * Reads a frame from the given file with an explicit schema and explicit column names.
     *
     * @param fname  file name to read from
     * @param schema schema as array of ValueTypes
     * @param names  column names
     * @param rlen   number of rows, as supplied by the caller
     * @param clen   number of columns, as supplied by the caller
     * @return frame block
     * @throws IOException         if IOException occurs
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public abstract FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, String[] names, long rlen, long clen)
        throws IOException, DMLRuntimeException;

    /**
     * Reads a frame from the given file with an explicit schema; column names are the defaults produced by
     * {@link #getDefColNames(long)} for {@code schema.length} columns.
     *
     * @param fname  file name to read from
     * @param schema schema as array of ValueTypes
     * @param rlen   number of rows, forwarded unchanged
     * @param clen   number of columns, forwarded unchanged
     * @return frame block
     * @throws IOException         if IOException occurs
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public FrameBlock readFrameFromHDFS(String fname, ValueType[] schema, long rlen, long clen)
        throws IOException, DMLRuntimeException {
        LOG.debug("readFrameFromHDFS with schema");
        final String[] defaultNames = getDefColNames(schema.length);
        return readFrameFromHDFS(fname, schema, defaultNames, rlen, clen);
    }

    /**
     * Reads a frame from the given file using the default schema ({@link #getDefSchema(long)}) and the default
     * column names ({@link #getDefColNames(long)}), both derived from {@code clen}.
     *
     * @param fname file name to read from
     * @param rlen  number of rows, forwarded unchanged
     * @param clen  number of columns, forwarded unchanged
     * @return frame block
     * @throws IOException         if IOException occurs
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public FrameBlock readFrameFromHDFS(String fname, long rlen, long clen) throws IOException, DMLRuntimeException {
        LOG.debug("readFrameFromHDFS no schema");
        // schema first, names second: same evaluation order as the delegating call's argument list
        final ValueType[] defaultSchema = getDefSchema(clen);
        final String[] defaultNames = getDefColNames(clen);
        return readFrameFromHDFS(fname, defaultSchema, defaultNames, rlen, clen);
    }

    /**
     * Reads a frame from the given input stream with an explicit schema and explicit column names.
     *
     * @param is     input stream to read from
     * @param schema schema as array of ValueTypes
     * @param names  column names
     * @param rlen   number of rows, as supplied by the caller
     * @param clen   number of columns, as supplied by the caller
     * @return frame block
     * @throws IOException         if IOException occurs
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public abstract FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, String[] names, long rlen,
        long clen) throws IOException, DMLRuntimeException;

    /**
     * Reads a frame from the given input stream with an explicit schema; column names are the defaults produced by
     * {@link #getDefColNames(long)} for {@code schema.length} columns.
     *
     * @param is     input stream to read from
     * @param schema schema as array of ValueTypes
     * @param rlen   number of rows, forwarded unchanged
     * @param clen   number of columns, forwarded unchanged
     * @return frame block
     * @throws IOException         if IOException occurs
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public FrameBlock readFrameFromInputStream(InputStream is, ValueType[] schema, long rlen, long clen)
        throws IOException, DMLRuntimeException {
        LOG.debug("readFrame from Input Stream with schema");
        final String[] defaultNames = getDefColNames(schema.length);
        return readFrameFromInputStream(is, schema, defaultNames, rlen, clen);
    }

    /**
     * Reads a frame from the given input stream using the default schema ({@link #getDefSchema(long)}) and the
     * default column names ({@link #getDefColNames(long)}), both derived from {@code clen}.
     *
     * @param is   input stream to read from
     * @param rlen number of rows, forwarded unchanged
     * @param clen number of columns, forwarded unchanged
     * @return frame block
     * @throws IOException         if IOException occurs
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public FrameBlock readFrameFromInputStream(InputStream is, long rlen, long clen)
        throws IOException, DMLRuntimeException {
        LOG.debug("readFrame from Input Stream no schema");
        // schema first, names second: same evaluation order as the delegating call's argument list
        final ValueType[] defaultSchema = getDefSchema(clen);
        final String[] defaultNames = getDefColNames(clen);
        return readFrameFromInputStream(is, defaultSchema, defaultNames, rlen, clen);
    }

    /**
     * Builds the default schema: {@link ValueType#STRING} repeated via {@code UtilFunctions.nCopies}. The column
     * count is narrowed to {@code int} and then raised to at least 1, so a non-positive {@code clen} still requests
     * a single column.
     *
     * @param clen number of columns
     * @return default schema
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public ValueType[] getDefSchema(long clen) throws DMLRuntimeException {
        final int narrowed = (int) clen;
        final int numCols = (narrowed < 1) ? 1 : narrowed;
        return UtilFunctions.nCopies(numCols, ValueType.STRING);
    }

    /**
     * Builds the default column names: an empty array for a negative {@code clen}, otherwise whatever
     * {@code FrameBlock.createColNames} returns for {@code (int) clen}.
     *
     * @param clen number of columns
     * @return default column names
     * @throws DMLRuntimeException if DMLRuntimeException occurs
     */
    public String[] getDefColNames(long clen) throws DMLRuntimeException {
        if(clen < 0)
            return new String[0];
        return FrameBlock.createColNames((int) clen);
    }

    /**
     * Creates the result frame block for the given schema and column names after validating the pair with
     * {@code OptimizerUtils.isValidCPDimensions}, and then calls {@code ensureAllocatedColumns} on it with the row
     * count narrowed to {@code int}.
     *
     * @param schema schema as array of ValueTypes
     * @param names  column names
     * @param nrow   number of rows
     * @return frame block
     * @throws IOException if IOException occurs
     */
    protected static FrameBlock createOutputFrameBlock(ValueType[] schema, String[] names, long nrow)
        throws IOException {
        // reject schema/name combinations the validity check does not accept
        final boolean validDims = OptimizerUtils.isValidCPDimensions(schema, names);
        if(!validDims)
            throw new DMLRuntimeException("Schema and names to be define with equal size.");

        // build the result and request allocation for the given row count
        final FrameBlock out = new FrameBlock(schema, names);
        out.ensureAllocatedColumns((int) nrow);
        return out;
    }

    /**
     * Expands a single-entry schema to {@code ncol} entries of that one type when more than one column is requested;
     * any other schema is returned as is.
     *
     * @param schema schema as array of ValueTypes
     * @param ncol   number of columns
     * @return the schema to use for the output
     */
    protected static ValueType[] createOutputSchema(ValueType[] schema, long ncol) {
        if(schema.length != 1 || ncol <= 1)
            return schema;
        return UtilFunctions.nCopies((int) ncol, schema[0]);
    }

    /**
     * Returns the given column names when their count equals {@code ncol}; otherwise replaces them with the result
     * of {@code FrameBlock.createColNames} for {@code (int) ncol}.
     *
     * @param names column names
     * @param ncol  number of columns
     * @return the column names to use for the output
     */
    protected static String[] createOutputNames(String[] names, long ncol) {
        if(names.length == ncol)
            return names;
        return FrameBlock.createColNames((int) ncol);
    }

    /**
     * Verifies that the given path exists on the file system and is not reported empty by
     * {@code HDFSTool.isFileEmpty}.
     *
     * @param fs   file system holding the path
     * @param path path of the input file
     * @throws IOException if the path does not exist ({@link EOFException} if the file is empty)
     */
    protected static void checkValidInputFile(FileSystem fs, Path path) throws IOException {
        // existence is checked before emptiness
        final boolean present = fs.exists(path);
        if(!present)
            throw new IOException("File " + path.toString() + " does not exist on HDFS/LFS.");

        final boolean empty = HDFSTool.isFileEmpty(fs, path);
        if(empty)
            throw new EOFException("Empty input file " + path.toString() + ".");
    }
}