| /* |
| * Licensed to the Apache Software Foundation (ASF) under one |
| * or more contributor license agreements. See the NOTICE file |
| * distributed with this work for additional information |
| * regarding copyright ownership. The ASF licenses this file |
| * to you under the Apache License, Version 2.0 (the |
| * "License"); you may not use this file except in compliance |
| * with the License. You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package org.apache.drill.exec.physical.impl.spill; |
| |
| import java.io.BufferedInputStream; |
| import java.io.File; |
| import java.io.FileInputStream; |
| import java.io.IOException; |
| import java.io.InputStream; |
| import java.io.OutputStream; |
| import java.nio.ByteBuffer; |
| import java.nio.channels.FileChannel; |
| import java.nio.channels.WritableByteChannel; |
| import java.nio.file.StandardOpenOption; |
| import java.util.Iterator; |
| import java.util.List; |
| import java.util.Set; |
| |
| import org.apache.drill.common.config.DrillConfig; |
| import org.apache.drill.common.exceptions.UserException; |
| import org.apache.drill.exec.ExecConstants; |
| import org.apache.drill.exec.cache.VectorSerializer; |
| import org.apache.drill.exec.ops.FragmentContext; |
| import org.apache.drill.exec.physical.base.PhysicalOperator; |
| import org.apache.drill.exec.physical.config.HashAggregate; |
| import org.apache.drill.exec.physical.config.HashJoinPOP; |
| import org.apache.drill.exec.physical.config.Sort; |
| import org.apache.drill.exec.proto.ExecProtos.FragmentHandle; |
| import org.apache.drill.exec.proto.helper.QueryIdHelper; |
| import org.apache.hadoop.conf.Configuration; |
| import org.apache.hadoop.fs.FSDataInputStream; |
| import org.apache.hadoop.fs.FSDataOutputStream; |
| import org.apache.hadoop.fs.FileSystem; |
| import org.apache.hadoop.fs.Path; |
| |
| import org.apache.drill.shaded.guava.com.google.common.base.Joiner; |
| import org.apache.drill.shaded.guava.com.google.common.collect.Iterators; |
| import org.apache.drill.shaded.guava.com.google.common.collect.Sets; |
| |
| /** |
| * Generates the set of spill files for this sort session. |
| */ |
| |
| public class SpillSet { |
| private static final org.slf4j.Logger logger = org.slf4j.LoggerFactory.getLogger(SpillSet.class); |
| |
| /** |
| * Spilling on the Mac using the HDFS file system is very inefficient, |
| * affects performance numbers. This interface allows using HDFS in |
| * production, but to bypass the HDFS file system when needed. |
| */ |
| |
| private interface FileManager { |
| |
| void deleteOnExit(String fragmentSpillDir) throws IOException; |
| |
| WritableByteChannel createForWrite(String fileName) throws IOException; |
| |
| InputStream openForInput(String fileName) throws IOException; |
| |
| void deleteFile(String fileName) throws IOException; |
| |
| void deleteDir(String fragmentSpillDir) throws IOException; |
| |
| /** |
| * Given a manager-specific output stream, return the current write position. |
| * Used to report total write bytes. |
| * |
| * @param channel created by the file manager |
| * @return |
| */ |
| long getWriteBytes(WritableByteChannel channel); |
| |
| /** |
| * Given a manager-specific input stream, return the current read position. |
| * Used to report total read bytes. |
| * |
| * @param inputStream input stream created by the file manager |
| * @return |
| */ |
| long getReadBytes(InputStream inputStream); |
| } |
| |
| /** |
| * Normal implementation of spill files using the HDFS file system. |
| */ |
| |
| private static class HadoopFileManager implements FileManager{ |
| /** |
| * The HDFS file system (for local directories, HDFS storage, etc.) used to |
| * create the temporary spill files. Allows spill files to be either on local |
| * disk, or in a DFS. (The admin can choose to put spill files in DFS when |
| * nodes provide insufficient local disk space) |
| */ |
| |
| // The buffer size is calculated as LCM of the Hadoop internal checksum buffer (9 * checksum length), where |
| // checksum length is 512 by default, and MapRFS page size that equals to 8 * 1024. The length of the transfer |
| // buffer does not affect performance of the write to hdfs or maprfs significantly once buffer length is more |
| // than 32 bytes. |
| private static final int TRANSFER_SIZE = 9 * 8 * 1024; |
| |
| private final byte buffer[]; |
| private FileSystem fs; |
| |
| protected HadoopFileManager(String fsName) { |
| buffer = new byte[TRANSFER_SIZE]; |
| Configuration conf = new Configuration(); |
| conf.set(FileSystem.FS_DEFAULT_NAME_KEY, fsName); |
| try { |
| fs = FileSystem.get(conf); |
| } catch (IOException e) { |
| throw UserException.resourceError(e) |
| .message("Failed to get the File System for external sort") |
| .build(logger); |
| } |
| } |
| |
| @Override |
| public void deleteOnExit(String fragmentSpillDir) throws IOException { |
| fs.deleteOnExit(new Path(fragmentSpillDir)); |
| } |
| |
| @Override |
| public WritableByteChannel createForWrite(String fileName) throws IOException { |
| return new WritableByteChannelImpl(buffer, fs.create(new Path(fileName))); |
| } |
| |
| @Override |
| public InputStream openForInput(String fileName) throws IOException { |
| return fs.open(new Path(fileName)); |
| } |
| |
| @Override |
| public void deleteFile(String fileName) throws IOException { |
| Path path = new Path(fileName); |
| if (fs.exists(path)) { |
| fs.delete(path, false); |
| } |
| } |
| |
| @Override |
| public void deleteDir(String fragmentSpillDir) throws IOException { |
| Path path = new Path(fragmentSpillDir); |
| if (path != null && fs.exists(path)) { |
| if (fs.delete(path, true)) { |
| fs.cancelDeleteOnExit(path); |
| } |
| } |
| } |
| |
| @Override |
| public long getWriteBytes(WritableByteChannel channel) { |
| try { |
| return ((FSDataOutputStream)((WritableByteChannelImpl)channel).out).getPos(); |
| } catch (Exception e) { |
| // Just used for logging, not worth dealing with the exception. |
| return 0; |
| } |
| } |
| |
| @Override |
| public long getReadBytes(InputStream inputStream) { |
| try { |
| return ((FSDataInputStream) inputStream).getPos(); |
| } catch (IOException e) { |
| // Just used for logging, not worth dealing with the exception. |
| return 0; |
| } |
| } |
| } |
| |
| /** |
| * Wrapper around an input stream to collect the total bytes |
| * read through the stream for use in reporting performance |
| * metrics. |
| */ |
| |
| public static class CountingInputStream extends InputStream |
| { |
| private InputStream in; |
| private long count; |
| |
| public CountingInputStream(InputStream in) { |
| this.in = in; |
| } |
| |
| @Override |
| public int read() throws IOException { |
| int b = in.read(); |
| if (b != -1) { |
| count++; |
| } |
| return b; |
| } |
| |
| @Override |
| public int read(byte b[]) throws IOException { |
| int n = in.read(b); |
| if (n != -1) { |
| count += n; |
| } |
| return n; |
| } |
| |
| @Override |
| public int read(byte b[], int off, int len) throws IOException { |
| int n = in.read(b, off, len); |
| if (n != -1) { |
| count += n; |
| } |
| return n; |
| } |
| |
| @Override |
| public long skip(long n) throws IOException { |
| return in.skip(n); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| in.close(); |
| } |
| |
| public long getCount() { return count; } |
| } |
| |
| /** |
| * Wrapper around an output stream to collect the total bytes |
| * written through the stream for use in reporting performance |
| * metrics. |
| */ |
| |
| public static class CountingOutputStream extends OutputStream { |
| |
| private OutputStream out; |
| private long count; |
| |
| public CountingOutputStream(OutputStream out) { |
| this.out = out; |
| } |
| |
| @Override |
| public void write(int b) throws IOException { |
| count++; |
| out.write(b); |
| } |
| |
| @Override |
| public void write(byte[] b) throws IOException { |
| count += b.length; |
| out.write(b); |
| } |
| |
| @Override |
| public void write(byte[] b, int off, int len) throws IOException { |
| count += len; |
| out.write(b, off, len); |
| } |
| |
| @Override |
| public void flush() throws IOException { |
| out.flush(); |
| } |
| |
| @Override |
| public void close() throws IOException { |
| out.close(); |
| } |
| |
| public long getCount() { return count; } |
| } |
| |
| /** |
| * Performance-oriented direct access to the local file system which |
| * bypasses HDFS. |
| */ |
| |
| private static class LocalFileManager implements FileManager { |
| |
| private File baseDir; |
| |
| public LocalFileManager(String fsName) { |
| baseDir = new File(fsName.replace(FileSystem.DEFAULT_FS, "")); |
| } |
| |
| @Override |
| public void deleteOnExit(String fragmentSpillDir) throws IOException { |
| File dir = new File(baseDir, fragmentSpillDir); |
| dir.mkdirs(); |
| dir.deleteOnExit(); |
| } |
| |
| @Override |
| public WritableByteChannel createForWrite(String fileName) throws IOException { |
| return FileChannel.open(new File(baseDir, fileName).toPath(), StandardOpenOption.CREATE, StandardOpenOption.WRITE); |
| } |
| |
| @Override |
| public InputStream openForInput(String fileName) throws IOException { |
| return new CountingInputStream( |
| new BufferedInputStream( |
| new FileInputStream(new File(baseDir, fileName)))); |
| } |
| |
| @Override |
| public void deleteFile(String fileName) throws IOException { |
| new File(baseDir, fileName).delete(); |
| } |
| |
| @Override |
| public void deleteDir(String fragmentSpillDir) throws IOException { |
| File spillDir = new File(baseDir, fragmentSpillDir); |
| for (File spillFile : spillDir.listFiles()) { |
| spillFile.delete(); // IO exception if file delete fails |
| } |
| spillDir.delete();// IO exception if dir delete fails |
| } |
| |
| @Override |
| public long getWriteBytes(WritableByteChannel channel) |
| { |
| try { |
| return ((FileChannel)channel).position(); |
| } catch (Exception e) { |
| return 0; |
| } |
| } |
| |
| @Override |
| public long getReadBytes(InputStream inputStream) { |
| return ((CountingInputStream) inputStream).getCount(); |
| } |
| } |
| |
| private static class WritableByteChannelImpl implements WritableByteChannel |
| { |
| private final byte buffer[]; |
| private OutputStream out; |
| |
| WritableByteChannelImpl(byte[] buffer, OutputStream out) { |
| this.buffer = buffer; |
| this.out = out; |
| } |
| |
| @Override |
| public int write(ByteBuffer src) throws IOException { |
| int remaining = src.remaining(); |
| int totalWritten = 0; |
| synchronized (buffer) { |
| for (int posn = 0; posn < remaining; posn += buffer.length) { |
| int len = Math.min(buffer.length, remaining - posn); |
| src.get(buffer, 0, len); |
| out.write(buffer, 0, len); |
| totalWritten += len; |
| } |
| } |
| return totalWritten; |
| } |
| |
| @Override |
| public boolean isOpen() |
| { |
| return out != null; |
| } |
| |
| @Override |
| public void close() throws IOException { |
| out.close(); |
| out = null; |
| } |
| } |
| |
| private final Iterator<String> dirs; |
| |
| /** |
| * Set of directories to which this operator should write spill files in a round-robin |
| * fashion. The operator requires at least one spill directory, but can |
| * support any number. The admin must ensure that sufficient space exists |
| * on all directories as this operator does not check space availability |
| * before writing to the directories. |
| */ |
| |
| private Set<String> currSpillDirs = Sets.newTreeSet(); |
| |
| /** |
| * The base part of the file name for spill files. Each file has this |
| * name plus an appended spill serial number. |
| */ |
| |
| private final String spillDirName; |
| |
| private int fileCount = 0; |
| |
| private FileManager fileManager; |
| |
| private long readBytes; |
| |
| private long writeBytes; |
| |
| public SpillSet(FragmentContext context, PhysicalOperator popConfig) { |
| this(context.getConfig(), context.getHandle(), popConfig); |
| } |
| |
| public SpillSet(DrillConfig config, FragmentHandle handle, PhysicalOperator popConfig) { |
| String operName; |
| |
| // Set the spill options from the configuration |
| String spillFs; |
| List<String> dirList; |
| |
| // Set the operator name (used as part of the spill file name), |
| // and set oper. specific options (the config file defaults to using the |
| // common options; users may override those - per operator) |
| if (popConfig instanceof Sort) { |
| operName = "Sort"; |
| spillFs = config.getString(ExecConstants.EXTERNAL_SORT_SPILL_FILESYSTEM); |
| dirList = config.getStringList(ExecConstants.EXTERNAL_SORT_SPILL_DIRS); |
| } else if (popConfig instanceof HashAggregate) { |
| operName = "HashAgg"; |
| spillFs = config.getString(ExecConstants.HASHAGG_SPILL_FILESYSTEM); |
| dirList = config.getStringList(ExecConstants.HASHAGG_SPILL_DIRS); |
| } else if (popConfig instanceof HashJoinPOP) { |
| operName = "HashJoin"; |
| spillFs = config.getString(ExecConstants.HASHJOIN_SPILL_FILESYSTEM); |
| dirList = config.getStringList(ExecConstants.HASHJOIN_SPILL_DIRS); |
| } else { |
| // just use the common ones |
| operName = "Unknown"; |
| spillFs = config.getString(ExecConstants.SPILL_FILESYSTEM); |
| dirList = config.getStringList(ExecConstants.SPILL_DIRS); |
| } |
| |
| dirs = Iterators.cycle(dirList); |
| |
| // If more than one directory, semi-randomly choose an offset into |
| // the list to avoid overloading the first directory in the list. |
| |
| if (dirList.size() > 1) { |
| int hash = handle.getQueryId().hashCode() + |
| handle.getMajorFragmentId() + |
| handle.getMinorFragmentId() + |
| popConfig.getOperatorId(); |
| int offset = hash % dirList.size(); |
| for (int i = 0; i < offset; i++) { |
| dirs.next(); |
| } |
| } |
| |
| // Use the high-performance local file system if the local file |
| // system is selected and impersonation is off. (We use that |
| // as a proxy for a non-production Drill setup.) |
| |
| boolean impersonationEnabled = config.getBoolean(ExecConstants.IMPERSONATION_ENABLED); |
| if (spillFs.startsWith(FileSystem.DEFAULT_FS) && ! impersonationEnabled) { |
| fileManager = new LocalFileManager(spillFs); |
| } else { |
| fileManager = new HadoopFileManager(spillFs); |
| } |
| |
| spillDirName = String.format("%s_%s_%s-%s-%s", |
| QueryIdHelper.getQueryId(handle.getQueryId()), |
| operName, handle.getMajorFragmentId(), popConfig.getOperatorId(), handle.getMinorFragmentId()); |
| } |
| |
| public String getNextSpillFile() { |
| return getNextSpillFile(null); |
| } |
| |
| public String getNextSpillFile(String extraName) { |
| |
| // Identify the next directory from the round-robin list to |
| // the file created from this round of spilling. The directory must |
| // must have sufficient space for the output file. |
| |
| String spillDir = dirs.next(); |
| String currSpillPath = Joiner.on("/").join(spillDir, spillDirName); |
| currSpillDirs.add(currSpillPath); |
| |
| String outputFile = Joiner.on("/").join(currSpillPath, "spill" + ++fileCount); |
| if (extraName != null) { |
| outputFile += "_" + extraName; |
| } |
| |
| try { |
| fileManager.deleteOnExit(currSpillPath); |
| } catch (IOException e) { |
| // since this is meant to be used in a batches's spilling, we don't propagate the exception |
| logger.warn("Unable to mark spill directory " + currSpillPath + " for deleting on exit", e); |
| } |
| return outputFile; |
| } |
| |
| public boolean hasSpilled() { |
| return fileCount > 0; |
| } |
| |
| public int getFileCount() { return fileCount; } |
| |
| public InputStream openForInput(String fileName) throws IOException { |
| return fileManager.openForInput(fileName); |
| } |
| |
| public WritableByteChannel openForOutput(String fileName) throws IOException { |
| return fileManager.createForWrite(fileName); |
| } |
| |
| public void delete(String fileName) throws IOException { |
| fileManager.deleteFile(fileName); |
| } |
| |
| public long getWriteBytes() { return writeBytes; } |
| public long getReadBytes() { return readBytes; } |
| |
| public void close() { |
| for (String path : currSpillDirs) { |
| try { |
| fileManager.deleteDir(path); |
| } catch (IOException e) { |
| // since this is meant to be used in a batches's cleanup, we don't propagate the exception |
| logger.warn("Unable to delete spill directory " + path, e); |
| } |
| currSpillDirs.clear(); // in case close() is called again |
| } |
| } |
| |
| public long getPosition(InputStream inputStream) { |
| return fileManager.getReadBytes(inputStream); |
| } |
| |
| public long getPosition(WritableByteChannel channel) { |
| return fileManager.getWriteBytes(channel); |
| } |
| |
| public void tallyReadBytes(long readLength) { |
| readBytes += readLength; |
| } |
| |
| public void tallyWriteBytes(long writeLength) { |
| writeBytes += writeLength; |
| } |
| |
| public VectorSerializer.Writer writer(String fileName) throws IOException { |
| return VectorSerializer.writer(openForOutput(fileName)); |
| } |
| |
| public void close(VectorSerializer.Writer writer) throws IOException { |
| tallyWriteBytes(writer.getBytesWritten()); |
| writer.close(); |
| } |
| } |