| /* |
| * Copyright (c) 2014 DataTorrent, Inc. ALL Rights Reserved. |
| * |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| */ |
| package com.datatorrent.lib.io.fs; |
| |
| import com.datatorrent.api.Context.OperatorContext; |
| import com.datatorrent.api.Stats.OperatorStats; |
| import java.util.Collection; |
| import java.util.Set; |
| import org.apache.hadoop.fs.Path; |
| import org.slf4j.Logger; |
| import org.slf4j.LoggerFactory; |
| |
| /** |
| * Input operator that reads files from a directory. |
| * <p/> |
| * Provides the same functionality as the AbstractFSDirectoryInputOperator |
| * except that this utilized dynamic partitioning where the user can set the |
| * preferred number of pending files per operator as well as the max number of |
| * operators and define a repartition interval. When the repartition interval |
| * passes then a new number of operators are created to accommodate the |
| * remaining pending files. |
| * <p/> |
| * |
| * @since 1.0.4 |
| */ |
| public abstract class AbstractThroughputHashFSDirectoryInputOperator<T> extends AbstractFSDirectoryInputOperator<T> |
| { |
| private static final Logger LOG = LoggerFactory.getLogger(AbstractThroughputHashFSDirectoryInputOperator.class); |
| |
| private long repartitionInterval = 5L * 60L * 1000L; |
| private int preferredMaxPendingFilesPerOperator = 10; |
| private transient OperatorContext context; |
| |
| public void setup(OperatorContext context) |
| { |
| super.setup(context); |
| this.context = context; |
| } |
| |
| public void setRepartitionInterval(long repartitionInterval) |
| { |
| this.repartitionInterval = repartitionInterval; |
| } |
| |
| public long getRepartitionInterval() |
| { |
| return repartitionInterval; |
| } |
| |
| public void setPreferredMaxPendingFilesPerOperator(int pendingFilesPerOperator) |
| { |
| this.preferredMaxPendingFilesPerOperator = pendingFilesPerOperator; |
| } |
| |
| public int getPreferredMaxPendingFilesPerOperator() |
| { |
| return preferredMaxPendingFilesPerOperator; |
| } |
| |
| @Override |
| public void emitTuples() |
| { |
| if(System.currentTimeMillis() - scanIntervalMillis >= lastScanMillis) { |
| Set<Path> newPaths = scanner.scan(fs, filePath, processedFiles); |
| |
| for(Path newPath : newPaths) { |
| String newPathString = newPath.toString(); |
| pendingFiles.add(newPathString); |
| processedFiles.add(newPathString); |
| } |
| lastScanMillis = System.currentTimeMillis(); |
| } |
| |
| super.emitTuples(); |
| } |
| |
| @Override |
| public void endWindow() |
| { |
| super.endWindow(); |
| |
| if(context != null) { |
| int fileCount = failedFiles.size() + |
| pendingFiles.size() + |
| unfinishedFiles.size(); |
| if(currentFile != null) { |
| fileCount++; |
| } |
| context.setCounters(fileCount); |
| } |
| } |
| |
| @Override |
| protected int computedNewPartitionCount(Collection<Partition<AbstractFSDirectoryInputOperator<T>>> partitions, int incrementalCapacity) |
| { |
| LOG.debug("Called throughput."); |
| boolean isInitialParitition = partitions.iterator().next().getStats() == null; |
| int newOperatorCount; |
| int totalFileCount = 0; |
| |
| for(Partition<AbstractFSDirectoryInputOperator<T>> partition : partitions) { |
| AbstractFSDirectoryInputOperator<T> oper = partition.getPartitionedInstance(); |
| totalFileCount += oper.failedFiles.size(); |
| totalFileCount += oper.pendingFiles.size(); |
| totalFileCount += oper.unfinishedFiles.size(); |
| |
| if (oper.currentFile != null) { |
| totalFileCount++; |
| } |
| } |
| |
| if(!isInitialParitition) { |
| LOG.debug("definePartitions: Total File Count: {}", totalFileCount); |
| newOperatorCount = computeOperatorCount(totalFileCount); |
| } |
| else { |
| newOperatorCount = partitionCount; |
| } |
| |
| return newOperatorCount; |
| } |
| |
| private int computeOperatorCount(int totalFileCount) |
| { |
| int newOperatorCount = totalFileCount / preferredMaxPendingFilesPerOperator; |
| |
| if(totalFileCount % preferredMaxPendingFilesPerOperator > 0) { |
| newOperatorCount++; |
| } |
| if(newOperatorCount > partitionCount) { |
| newOperatorCount = partitionCount; |
| } |
| if(newOperatorCount == 0) { |
| newOperatorCount = 1; |
| } |
| |
| return newOperatorCount; |
| } |
| |
| @Override |
| public Response processStats(BatchedOperatorStats batchedOperatorStats) |
| { |
| int fileCount = 0; |
| |
| for(OperatorStats operatorStats: batchedOperatorStats.getLastWindowedStats()) { |
| if(operatorStats.counters != null) { |
| fileCount = (Integer) operatorStats.counters; |
| } |
| } |
| |
| Response response = new Response(); |
| |
| if(fileCount > 0 || |
| System.currentTimeMillis() - repartitionInterval <= lastRepartition) { |
| response.repartitionRequired = false; |
| return response; |
| } |
| |
| response.repartitionRequired = true; |
| return response; |
| } |
| } |