blob: cf6643cb63e9a32cad40be82b8b9736d7be19d82 [file] [log] [blame]
/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.hadoop.tools.rumen;
import java.io.Closeable;
import java.io.IOException;
import java.io.OutputStream;
import java.util.Comparator;
import java.util.HashSet;
import java.util.LinkedList;
import java.util.List;
import java.util.PriorityQueue;
import java.util.Queue;
import java.util.Random;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.conf.Configured;
import org.apache.hadoop.fs.FileSystem;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IOUtils;
import org.apache.hadoop.util.Tool;
import org.apache.hadoop.util.ToolRunner;
public class Folder extends Configured implements Tool {
private long outputDuration = -1;
private long inputCycle = -1;
private double concentration = 1.0;
private long randomSeed = 0; // irrelevant if seeded == false
private boolean seeded = false;
private boolean debug = false;
private boolean allowMissorting = false;
private int skewBufferLength = 0;
private long startsAfter = -1;
static final private Log LOG = LogFactory.getLog(Folder.class);
private DeskewedJobTraceReader reader = null;
private Outputter<LoggedJob> outGen = null;
private List<Path> tempPaths = new LinkedList<Path>();
private Path tempDir = null;
private long firstJobSubmitTime;
private double timeDilation;
private double transcriptionRateFraction;
private int transcriptionRateInteger;
private Random random;
static private final long TICKS_PER_SECOND = 1000L;
// error return codes
static private final int NON_EXISTENT_FILES = 1;
static private final int NO_INPUT_CYCLE_LENGTH = 2;
static private final int EMPTY_JOB_TRACE = 3;
static private final int OUT_OF_ORDER_JOBS = 4;
static private final int ALL_JOBS_SIMULTANEOUS = 5;
static private final int IO_ERROR = 6;
static private final int OTHER_ERROR = 7;
private Set<Closeable> closees = new HashSet<Closeable>();
private Set<Path> deletees = new HashSet<Path>();
static long parseDuration(String durationString) {
String numeral = durationString.substring(0, durationString.length() - 1);
char durationCode = durationString.charAt(durationString.length() - 1);
long result = Integer.parseInt(numeral);
if (result <= 0) {
throw new IllegalArgumentException("Negative durations are not allowed");
}
switch (durationCode) {
case 'D':
case 'd':
return 24L * 60L * 60L * TICKS_PER_SECOND * result;
case 'H':
case 'h':
return 60L * 60L * TICKS_PER_SECOND * result;
case 'M':
case 'm':
return 60L * TICKS_PER_SECOND * result;
case 'S':
case 's':
return TICKS_PER_SECOND * result;
default:
throw new IllegalArgumentException("Missing or invalid duration code");
}
}
private int initialize(String[] args) throws IllegalArgumentException {
String tempDirName = null;
String inputPathName = null;
String outputPathName = null;
for (int i = 0; i < args.length; ++i) {
String thisArg = args[i];
if (thisArg.equalsIgnoreCase("-starts-after")) {
startsAfter = parseDuration(args[++i]);
} else if (thisArg.equalsIgnoreCase("-output-duration")) {
outputDuration = parseDuration(args[++i]);
} else if (thisArg.equalsIgnoreCase("-input-cycle")) {
inputCycle = parseDuration(args[++i]);
} else if (thisArg.equalsIgnoreCase("-concentration")) {
concentration = Double.parseDouble(args[++i]);
} else if (thisArg.equalsIgnoreCase("-debug")) {
debug = true;
} else if (thisArg.equalsIgnoreCase("-allow-missorting")) {
allowMissorting = true;
} else if (thisArg.equalsIgnoreCase("-seed")) {
seeded = true;
randomSeed = Long.parseLong(args[++i]);
} else if (thisArg.equalsIgnoreCase("-skew-buffer-length")) {
skewBufferLength = Integer.parseInt(args[++i]);
} else if (thisArg.equalsIgnoreCase("-temp-directory")) {
tempDirName = args[++i];
} else if (thisArg.equals("") || thisArg.startsWith("-")) {
throw new IllegalArgumentException("Illegal switch argument, "
+ thisArg + " at position " + i);
} else {
inputPathName = thisArg;
outputPathName = args[++i];
if (i != args.length - 1) {
throw new IllegalArgumentException("Too many non-switch arguments");
}
}
}
try {
Configuration conf = getConf();
Path inPath = new Path(inputPathName);
reader =
new DeskewedJobTraceReader(new JobTraceReader(inPath, conf),
skewBufferLength, !allowMissorting);
Path outPath = new Path(outputPathName);
outGen = new DefaultOutputter<LoggedJob>();
outGen.init(outPath, conf);
tempDir =
tempDirName == null ? outPath.getParent() : new Path(tempDirName);
FileSystem fs = tempDir.getFileSystem(getConf());
if (!fs.getFileStatus(tempDir).isDirectory()) {
throw new IOException("Your temp directory is not a directory");
}
if (inputCycle <= 0) {
LOG.error("You must have an input cycle length.");
return NO_INPUT_CYCLE_LENGTH;
}
if (outputDuration <= 0) {
outputDuration = 60L * 60L * TICKS_PER_SECOND;
}
if (inputCycle <= 0) {
inputCycle = outputDuration;
}
timeDilation = (double) outputDuration / (double) inputCycle;
random = seeded ? new Random(randomSeed) : new Random();
if (debug) {
randomSeed = random.nextLong();
LOG.warn("This run effectively has a -seed of " + randomSeed);
random = new Random(randomSeed);
seeded = true;
}
} catch (IOException e) {
e.printStackTrace(System.err);
return NON_EXISTENT_FILES;
}
return 0;
}
@Override
public int run(String[] args) throws IOException {
int result = initialize(args);
if (result != 0) {
return result;
}
return run();
}
public int run() throws IOException {
class JobEntryComparator implements
Comparator<Pair<LoggedJob, JobTraceReader>> {
public int compare(Pair<LoggedJob, JobTraceReader> p1,
Pair<LoggedJob, JobTraceReader> p2) {
LoggedJob j1 = p1.first();
LoggedJob j2 = p2.first();
return (j1.getSubmitTime() < j2.getSubmitTime()) ? -1 : (j1
.getSubmitTime() == j2.getSubmitTime()) ? 0 : 1;
}
}
// we initialize an empty heap so if we take an error before establishing
// a real one the finally code goes through
Queue<Pair<LoggedJob, JobTraceReader>> heap =
new PriorityQueue<Pair<LoggedJob, JobTraceReader>>();
try {
LoggedJob job = reader.nextJob();
if (job == null) {
LOG.error("The job trace is empty");
return EMPTY_JOB_TRACE;
}
// If starts-after time is specified, skip the number of jobs till we reach
// the starting time limit.
if (startsAfter > 0) {
LOG.info("starts-after time is specified. Initial job submit time : "
+ job.getSubmitTime());
long approximateTime = job.getSubmitTime() + startsAfter;
job = reader.nextJob();
long skippedCount = 0;
while (job != null && job.getSubmitTime() < approximateTime) {
job = reader.nextJob();
skippedCount++;
}
LOG.debug("Considering jobs with submit time greater than "
+ startsAfter + " ms. Skipped " + skippedCount + " jobs.");
if (job == null) {
LOG.error("No more jobs to process in the trace with 'starts-after'"+
" set to " + startsAfter + "ms.");
return EMPTY_JOB_TRACE;
}
LOG.info("The first job has a submit time of " + job.getSubmitTime());
}
firstJobSubmitTime = job.getSubmitTime();
long lastJobSubmitTime = firstJobSubmitTime;
int numberJobs = 0;
long currentIntervalEnd = Long.MIN_VALUE;
Path nextSegment = null;
Outputter<LoggedJob> tempGen = null;
if (debug) {
LOG.debug("The first job has a submit time of " + firstJobSubmitTime);
}
final Configuration conf = getConf();
try {
// At the top of this loop, skewBuffer has at most
// skewBufferLength entries.
while (job != null) {
final Random tempNameGenerator = new Random();
lastJobSubmitTime = job.getSubmitTime();
++numberJobs;
if (job.getSubmitTime() >= currentIntervalEnd) {
if (tempGen != null) {
tempGen.close();
}
nextSegment = null;
for (int i = 0; i < 3 && nextSegment == null; ++i) {
try {
nextSegment =
new Path(tempDir, "segment-" + tempNameGenerator.nextLong()
+ ".json.gz");
if (debug) {
LOG.debug("The next segment name is " + nextSegment);
}
FileSystem fs = nextSegment.getFileSystem(conf);
try {
if (!fs.exists(nextSegment)) {
break;
}
continue;
} catch (IOException e) {
// no code -- file did not already exist
}
} catch (IOException e) {
// no code -- file exists now, or directory bad. We try three
// times.
}
}
if (nextSegment == null) {
throw new RuntimeException("Failed to create a new file!");
}
if (debug) {
LOG.debug("Creating " + nextSegment
+ " for a job with a submit time of " + job.getSubmitTime());
}
deletees.add(nextSegment);
tempPaths.add(nextSegment);
tempGen = new DefaultOutputter<LoggedJob>();
tempGen.init(nextSegment, conf);
long currentIntervalNumber =
(job.getSubmitTime() - firstJobSubmitTime) / inputCycle;
currentIntervalEnd =
firstJobSubmitTime + ((currentIntervalNumber + 1) * inputCycle);
}
// the temp files contain UDadjusted times, but each temp file's
// content is in the same input cycle interval.
if (tempGen != null) {
tempGen.output(job);
}
job = reader.nextJob();
}
} catch (DeskewedJobTraceReader.OutOfOrderException e) {
return OUT_OF_ORDER_JOBS;
} finally {
if (tempGen != null) {
tempGen.close();
}
}
if (lastJobSubmitTime <= firstJobSubmitTime) {
LOG.error("All of your job[s] have the same submit time."
+ " Please just use your input file.");
return ALL_JOBS_SIMULTANEOUS;
}
double submitTimeSpan = lastJobSubmitTime - firstJobSubmitTime;
LOG.warn("Your input trace spans "
+ (lastJobSubmitTime - firstJobSubmitTime) + " ticks.");
double foldingRatio =
submitTimeSpan * (numberJobs + 1) / numberJobs / inputCycle;
if (debug) {
LOG.warn("run: submitTimeSpan = " + submitTimeSpan + ", numberJobs = "
+ numberJobs + ", inputCycle = " + inputCycle);
}
if (reader.neededSkewBufferSize() > 0) {
LOG.warn("You needed a -skew-buffer-length of "
+ reader.neededSkewBufferSize() + " but no more, for this input.");
}
double tProbability = timeDilation * concentration / foldingRatio;
if (debug) {
LOG.warn("run: timeDilation = " + timeDilation + ", concentration = "
+ concentration + ", foldingRatio = " + foldingRatio);
LOG.warn("The transcription probability is " + tProbability);
}
transcriptionRateInteger = (int) Math.floor(tProbability);
transcriptionRateFraction = tProbability - Math.floor(tProbability);
// Now read all the inputs in parallel
heap =
new PriorityQueue<Pair<LoggedJob, JobTraceReader>>(tempPaths.size(),
new JobEntryComparator());
for (Path tempPath : tempPaths) {
JobTraceReader thisReader = new JobTraceReader(tempPath, conf);
closees.add(thisReader);
LoggedJob streamFirstJob = thisReader.getNext();
long thisIndex =
(streamFirstJob.getSubmitTime() - firstJobSubmitTime) / inputCycle;
if (debug) {
LOG.debug("A job with submit time of "
+ streamFirstJob.getSubmitTime() + " is in interval # "
+ thisIndex);
}
adjustJobTimes(streamFirstJob);
if (debug) {
LOG.debug("That job's submit time is adjusted to "
+ streamFirstJob.getSubmitTime());
}
heap
.add(new Pair<LoggedJob, JobTraceReader>(streamFirstJob, thisReader));
}
Pair<LoggedJob, JobTraceReader> next = heap.poll();
while (next != null) {
maybeOutput(next.first());
if (debug) {
LOG.debug("The most recent job has an adjusted submit time of "
+ next.first().getSubmitTime());
LOG.debug(" Its replacement in the heap will come from input engine "
+ next.second());
}
LoggedJob replacement = next.second().getNext();
if (replacement == null) {
next.second().close();
if (debug) {
LOG.debug("That input engine is depleted.");
}
} else {
adjustJobTimes(replacement);
if (debug) {
LOG.debug("The replacement has an adjusted submit time of "
+ replacement.getSubmitTime());
}
heap.add(new Pair<LoggedJob, JobTraceReader>(replacement, next
.second()));
}
next = heap.poll();
}
} finally {
IOUtils.cleanup(null, reader);
if (outGen != null) {
outGen.close();
}
for (Pair<LoggedJob, JobTraceReader> heapEntry : heap) {
heapEntry.second().close();
}
for (Closeable closee : closees) {
closee.close();
}
if (!debug) {
Configuration conf = getConf();
for (Path deletee : deletees) {
FileSystem fs = deletee.getFileSystem(conf);
try {
fs.delete(deletee, false);
} catch (IOException e) {
// no code
}
}
}
}
return 0;
}
private void maybeOutput(LoggedJob job) throws IOException {
for (int i = 0; i < transcriptionRateInteger; ++i) {
outGen.output(job);
}
if (random.nextDouble() < transcriptionRateFraction) {
outGen.output(job);
}
}
private void adjustJobTimes(LoggedJob adjustee) {
long offsetInCycle =
(adjustee.getSubmitTime() - firstJobSubmitTime) % inputCycle;
long outputOffset = (long) ((double) offsetInCycle * timeDilation);
long adjustment =
firstJobSubmitTime + outputOffset - adjustee.getSubmitTime();
adjustee.adjustTimes(adjustment);
}
/**
* @param args
*/
public static void main(String[] args) {
Folder instance = new Folder();
int result = 0;
try {
result = ToolRunner.run(instance, args);
} catch (IOException e) {
e.printStackTrace(System.err);
System.exit(IO_ERROR);
} catch (Exception e) {
e.printStackTrace(System.err);
System.exit(OTHER_ERROR);
}
if (result != 0) {
System.exit(result);
}
return;
}
}