/**
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.tez.mapreduce.input;

import javax.annotation.Nullable;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.LinkedList;
import java.util.List;
import java.util.concurrent.atomic.AtomicInteger;
import java.util.concurrent.locks.Condition;
import java.util.concurrent.locks.ReentrantLock;

import com.google.common.base.Function;
import org.apache.tez.common.Preconditions;
import com.google.common.collect.Lists;
import com.google.protobuf.ByteString;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;
import org.apache.hadoop.classification.InterfaceAudience.Public;
import org.apache.hadoop.classification.InterfaceStability.Evolving;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.mapred.JobConf;
import org.apache.hadoop.mapreduce.InputSplit;
import org.apache.tez.common.counters.TaskCounter;
import org.apache.tez.mapreduce.input.base.MRInputBase;
import org.apache.tez.mapreduce.lib.MRInputUtils;
import org.apache.tez.mapreduce.lib.MRReader;
import org.apache.tez.mapreduce.lib.MRReaderMapReduce;
import org.apache.tez.mapreduce.lib.MRReaderMapred;
import org.apache.tez.mapreduce.protos.MRRuntimeProtos.MRSplitProto;
import org.apache.tez.runtime.api.Event;
import org.apache.tez.runtime.api.Reader;
import org.apache.tez.runtime.api.InputContext;
import org.apache.tez.runtime.api.events.InputDataInformationEvent;
import org.apache.tez.runtime.library.api.KeyValueReader;
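
/**
 * {@link MultiMRInput} is an Input which allows a single Tez task to process multiple
 * MapReduce splits. The splits are delivered to the task at runtime as
 * {@link InputDataInformationEvent}s, one per physical input, and one {@link KeyValueReader}
 * is exposed per split via {@link #getKeyValueReaders()}.
 */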
@Public
@Evolving
public class MultiMRInput extends MRInputBase {

  private static final Logger LOG = LoggerFactory.getLogger(MultiMRInput.class);

  public MultiMRInput(InputContext inputContext, int numPhysicalInputs) {
    super(inputContext, numPhysicalInputs);
  }

  private final ReentrantLock lock = new ReentrantLock();
  private final Condition condition = lock.newCondition();
  private final AtomicInteger eventCount = new AtomicInteger(0);
  private final List<MRReader> readers = new LinkedList<MRReader>();

  /**
   * Create a {@link MultiMRInputConfigBuilder} to configure a {@link MultiMRInput}.
   * <p>
   * The preferred usage model is to provide all of the parameters, and to use the builder's
   * methods to configure the Input.
   * <p>
   * For legacy applications, which may already have a fully configured
   * {@link org.apache.hadoop.conf.Configuration} instance, the inputFormat can be specified as
   * null.
   * <p>
   * Typically, this will be used along with a custom
   * {@link org.apache.tez.dag.api.VertexManagerPlugin} or
   * {@link org.apache.tez.runtime.api.InputInitializer} to generate the multiple inputs to be
   * used by each task. If this is not set up, this will work the same as
   * {@link org.apache.tez.mapreduce.input.MRInput}.
   * <p>
   * Grouping of splits is disabled by default.
   *
   * @param conf Configuration for the {@link MRInput}. This configuration instance will be
   *             modified in place.
   * @param inputFormat InputFormat derived class. This can be null; if it is, the provided
   *                    configuration should be complete.
   * @return {@link MultiMRInputConfigBuilder}
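   *
   * <p>A minimal usage sketch (the {@code vertex} variable and the data source name are
   * illustrative, not part of this API; the Configuration is assumed to already carry the
   * input directory):
   * <pre>{@code
   * Configuration conf = new Configuration();
   * DataSourceDescriptor dataSource =
   *     MultiMRInput.createConfigBuilder(conf, TextInputFormat.class).build();
   * vertex.addDataSource("multiMRInput", dataSource);
   * }</pre>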
*/
public static MultiMRInputConfigBuilder createConfigBuilder(Configuration conf,
@Nullable Class<?> inputFormat) {
MultiMRInputConfigBuilder configBuilder = new MultiMRInputConfigBuilder(conf, inputFormat);
configBuilder.setInputClassName(MultiMRInput.class.getName()).groupSplits(false);
return configBuilder;
}
public static class MultiMRInputConfigBuilder extends MRInput.MRInputConfigBuilder {
private MultiMRInputConfigBuilder(Configuration conf, Class<?> inputFormat) {
super(conf, inputFormat);
}
}
@Override
public List<Event> initialize() throws IOException {
super.initialize();
    LOG.info(getContext().getSourceVertexName() + " using new mapreduce API=" + useNewApi +
        ", numPhysicalInputs=" + getNumPhysicalInputs());
if (getNumPhysicalInputs() == 0) {
getContext().inputIsReady();
}
return null;
}
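
  /**
   * Blocks until an {@link InputDataInformationEvent} has been received for every physical
   * input, then returns an unmodifiable collection containing one {@link KeyValueReader} per
   * split.
   *
   * <p>A consumption sketch from a processor's {@code run()} method; {@code getInputs()} and
   * {@code process(...)} stand in for the surrounding processor code:
   * <pre>{@code
   * MultiMRInput input = (MultiMRInput) getInputs().get("multiMRInput");
   * for (KeyValueReader reader : input.getKeyValueReaders()) {
   *   while (reader.next()) {
   *     process(reader.getCurrentKey(), reader.getCurrentValue());
   *   }
   * }
   * }</pre>
   */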
public Collection<KeyValueReader> getKeyValueReaders() throws InterruptedException, IOException {
lock.lock();
try {
while (eventCount.get() != getNumPhysicalInputs()) {
condition.await();
}
} finally {
lock.unlock();
}
return Collections
.unmodifiableCollection(Lists.transform(readers, new Function<MRReader, KeyValueReader>() {
@Override
public KeyValueReader apply(MRReader input) {
return input;
}
}));
}
@Override
public Reader getReader() throws Exception {
    throw new UnsupportedOperationException(
        "getReader is not supported. Use getKeyValueReaders() instead");
}
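
  /**
   * Receives one {@link InputDataInformationEvent} per physical input. Each event carries a
   * serialized {@link MRSplitProto} from which a reader is constructed; once the expected
   * number of events has arrived, the input is marked ready and threads blocked in
   * {@link #getKeyValueReaders()} are woken up.
   */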
@Override
public void handleEvents(List<Event> inputEvents) throws Exception {
lock.lock();
try {
if (getNumPhysicalInputs() == 0) {
        throw new IllegalStateException(
            "Unexpected event. MultiMRInput has been set up to receive 0 events");
}
Preconditions.checkState(eventCount.get() + inputEvents.size() <= getNumPhysicalInputs(),
"Unexpected event. All physical sources already initialized");
for (Event event : inputEvents) {
MRReader reader = initFromEvent((InputDataInformationEvent) event);
readers.add(reader);
        if (eventCount.incrementAndGet() == getNumPhysicalInputs()) {
          getContext().inputIsReady();
          // Use signalAll() so every thread blocked in getKeyValueReaders() is woken.
          condition.signalAll();
        }
}
} finally {
lock.unlock();
}
}
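
  /**
   * Deserializes the {@link MRSplitProto} from the event payload and builds a reader for the
   * split, using the new or the old MapReduce API depending on the configuration. When the
   * split length is known, it is added to the
   * {@link TaskCounter#INPUT_SPLIT_LENGTH_BYTES} counter.
   */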
private MRReader initFromEvent(InputDataInformationEvent event) throws IOException {
Preconditions.checkState(event != null, "Event must be specified");
if (LOG.isDebugEnabled()) {
LOG.debug(getContext().getSourceVertexName() + " initializing Reader: " + eventCount.get());
}
MRSplitProto splitProto = MRSplitProto.parseFrom(ByteString.copyFrom(event.getUserPayload()));
MRReader reader = null;
JobConf localJobConf = new JobConf(jobConf);
long splitLength = -1;
if (useNewApi) {
InputSplit split = MRInputUtils.getNewSplitDetailsFromEvent(splitProto, localJobConf);
      try {
        splitLength = split.getLength();
      } catch (InterruptedException e) {
        // Restore the interrupt status; splitLength stays at -1 (unknown).
        Thread.currentThread().interrupt();
        LOG.warn("Got interrupted while reading split length: ", e);
      }
      reader = new MRReaderMapReduce(localJobConf, split, getContext().getCounters(),
          inputRecordCounter, getContext().getApplicationId().getClusterTimestamp(),
          getContext().getTaskVertexIndex(), getContext().getApplicationId().getId(),
          getContext().getTaskIndex(), getContext().getTaskAttemptNumber(), getContext());
if (LOG.isDebugEnabled()) {
LOG.debug(getContext().getSourceVertexName() + " split Details -> SplitClass: " +
split.getClass().getName() + ", NewSplit: " + split + ", length: " + splitLength);
}
} else {
org.apache.hadoop.mapred.InputSplit split =
MRInputUtils.getOldSplitDetailsFromEvent(splitProto, localJobConf);
splitLength = split.getLength();
reader = new MRReaderMapred(localJobConf, split,
getContext().getCounters(), inputRecordCounter, getContext());
if (LOG.isDebugEnabled()) {
LOG.debug(getContext().getSourceVertexName() + " split Details -> SplitClass: " +
split.getClass().getName() + ", OldSplit: " + split + ", length: " + splitLength);
}
}
if (splitLength != -1) {
getContext().getCounters().findCounter(TaskCounter.INPUT_SPLIT_LENGTH_BYTES)
.increment(splitLength);
}
LOG.info(getContext().getSourceVertexName() + " initialized RecordReader from event");
return reader;
}
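
  /**
   * Closes all readers and reports the total number of records processed, as tracked by the
   * {@link TaskCounter#INPUT_RECORDS_PROCESSED} counter, to the statistics reporter.
   */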
@Override
public List<Event> close() throws Exception {
for (MRReader reader : readers) {
reader.close();
}
long inputRecords = getContext().getCounters()
.findCounter(TaskCounter.INPUT_RECORDS_PROCESSED).getValue();
getContext().getStatisticsReporter().reportItemsProcessed(inputRecords);
return null;
}
@Override
public void start() throws Exception {
Preconditions.checkState(getNumPhysicalInputs() >= 0, "Expecting zero or more physical inputs");
}
}