blob: 078e348775407af789c13861244bea14fd56de68 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.drill.exec.physical.config;
import java.util.List;
import java.util.Map;
import com.google.common.base.Preconditions;
import com.google.common.collect.ArrayListMultimap;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.Lists;
import com.google.common.collect.Maps;
import org.apache.drill.common.expression.LogicalExpression;
import org.apache.drill.exec.physical.EndpointAffinity;
import org.apache.drill.exec.physical.MinorFragmentEndpoint;
import org.apache.drill.exec.physical.base.AbstractExchange;
import org.apache.drill.exec.physical.base.PhysicalOperator;
import org.apache.drill.exec.physical.base.Sender;
import org.apache.drill.exec.planner.fragment.ParallelizationInfo;
import org.apache.drill.exec.proto.CoordinationProtos.DrillbitEndpoint;
import com.fasterxml.jackson.annotation.JsonProperty;
/**
* DeMuxExchange is opposite of MuxExchange. It is used when the sender has overhead that is proportional to the
* number of receivers. DeMuxExchange is run one instance per Drillbit endpoint which collects and distributes data
* belonging to local receiving fragments running on the same Drillbit.
*
* Example:
* On a 3 node cluster, if the sender has 10 receivers on each node each sender requires 30 buffers. By inserting
* DeMuxExchange, we create one receiver per node which means total of 3 receivers for each sender. If the number of
* senders is 10, we use 10*3 buffers instead of 10*30. DeMuxExchange has a overhead of buffer space that is equal to
* number of local receivers. In this case each DeMuxExchange needs 10 buffers, so total of 3*10 buffers.
*/
public abstract class AbstractDeMuxExchange extends AbstractExchange {
protected final LogicalExpression expr;
// Ephemeral info used when creating execution fragments.
protected Map<Integer, MinorFragmentEndpoint> receiverToSenderMapping;
protected ArrayListMultimap<Integer, MinorFragmentEndpoint> senderToReceiversMapping;
private boolean isSenderReceiverMappingCreated;
public AbstractDeMuxExchange(@JsonProperty("child") PhysicalOperator child, @JsonProperty("expr") LogicalExpression expr) {
super(child);
this.expr = expr;
}
@JsonProperty("expr")
public LogicalExpression getExpression(){
return expr;
}
@Override
public ParallelizationInfo getSenderParallelizationInfo(List<DrillbitEndpoint> receiverFragmentEndpoints) {
Preconditions.checkArgument(receiverFragmentEndpoints != null && receiverFragmentEndpoints.size() > 0,
"Receiver fragment endpoint list should not be empty");
// We want to run one demux sender per Drillbit endpoint.
// Identify the number of unique Drillbit endpoints in receiver fragment endpoints.
List<DrillbitEndpoint> drillbitEndpoints = ImmutableSet.copyOf(receiverFragmentEndpoints).asList();
List<EndpointAffinity> affinities = Lists.newArrayList();
for(DrillbitEndpoint ep : drillbitEndpoints) {
affinities.add(new EndpointAffinity(ep, Double.POSITIVE_INFINITY));
}
return ParallelizationInfo.create(affinities.size(), affinities.size(), affinities);
}
@Override
public ParallelizationInfo getReceiverParallelizationInfo(List<DrillbitEndpoint> senderFragmentEndpoints) {
return ParallelizationInfo.UNLIMITED_WIDTH_NO_ENDPOINT_AFFINITY;
}
@Override
public Sender getSender(int minorFragmentId, PhysicalOperator child) {
createSenderReceiverMapping();
List<MinorFragmentEndpoint> receivers = senderToReceiversMapping.get(minorFragmentId);
if (receivers == null || receivers.size() <= 0) {
throw new IllegalStateException(String.format("Failed to find receivers for sender [%d]", minorFragmentId));
}
return new HashPartitionSender(receiverMajorFragmentId, child, expr, receivers);
}
/**
* In DeMuxExchange, sender fragment parallelization and endpoint assignment depends on receiver fragment endpoint
* assignments.
*/
@Override
public ParallelizationDependency getParallelizationDependency() {
return ParallelizationDependency.SENDER_DEPENDS_ON_RECEIVER;
}
protected void createSenderReceiverMapping() {
if (isSenderReceiverMappingCreated) {
return;
}
senderToReceiversMapping = ArrayListMultimap.create();
receiverToSenderMapping = Maps.newHashMap();
// Find the list of receiver fragment ids assigned to each Drillbit endpoint
ArrayListMultimap<DrillbitEndpoint, Integer> endpointReceiverList = ArrayListMultimap.create();
int receiverFragmentId = 0;
for(DrillbitEndpoint receiverLocation : receiverLocations) {
endpointReceiverList.put(receiverLocation, receiverFragmentId);
receiverFragmentId++;
}
int senderFragmentId = 0;
for(DrillbitEndpoint senderLocation : senderLocations) {
final List<Integer> receiverMinorFragmentIds = endpointReceiverList.get(senderLocation);
for(Integer receiverId : receiverMinorFragmentIds) {
receiverToSenderMapping.put(receiverId, new MinorFragmentEndpoint(senderFragmentId, senderLocation));
senderToReceiversMapping.put(senderFragmentId,
new MinorFragmentEndpoint(receiverId, receiverLocations.get(receiverId)));
}
senderFragmentId++;
}
isSenderReceiverMappingCreated = true;
}
}