/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.pen;
import java.io.IOException;
import java.util.Collection;
import java.util.Collections;
import java.util.Comparator;
import java.util.HashMap;
import java.util.HashSet;
import java.util.IdentityHashMap;
import java.util.Iterator;
import java.util.LinkedList;
import java.util.List;
import java.util.Map;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.PhysicalOperator;
import org.apache.pig.backend.hadoop.executionengine.physicalLayer.plans.PhysicalPlan;
import org.apache.pig.data.BagFactory;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.Tuple;
import org.apache.pig.impl.PigContext;
import org.apache.pig.impl.io.FileSpec;
import org.apache.pig.impl.logicalLayer.FrontendException;
import org.apache.pig.impl.util.IdentityHashSet;
import org.apache.pig.impl.util.Pair;
import org.apache.pig.newplan.Operator;
import org.apache.pig.newplan.logical.relational.LOCogroup;
import org.apache.pig.newplan.logical.relational.LOCross;
import org.apache.pig.newplan.logical.relational.LODistinct;
import org.apache.pig.newplan.logical.relational.LOFilter;
import org.apache.pig.newplan.logical.relational.LOForEach;
import org.apache.pig.newplan.logical.relational.LOJoin;
import org.apache.pig.newplan.logical.relational.LOLimit;
import org.apache.pig.newplan.logical.relational.LOLoad;
import org.apache.pig.newplan.logical.relational.LOSort;
import org.apache.pig.newplan.logical.relational.LOSplit;
import org.apache.pig.newplan.logical.relational.LOSplitOutput;
import org.apache.pig.newplan.logical.relational.LOStore;
import org.apache.pig.newplan.logical.relational.LOUnion;
import org.apache.pig.newplan.logical.relational.LogicalPlan;
import org.apache.pig.newplan.logical.relational.LogicalRelationalNodesVisitor;
import org.apache.pig.newplan.logical.relational.LogicalRelationalOperator;
import org.apache.pig.pen.util.ExampleTuple;
import org.apache.pig.pen.util.LineageTracer;
import org.apache.pig.pen.util.MetricEvaluation;
import org.apache.pig.pen.util.PreOrderDepthFirstWalker;
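/**
 * Walks the logical plan in pre-order (from the sinks toward the loads) and
 * greedily trims the example base data at each operator, keeping a pruned
 * data set only while the completeness metric of the derived examples does
 * not drop. Part of the example-generation (ILLUSTRATE) machinery; driven by
 * {@link ExampleGenerator}.
 */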
public class LineageTrimmingVisitor extends LogicalRelationalNodesVisitor {
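    // Key state: baseData maps each LOLoad to its (possibly pruned) input bag,
    // completeness is the current completeness score of the derived examples,
    // inputToDataMap memoizes pruned bags per input FileSpec, and Lineage /
    // AffinityGroups hold the lineage tracer and equivalence classes for the
    // operators the pre-order walk has yet to process.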
LogicalPlan plan = null;
Map<LOLoad, DataBag> baseData;
Map<FileSpec, DataBag> inputToDataMap;
Map<Operator, PhysicalOperator> LogToPhyMap = null;
PhysicalPlan physPlan = null;
double completeness = 100.0;
Log log = LogFactory.getLog(getClass());
Map<Operator, Collection<IdentityHashSet<Tuple>>> AffinityGroups = new HashMap<Operator, Collection<IdentityHashSet<Tuple>>>();
Map<Operator, LineageTracer> Lineage = new HashMap<Operator, LineageTracer>();
boolean continueTrimming;
PigContext pc;
private ExampleGenerator eg;
public LineageTrimmingVisitor(LogicalPlan plan,
Map<LOLoad, DataBag> baseData,
ExampleGenerator eg,
Map<Operator, PhysicalOperator> LogToPhyMap,
PhysicalPlan physPlan, PigContext pc) throws IOException, InterruptedException {
super(plan, new PreOrderDepthFirstWalker(plan));
this.baseData = baseData;
this.plan = plan;
this.LogToPhyMap = LogToPhyMap;
this.pc = pc;
this.physPlan = physPlan;
this.eg = eg;
this.inputToDataMap = new HashMap<FileSpec, DataBag>();
init();
}
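    /**
     * Seeds the trimming state: attaches the full lineage tracer and the
     * complete set of equivalence classes to every sink of the plan, and
     * records the completeness of the unpruned base data as the baseline to
     * preserve.
     */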
public void init() throws IOException, InterruptedException {
Map<Operator, DataBag> data = eg.getData();
LineageTracer lineage = eg.getLineage();
Map<LogicalRelationalOperator, Collection<IdentityHashSet<Tuple>>> OpToEqClasses = eg.getLoToEqClassMap();
for (Operator leaf : plan.getSinks()) {
Lineage.put(leaf, lineage);
AffinityGroups.put(leaf, eg.getEqClasses());
}
completeness = MetricEvaluation.getCompleteness(null,
data, OpToEqClasses, true);
LogToPhyMap = eg.getLogToPhyMap();
continueTrimming = true;
}
@Override
public void visit(LOCogroup cg) throws FrontendException {
        // a CoGroup immediately followed by a ForEach is handled as a single
        // unit when the ForEach is visited (see processOperator), so skip it here
        if (plan.getSuccessors(cg) != null && plan.getSuccessors(cg).get(0) instanceof LOForEach)
            return;
if (continueTrimming) {
try {
continueTrimming = checkCompleteness(cg);
LineageTracer lineage = null;
// create affinity groups
if (cg.getInputs(plan).size() == 1) {
lineage = eg.getLineage();
AffinityGroups.put(cg.getInputs(plan).get(0), eg.getEqClasses());
Lineage.put(cg.getInputs(plan).get(0), lineage);
} else {
for (Operator input : cg.getInputs(plan)) {
Lineage.put(input, eg.getLineage());
AffinityGroups.put(input, eg.getEqClasses());
}
}
            } catch (Exception e) {
                throw new FrontendException("Exception : " + e.getMessage(), e);
            }
}
}
@Override
public void visit(LOJoin join) throws FrontendException {
if (continueTrimming) {
processOperator(join);
}
}
@Override
public void visit(LOCross cs) throws FrontendException {
        if (continueTrimming)
            processOperator(cs);
}
@Override
public void visit(LODistinct dt) throws FrontendException {
        if (continueTrimming)
            processOperator(dt);
}
@Override
public void visit(LOFilter filter) throws FrontendException {
if (continueTrimming)
processOperator(filter);
}
@Override
public void visit(LOStore store) throws FrontendException {
if (continueTrimming)
processOperator(store);
}
@Override
public void visit(LOForEach forEach) throws FrontendException {
if (continueTrimming)
processOperator(forEach);
}
@Override
public void visit(LOLimit limOp) throws FrontendException {
        if (continueTrimming)
            processOperator(limOp);
}
@Override
public void visit(LOLoad load) throws FrontendException {
if (continueTrimming)
processOperator(load);
}
@Override
public void visit(LOSort s) throws FrontendException {
        if (continueTrimming)
            processOperator(s);
}
@Override
public void visit(LOSplit split) throws FrontendException {
        if (continueTrimming)
            processOperator(split);
}
@Override
public void visit(LOSplitOutput split) throws FrontendException {
        if (continueTrimming)
            processOperator(split);
}
@Override
public void visit(LOUnion u) throws FrontendException {
        if (continueTrimming)
            processOperator(u);
}
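    /**
     * Greedy weighted set cover over lineage groups: repeatedly selects the
     * group covering the most still-uncovered equivalence classes (ties
     * broken by lower group weight), then rebuilds the base data keeping only
     * the tuples that belong to a selected group.
     */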
    private Map<LOLoad, DataBag> pruneBaseDataConstrainedCoverage(
            Map<LOLoad, DataBag> baseData,
            LineageTracer lineage,
            Collection<IdentityHashSet<Tuple>> equivalenceClasses) {
IdentityHashMap<Tuple, Collection<Tuple>> membershipMap = lineage
.getMembershipMap();
IdentityHashMap<Tuple, Double> lineageGroupWeights = lineage
.getWeightedCounts(2f, 1);
        // compute a mapping from each lineage group to the set of equivalence
        // classes it covers
        IdentityHashMap<Tuple, Set<IdentityHashSet<Tuple>>> lineageGroupToEquivClasses =
                new IdentityHashMap<Tuple, Set<IdentityHashSet<Tuple>>>();
for (IdentityHashSet<Tuple> equivClass : equivalenceClasses) {
for (Object t : equivClass) {
Tuple lineageGroup = lineage.getRepresentative((Tuple) t);
                Set<IdentityHashSet<Tuple>> entry = lineageGroupToEquivClasses
                        .get(lineageGroup);
                if (entry == null) {
                    entry = new HashSet<IdentityHashSet<Tuple>>();
                    lineageGroupToEquivClasses.put(lineageGroup, entry);
                }
                entry.add(equivClass);
}
}
// select lineage groups such that we cover all equivalence classes
IdentityHashSet<Tuple> selectedLineageGroups = new IdentityHashSet<Tuple>();
while (!lineageGroupToEquivClasses.isEmpty()) {
            // greedily pick the lineage group covering the most equivalence
            // classes, breaking ties in favor of the lower group weight
double bestWeight = -1;
Tuple bestLineageGroup = null;
Set<IdentityHashSet<Tuple>> bestEquivClassesCovered = null;
int bestNumEquivClassesCovered = 0;
for (Tuple lineageGroup : lineageGroupToEquivClasses.keySet()) {
double weight = lineageGroupWeights.get(lineageGroup);
Set<IdentityHashSet<Tuple>> equivClassesCovered = lineageGroupToEquivClasses
.get(lineageGroup);
int numEquivClassesCovered = equivClassesCovered.size();
if ((numEquivClassesCovered > bestNumEquivClassesCovered) ||
(numEquivClassesCovered == bestNumEquivClassesCovered && weight < bestWeight)) {
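                    // a group selected in a previous round covers these
                    // classes at no additional cost, so prefer it outright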
if (selectedLineageGroups.contains(lineageGroup)) {
bestLineageGroup = lineageGroup;
bestEquivClassesCovered = equivClassesCovered;
continue;
}
bestWeight = weight;
bestLineageGroup = lineageGroup;
bestNumEquivClassesCovered = numEquivClassesCovered;
bestEquivClassesCovered = equivClassesCovered;
}
}
// add the best-scoring lineage group to the set of ones we plan to
// retain
selectedLineageGroups.add(bestLineageGroup);
            // copy bestEquivClassesCovered: it aliases one of the per-group
            // sets, and the removal loop below mutates those sets
            Set<IdentityHashSet<Tuple>> toCopy = bestEquivClassesCovered;
            bestEquivClassesCovered = new HashSet<IdentityHashSet<Tuple>>(toCopy);
// remove the classes we've now covered
Collection<Tuple> toRemove = new LinkedList<Tuple>();
for (Tuple lineageGroup : lineageGroupToEquivClasses.keySet()) {
Set<IdentityHashSet<Tuple>> equivClasses = lineageGroupToEquivClasses
.get(lineageGroup);
for (Iterator<IdentityHashSet<Tuple>> it = equivClasses
.iterator(); it.hasNext();) {
IdentityHashSet<Tuple> equivClass = it.next();
if (bestEquivClassesCovered.contains(equivClass)) {
it.remove();
}
}
                if (equivClasses.isEmpty())
                    toRemove.add(lineageGroup);
}
for (Tuple removeMe : toRemove)
lineageGroupToEquivClasses.remove(removeMe);
}
// revise baseData to only contain the tuples that are part of
// selectedLineageGroups
IdentityHashSet<Tuple> tuplesToRetain = new IdentityHashSet<Tuple>();
for (Tuple lineageGroup : selectedLineageGroups) {
Collection<Tuple> members = membershipMap.get(lineageGroup);
for (Tuple t : members)
tuplesToRetain.add(t);
}
Map<LOLoad, DataBag> newBaseData = new HashMap<LOLoad, DataBag>();
for (LOLoad loadOp : baseData.keySet()) {
DataBag data = baseData.get(loadOp);
            DataBag newData = BagFactory.getInstance().newDefaultBag();
for (Iterator<Tuple> it = data.iterator(); it.hasNext();) {
Tuple t = it.next();
if (tuplesToRetain.contains(t))
newData.add(t);
}
newBaseData.put(loadOp, newData);
}
return newBaseData;
}
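    /**
     * Prunes the records of a single load: splits its bag into real and
     * synthetic tuples, then re-adds tuples (real first, synthetic only if
     * needed) until the previous completeness is matched. The pruned bag is
     * memoized per FileSpec so loads of the same input share one result.
     */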
private void processLoad(LOLoad ld) throws FrontendException {
// prune base records
if (inputToDataMap.get(ld.getFileSpec()) != null) {
baseData.put(ld, inputToDataMap.get(ld.getFileSpec()));
return;
}
DataBag data = baseData.get(ld);
if (data == null || data.size() < 2)
return;
Set<Tuple> realData = new HashSet<Tuple>(), syntheticData = new HashSet<Tuple>();
for (Iterator<Tuple> it = data.iterator(); it.hasNext(); ) {
Tuple t = it.next();
if (((ExampleTuple)t).synthetic)
syntheticData.add(t);
else
realData.add(t);
}
Map<LOLoad, DataBag> newBaseData = new HashMap<LOLoad, DataBag>();
DataBag newData = BagFactory.getInstance().newDefaultBag();
newBaseData.put(ld, newData);
for (Map.Entry<LOLoad, DataBag> entry : baseData.entrySet()) {
if (entry.getKey() != ld) {
if (!entry.getKey().getFileSpec().equals(ld.getFileSpec()))
newBaseData.put(entry.getKey(), entry.getValue());
else
newBaseData.put(entry.getKey(), newData);
}
}
if (checkNewBaseData(newData, newBaseData, realData))
checkNewBaseData(newData, newBaseData, syntheticData);
inputToDataMap.put(ld.getFileSpec(), baseData.get(ld));
}
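    /**
     * Tries to restore the current completeness with as few tuples from
     * loadData as possible. Pass one adds candidates cumulatively, recording
     * the completeness reached after each addition and stopping early once
     * the old score is matched; pass two resets the bag and re-adds
     * candidates in order of decreasing recorded completeness until the score
     * is reached, at which point newBaseData is committed and false is
     * returned. Returns true if this candidate set alone cannot restore
     * completeness.
     */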
private boolean checkNewBaseData(DataBag data, Map<LOLoad, DataBag> newBaseData, Set<Tuple> loadData) throws FrontendException {
List<Pair<Tuple, Double>> sortedBase = new LinkedList<Pair<Tuple, Double>>();
DataBag oldData = BagFactory.getInstance().newDefaultBag();
oldData.addAll(data);
double tmpCompleteness = completeness;
for (Tuple t : loadData) {
data.add(t);
// obtain the derived data
Map<Operator, DataBag> derivedData;
try {
derivedData = eg.getData(newBaseData);
            } catch (Exception e) {
                throw new FrontendException("Exception: " + e.getMessage(), e);
            }
double newCompleteness = MetricEvaluation.getCompleteness(null,
derivedData, eg.getLoToEqClassMap(), true);
sortedBase.add(new Pair<Tuple, Double>(t, Double.valueOf(newCompleteness)));
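            // stop probing once the tuples added so far already restore the
            // previous completeness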
if (newCompleteness >= tmpCompleteness)
break;
}
        // sort candidates by the completeness they reached, highest first
        Collections.sort(sortedBase, new Comparator<Pair<Tuple, Double>>() {
            @Override
            public int compare(Pair<Tuple, Double> o1,
                               Pair<Tuple, Double> o2) {
                // compare boxed Doubles by value; == would compare references
                return o2.second.compareTo(o1.second);
            }
        });
data.clear();
data.addAll(oldData);
for (Pair<Tuple, Double> p : sortedBase) {
data.add(p.first);
// obtain the derived data
Map<Operator, DataBag> derivedData;
try {
derivedData = eg.getData(newBaseData);
            } catch (Exception e) {
                throw new FrontendException("Exception: " + e.getMessage(), e);
            }
double newCompleteness = MetricEvaluation.getCompleteness(null,
derivedData, eg.getLoToEqClassMap(), true);
if (newCompleteness >= completeness) {
completeness = newCompleteness;
baseData.putAll(newBaseData);
return false;
}
}
return true;
}
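    /**
     * Prunes the base data with respect to one operator, then hands the
     * current lineage tracer and equivalence classes to the operator's
     * predecessors so the walk can continue upstream. A ForEach directly on
     * top of a CoGroup is treated as a unit: the classes are attached to the
     * CoGroup's inputs instead.
     */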
private void processOperator(LogicalRelationalOperator op) throws FrontendException {
try {
if (op instanceof LOLoad) {
processLoad((LOLoad) op);
return;
}
continueTrimming = checkCompleteness(op);
if (plan.getPredecessors(op) == null)
return;
            if (!continueTrimming)
                return;
Operator childOp = plan.getPredecessors(op).get(0);
            if (op instanceof LOForEach && childOp instanceof LOCogroup) {
LOCogroup cg = (LOCogroup) childOp;
for (Operator input : cg.getInputs(plan)) {
AffinityGroups.put(input, eg.getEqClasses());
Lineage.put(input, eg.getLineage());
}
} else {
List<Operator> childOps = plan.getPredecessors(op);
for (Operator lo : childOps) {
AffinityGroups.put(lo, eg.getEqClasses());
Lineage.put(lo, eg.getLineage());
}
}
        } catch (Exception e) {
            // chain the cause instead of printing the stack trace to stdout
            throw new FrontendException("Exception: " + e.getMessage(), e);
        }
}
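    /**
     * Prunes the base data under this operator's recorded lineage and
     * affinity groups, recomputes the derived data, and keeps the pruned base
     * data only if completeness does not drop; otherwise trimming stops.
     */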
private boolean checkCompleteness(LogicalRelationalOperator op) throws Exception {
LineageTracer lineage = Lineage.get(op);
Lineage.remove(op);
Collection<IdentityHashSet<Tuple>> affinityGroups = AffinityGroups
.get(op);
AffinityGroups.remove(op);
        Map<LOLoad, DataBag> newBaseData = pruneBaseDataConstrainedCoverage(
                baseData, lineage, affinityGroups);
// obtain the derived data
Map<Operator, DataBag> derivedData = eg.getData(newBaseData);
double newCompleteness = MetricEvaluation.getCompleteness(null,
derivedData, eg.getLoToEqClassMap(), true);
if (newCompleteness >= completeness) {
completeness = newCompleteness;
baseData.putAll(newBaseData);
} else {
continueTrimming = false;
}
return continueTrimming;
}
Map<LOLoad, DataBag> getBaseData() {
return baseData;
}
}