blob: 35dbcc472f66f515ec7ba6307f20f2b27273401d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.pig.pen.util;
import java.util.*;
import org.apache.pig.data.Tuple;
public class LineageTracer {
// Use textbook Union-Find data structure, with counts associated with items
// note: we test for equality by comparing tuple references, not by calling
// the "equals()" method
// the "IdentityHashMap" data structure is based on reference equality
IdentityHashMap<Tuple, Tuple> parents = new IdentityHashMap<Tuple, Tuple>();
IdentityHashMap<Tuple, Integer> counts = new IdentityHashMap<Tuple, Integer>(); // has
// one
// entry
// per
// unique
// tuple
// being
// tracked
IdentityHashMap<Tuple, Integer> ranks = new IdentityHashMap<Tuple, Integer>();
// insert a new tuple (if a tuple is inserted multiple times, it gets a
// count > 1)
public void insert(Tuple t) {
if (parents.containsKey(t)) {
counts.put(t, counts.get(t) + 1);
} else {
parents.put(t, t);
counts.put(t, 1);
ranks.put(t, 0);
}
}
// union two tuple sets
public void union(Tuple t1, Tuple t2) {
link(getRepresentative(t1), getRepresentative(t2));
}
// find the set representative of a given tuple
public Tuple getRepresentative(Tuple t) {
Tuple tParent = parents.get(t);
if (tParent != t) {
tParent = getRepresentative(tParent);
parents.put(t, tParent);
}
return tParent;
}
private void link(Tuple t1, Tuple t2) {
int t1Rank = ranks.get(t1);
int t2Rank = ranks.get(t2);
if (t1Rank > t2Rank) {
parents.put(t2, t1);
} else {
parents.put(t1, t2);
if (t1Rank == t2Rank)
ranks.put(t2, t2Rank + 1);
}
}
// get the cardinality of each tuple set (identified by a representative
// tuple)
public IdentityHashMap<Tuple, Double> getCounts() {
return getWeightedCounts(2f, 1f);
}
// get the cardinality of each tuple set, weighted in a special way
// weighting works like this: if a tuple set contains one or more tuples
// from the "specialTuples" set, we multiply its value by "multiplier"
// public IdentityHashMap<Tuple, Integer>
// getWeightedCounts(IdentityHashSet<Tuple> specialTuples, int multiplier) {
// IdentityHashMap<Tuple, Integer> repCounts = new IdentityHashMap<Tuple,
// Integer>();
// IdentityHashSet<Tuple> specialSets = new IdentityHashSet<Tuple>();
//
// for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) {
// Tuple t = e.getKey();
//
// int newCount = counts.get(t);
// Tuple rep = getRepresentative(t);
// int oldCount = (repCounts.containsKey(rep))? repCounts.get(rep) : 0;
// repCounts.put(rep, oldCount + newCount);
// if (specialTuples.contains(t)) specialSets.add(rep);
// }
//
// for (IdentityHashMap.Entry<Tuple, Integer> e : repCounts.entrySet()) {
// if (specialSets.contains(e.getKey())) e.setValue(e.getValue() *
// multiplier);
// }
//
// return repCounts;
// }
public IdentityHashMap<Tuple, Double> getWeightedCounts(
float syntheticMultipler, float omittableMultiplier) {
IdentityHashMap<Tuple, Double> repCounts = new IdentityHashMap<Tuple, Double>();
for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) {
Tuple t = e.getKey();
float newCount = counts.get(t);
if (((ExampleTuple) t).synthetic)
newCount = newCount * syntheticMultipler;
if (((ExampleTuple) t).omittable)
newCount = newCount * omittableMultiplier;
Tuple rep = getRepresentative(t);
double oldCount = (repCounts.containsKey(rep)) ? repCounts.get(rep)
: 0;
repCounts.put(rep, oldCount + newCount);
// if (specialTuples.contains(t)) specialSets.add(rep);
}
/*
* for (IdentityHashMap.Entry<Tuple, Integer> e : repCounts.entrySet())
* { if (specialSets.contains(e.getKey())) e.setValue(e.getValue()
* multiplier); }
*/
return repCounts;
}
// get all members of the set containing t
public Collection<Tuple> getMembers(Tuple t) {
Tuple representative = getRepresentative(t);
Collection<Tuple> members = new LinkedList<Tuple>();
for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) {
Tuple t1 = e.getKey();
if (getRepresentative(t1) == representative)
members.add(t1);
}
return members;
}
// get a mapping from set representatives to members
public IdentityHashMap<Tuple, Collection<Tuple>> getMembershipMap() {
IdentityHashMap<Tuple, Collection<Tuple>> map = new IdentityHashMap<Tuple, Collection<Tuple>>();
for (IdentityHashMap.Entry<Tuple, Integer> e : counts.entrySet()) {
Tuple t = e.getKey();
Tuple representative = getRepresentative(t);
Collection<Tuple> members = map.get(representative);
if (members == null) {
members = new LinkedList<Tuple>();
map.put(representative, members);
}
members.add(t);
}
return map;
}
}