src/org/apache/pig/pen/util/LineageTracer.java - pig - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
 package org.apache.pig.pen.util;

 import java.util.*;

 import org.apache.pig.data.Tuple;

 /**
  * Groups tuples into lineage sets using a Union-Find (disjoint-set) forest
  * with union by rank and path compression, and remembers how many times each
  * tuple was inserted.
  *
  * <p>Tuples are compared by reference, never by {@code equals()}; that is why
  * every map in this class is an {@link IdentityHashMap}.
  */
 public class LineageTracer {

     // parent pointer of every tracked tuple; a set representative is its own
     // parent
     IdentityHashMap<Tuple, Tuple> parents = new IdentityHashMap<Tuple, Tuple>();
     // has one entry per unique tuple being tracked: the number of times that
     // tuple was inserted
     IdentityHashMap<Tuple, Integer> counts = new IdentityHashMap<Tuple, Integer>();
     // rank of every tracked tuple; link() uses it to decide which root
     // survives a union
     IdentityHashMap<Tuple, Integer> ranks = new IdentityHashMap<Tuple, Integer>();

     /**
      * Inserts a tuple. A tuple seen for the first time becomes a singleton
      * set with count 1 and rank 0; a tuple that is inserted again only has
      * its count incremented.
      *
      * @param t the tuple to track
      */
     public void insert(Tuple t) {
         if (parents.containsKey(t)) {
             counts.put(t, counts.get(t) + 1);
         } else {
             parents.put(t, t);
             counts.put(t, 1);
             ranks.put(t, 0);
         }
     }

     /**
      * Unions the sets containing the two tuples.
      *
      * <p>Both tuples are expected to have been inserted first; an untracked
      * tuple has no rank, so the rank lookup in {@code link()} normally fails
      * with a {@link NullPointerException}.
      *
      * @param t1 a tuple of the first set
      * @param t2 a tuple of the second set
      */
     public void union(Tuple t1, Tuple t2) {
         link(getRepresentative(t1), getRepresentative(t2));
     }

     /**
      * Finds the set representative of a tuple, compressing the path from the
      * tuple to its root along the way.
      *
      * @param t the tuple to look up
      * @return the representative of the set containing {@code t}, or
      *         {@code null} if {@code t} has no parent entry (it was never
      *         inserted)
      */
     public Tuple getRepresentative(Tuple t) {
         Tuple tParent = parents.get(t);
         if (tParent == null) {
             // No parent entry: report "no representative" WITHOUT storing a
             // t -> null mapping. Such a mapping would make a later insert(t)
             // believe t is already tracked and fail on its missing count.
             return null;
         }
         if (tParent != t) {
             tParent = getRepresentative(tParent);
             // path compression: point t straight at the root
             parents.put(t, tParent);
         }
         return tParent;
     }

     /**
      * Links two set roots by rank: the root with the smaller rank is attached
      * below the other one; on a tie {@code t1} goes below {@code t2} and the
      * rank of {@code t2} grows by one.
      */
     private void link(Tuple t1, Tuple t2) {
         // Ranks are read before the identity check so that a root without a
         // rank entry still fails here exactly as it did before.
         int t1Rank = ranks.get(t1);
         int t2Rank = ranks.get(t2);
         if (t1 == t2) {
             // Both tuples already share this root; nothing is merged, so the
             // rank must not be bumped.
             return;
         }
         if (t1Rank > t2Rank) {
             parents.put(t2, t1);
         } else {
             parents.put(t1, t2);
             if (t1Rank == t2Rank) {
                 ranks.put(t2, t2Rank + 1);
             }
         }
     }

     /**
      * Gets the weighted count of each tuple set (identified by a
      * representative tuple), using a synthetic multiplier of 2 and an
      * omittable multiplier of 1.
      *
      * @return map from set representative to the weighted count of its set
      */
     public IdentityHashMap<Tuple, Double> getCounts() {
         return getWeightedCounts(2f, 1f);
     }

     /**
      * Gets the weighted count of each tuple set. The insertion count of each
      * tracked tuple is multiplied by {@code syntheticMultipler} if the tuple
      * is flagged synthetic and by {@code omittableMultiplier} if it is
      * flagged omittable; the results are summed per set representative.
      *
      * @param syntheticMultipler  factor applied to the count of synthetic
      *                            tuples
      * @param omittableMultiplier factor applied to the count of omittable
      *                            tuples
      * @return map from set representative to the weighted count of its set
      * @throws ClassCastException if a tracked tuple is not an
      *                            {@code ExampleTuple}
      */
     public IdentityHashMap<Tuple, Double> getWeightedCounts(
             float syntheticMultipler, float omittableMultiplier) {
         IdentityHashMap<Tuple, Double> repCounts = new IdentityHashMap<Tuple, Double>();

         for (Map.Entry<Tuple, Integer> e : counts.entrySet()) {
             Tuple t = e.getKey();
             ExampleTuple example = (ExampleTuple) t;

             // kept in float so the weighting arithmetic matches the
             // multiplier type
             float newCount = e.getValue();
             if (example.synthetic) {
                 newCount = newCount * syntheticMultipler;
             }
             if (example.omittable) {
                 newCount = newCount * omittableMultiplier;
             }

             Tuple rep = getRepresentative(t);
             Double previous = repCounts.get(rep);
             double oldCount = (previous == null) ? 0 : previous;
             repCounts.put(rep, oldCount + newCount);
         }
         return repCounts;
     }

     /**
      * Gets all members of the set containing a tuple.
      *
      * @param t a tuple of the set of interest
      * @return every tracked tuple whose representative is the representative
      *         of {@code t}
      */
     public Collection<Tuple> getMembers(Tuple t) {
         Tuple representative = getRepresentative(t);

         Collection<Tuple> members = new LinkedList<Tuple>();
         for (Tuple candidate : counts.keySet()) {
             if (getRepresentative(candidate) == representative) {
                 members.add(candidate);
             }
         }
         return members;
     }

     /**
      * Gets a mapping from set representatives to the members of their sets.
      *
      * @return map whose keys are the representatives and whose values hold
      *         every tracked tuple of the corresponding set
      */
     public IdentityHashMap<Tuple, Collection<Tuple>> getMembershipMap() {
         IdentityHashMap<Tuple, Collection<Tuple>> map = new IdentityHashMap<Tuple, Collection<Tuple>>();
         for (Tuple t : counts.keySet()) {
             Tuple representative = getRepresentative(t);
             Collection<Tuple> members = map.get(representative);
             if (members == null) {
                 members = new LinkedList<Tuple>();
                 map.put(representative, members);
             }
             members.add(t);
         }
         return map;
     }
 }
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	* See the License for the specific language governing permissions and
	* limitations under the License.
	*/
	package org.apache.pig.pen.util;

	import java.util.*;

	import org.apache.pig.data.Tuple;

	/**
	 * Groups tuples into lineage sets using a Union-Find (disjoint-set) forest
	 * with union by rank and path compression, and remembers how many times each
	 * tuple was inserted.
	 *
	 * <p>Tuples are compared by reference, never by {@code equals()}; that is why
	 * every map in this class is an {@link IdentityHashMap}.
	 */
	public class LineageTracer {

		// parent pointer of every tracked tuple; a set representative is its own
		// parent
		IdentityHashMap<Tuple, Tuple> parents = new IdentityHashMap<Tuple, Tuple>();
		// has one entry per unique tuple being tracked: the number of times that
		// tuple was inserted
		IdentityHashMap<Tuple, Integer> counts = new IdentityHashMap<Tuple, Integer>();
		// rank of every tracked tuple; link() uses it to decide which root
		// survives a union
		IdentityHashMap<Tuple, Integer> ranks = new IdentityHashMap<Tuple, Integer>();

		/**
		 * Inserts a tuple. A tuple seen for the first time becomes a singleton
		 * set with count 1 and rank 0; a tuple that is inserted again only has
		 * its count incremented.
		 *
		 * @param t the tuple to track
		 */
		public void insert(Tuple t) {
			if (parents.containsKey(t)) {
				counts.put(t, counts.get(t) + 1);
			} else {
				parents.put(t, t);
				counts.put(t, 1);
				ranks.put(t, 0);
			}
		}

		/**
		 * Unions the sets containing the two tuples.
		 *
		 * <p>Both tuples are expected to have been inserted first; an untracked
		 * tuple has no rank, so the rank lookup in {@code link()} normally fails
		 * with a {@link NullPointerException}.
		 *
		 * @param t1 a tuple of the first set
		 * @param t2 a tuple of the second set
		 */
		public void union(Tuple t1, Tuple t2) {
			link(getRepresentative(t1), getRepresentative(t2));
		}

		/**
		 * Finds the set representative of a tuple, compressing the path from the
		 * tuple to its root along the way.
		 *
		 * @param t the tuple to look up
		 * @return the representative of the set containing {@code t}, or
		 *         {@code null} if {@code t} has no parent entry (it was never
		 *         inserted)
		 */
		public Tuple getRepresentative(Tuple t) {
			Tuple tParent = parents.get(t);
			if (tParent == null) {
				// No parent entry: report "no representative" WITHOUT storing a
				// t -> null mapping. Such a mapping would make a later insert(t)
				// believe t is already tracked and fail on its missing count.
				return null;
			}
			if (tParent != t) {
				tParent = getRepresentative(tParent);
				// path compression: point t straight at the root
				parents.put(t, tParent);
			}
			return tParent;
		}

		/**
		 * Links two set roots by rank: the root with the smaller rank is attached
		 * below the other one; on a tie {@code t1} goes below {@code t2} and the
		 * rank of {@code t2} grows by one.
		 */
		private void link(Tuple t1, Tuple t2) {
			// Ranks are read before the identity check so that a root without a
			// rank entry still fails here exactly as it did before.
			int t1Rank = ranks.get(t1);
			int t2Rank = ranks.get(t2);
			if (t1 == t2) {
				// Both tuples already share this root; nothing is merged, so the
				// rank must not be bumped.
				return;
			}
			if (t1Rank > t2Rank) {
				parents.put(t2, t1);
			} else {
				parents.put(t1, t2);
				if (t1Rank == t2Rank) {
					ranks.put(t2, t2Rank + 1);
				}
			}
		}

		/**
		 * Gets the weighted count of each tuple set (identified by a
		 * representative tuple), using a synthetic multiplier of 2 and an
		 * omittable multiplier of 1.
		 *
		 * @return map from set representative to the weighted count of its set
		 */
		public IdentityHashMap<Tuple, Double> getCounts() {
			return getWeightedCounts(2f, 1f);
		}

		/**
		 * Gets the weighted count of each tuple set. The insertion count of each
		 * tracked tuple is multiplied by {@code syntheticMultipler} if the tuple
		 * is flagged synthetic and by {@code omittableMultiplier} if it is
		 * flagged omittable; the results are summed per set representative.
		 *
		 * @param syntheticMultipler  factor applied to the count of synthetic
		 *                            tuples
		 * @param omittableMultiplier factor applied to the count of omittable
		 *                            tuples
		 * @return map from set representative to the weighted count of its set
		 * @throws ClassCastException if a tracked tuple is not an
		 *                            {@code ExampleTuple}
		 */
		public IdentityHashMap<Tuple, Double> getWeightedCounts(
				float syntheticMultipler, float omittableMultiplier) {
			IdentityHashMap<Tuple, Double> repCounts = new IdentityHashMap<Tuple, Double>();

			for (Map.Entry<Tuple, Integer> e : counts.entrySet()) {
				Tuple t = e.getKey();
				ExampleTuple example = (ExampleTuple) t;

				// kept in float so the weighting arithmetic matches the
				// multiplier type
				float newCount = e.getValue();
				if (example.synthetic) {
					newCount = newCount * syntheticMultipler;
				}
				if (example.omittable) {
					newCount = newCount * omittableMultiplier;
				}

				Tuple rep = getRepresentative(t);
				Double previous = repCounts.get(rep);
				double oldCount = (previous == null) ? 0 : previous;
				repCounts.put(rep, oldCount + newCount);
			}
			return repCounts;
		}

		/**
		 * Gets all members of the set containing a tuple.
		 *
		 * @param t a tuple of the set of interest
		 * @return every tracked tuple whose representative is the representative
		 *         of {@code t}
		 */
		public Collection<Tuple> getMembers(Tuple t) {
			Tuple representative = getRepresentative(t);

			Collection<Tuple> members = new LinkedList<Tuple>();
			for (Tuple candidate : counts.keySet()) {
				if (getRepresentative(candidate) == representative) {
					members.add(candidate);
				}
			}
			return members;
		}

		/**
		 * Gets a mapping from set representatives to the members of their sets.
		 *
		 * @return map whose keys are the representatives and whose values hold
		 *         every tracked tuple of the corresponding set
		 */
		public IdentityHashMap<Tuple, Collection<Tuple>> getMembershipMap() {
			IdentityHashMap<Tuple, Collection<Tuple>> map = new IdentityHashMap<Tuple, Collection<Tuple>>();
			for (Tuple t : counts.keySet()) {
				Tuple representative = getRepresentative(t);
				Collection<Tuple> members = map.get(representative);
				if (members == null) {
					members = new LinkedList<Tuple>();
					map.put(representative, members);
				}
				members.add(t);
			}
			return map;
		}
	}