blob: 932036eed7c0aba384b91e75b2e869b99d793a8e [file] [log] [blame]
package edu.uci.ics.asterix.runtime.evaluators.common;
import java.io.DataOutput;
import java.io.IOException;
import edu.uci.ics.asterix.formats.nontagged.AqlSerializerDeserializerProvider;
import edu.uci.ics.asterix.om.base.AFloat;
import edu.uci.ics.asterix.om.base.AMutableFloat;
import edu.uci.ics.asterix.om.types.ATypeTag;
import edu.uci.ics.asterix.om.types.BuiltinType;
import edu.uci.ics.asterix.om.types.EnumDeserializer;
import edu.uci.ics.fuzzyjoin.similarity.SimilarityMetricJaccard;
import edu.uci.ics.hyracks.algebricks.common.exceptions.AlgebricksException;
import edu.uci.ics.hyracks.algebricks.runtime.base.IEvaluator;
import edu.uci.ics.hyracks.algebricks.runtime.base.IEvaluatorFactory;
import edu.uci.ics.hyracks.api.dataflow.value.ISerializerDeserializer;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.ArrayBackedValueStorage;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.IDataOutputProvider;
import edu.uci.ics.hyracks.dataflow.common.data.accessors.IFrameTupleReference;
// assumes that both arguments are sorted by the same ordering
public class SimilarityJaccardEvaluator implements IEvaluator {
// assuming type indicator in serde format
protected final int typeIndicatorSize = 1;
protected final DataOutput out;
protected final ArrayBackedValueStorage argOut = new ArrayBackedValueStorage();
protected final IEvaluator firstOrdListEval;
protected final IEvaluator secondOrdListEval;
protected final AsterixOrderedListIterator fstOrdListIter = new AsterixOrderedListIterator();
protected final AsterixOrderedListIterator sndOrdListIter = new AsterixOrderedListIterator();
protected final AsterixUnorderedListIterator fstUnordListIter = new AsterixUnorderedListIterator();
protected final AsterixUnorderedListIterator sndUnordListIter = new AsterixUnorderedListIterator();
protected AbstractAsterixListIterator firstListIter;
protected AbstractAsterixListIterator secondListIter;
protected final SimilarityMetricJaccard jaccard = new SimilarityMetricJaccard();
protected final AMutableFloat aFloat = new AMutableFloat(0);
@SuppressWarnings("unchecked")
protected final ISerializerDeserializer<AFloat> floatSerde = AqlSerializerDeserializerProvider.INSTANCE
.getSerializerDeserializer(BuiltinType.AFLOAT);
protected ATypeTag itemTypeTag;
protected ATypeTag firstTypeTag;
protected ATypeTag secondTypeTag;
protected int firstStart = -1;
protected int secondStart = -1;
protected float jaccSim = 0.0f;
public SimilarityJaccardEvaluator(IEvaluatorFactory[] args, IDataOutputProvider output) throws AlgebricksException {
out = output.getDataOutput();
firstOrdListEval = args[0].createEvaluator(argOut);
secondOrdListEval = args[1].createEvaluator(argOut);
}
@Override
public void evaluate(IFrameTupleReference tuple) throws AlgebricksException {
runArgEvals(tuple);
if (!checkArgTypes(firstTypeTag, secondTypeTag))
return;
jaccSim = computeResult(argOut.getBytes(), firstStart, secondStart, firstTypeTag);
try {
writeResult(jaccSim);
} catch (IOException e) {
throw new AlgebricksException(e);
}
}
protected void runArgEvals(IFrameTupleReference tuple) throws AlgebricksException {
argOut.reset();
firstStart = argOut.getLength();
firstOrdListEval.evaluate(tuple);
secondStart = argOut.getLength();
secondOrdListEval.evaluate(tuple);
firstTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getBytes()[firstStart]);
secondTypeTag = EnumDeserializer.ATYPETAGDESERIALIZER.deserialize(argOut.getBytes()[secondStart]);
}
protected float computeResult(byte[] bytes, int firstStart, int secondStart, ATypeTag argType)
throws AlgebricksException {
firstListIter.reset(bytes, firstStart);
secondListIter.reset(bytes, secondStart);
// Check for special case where one of the lists is empty, since list
// types won't match.
if (firstListIter.size() == 0 || secondListIter.size() == 0) {
try {
writeResult(0.0f);
} catch (IOException e) {
throw new AlgebricksException(e);
}
}
if (firstTypeTag == ATypeTag.ANY || secondTypeTag == ATypeTag.ANY)
throw new AlgebricksException("\n Jaccard can only be called on homogenous lists");
return jaccard.getSimilarity(firstListIter, secondListIter);
}
protected boolean checkArgTypes(ATypeTag typeTag1, ATypeTag typeTag2) throws AlgebricksException {
// jaccard between null and anything else is 0
if (typeTag1 == ATypeTag.NULL || typeTag2 == ATypeTag.NULL) {
try {
writeResult(0.0f);
} catch (IOException e) {
throw new AlgebricksException(e);
}
return false;
}
switch (typeTag1) {
case ORDEREDLIST: {
firstListIter = fstOrdListIter;
break;
}
case UNORDEREDLIST: {
firstListIter = fstUnordListIter;
break;
}
default: {
throw new AlgebricksException("Invalid types " + typeTag1 + " given as arguments to jaccard.");
}
}
switch (typeTag2) {
case ORDEREDLIST: {
secondListIter = sndOrdListIter;
break;
}
case UNORDEREDLIST: {
secondListIter = sndUnordListIter;
break;
}
default: {
throw new AlgebricksException("Invalid types " + typeTag2 + " given as arguments to jaccard.");
}
}
return true;
}
protected void writeResult(float jacc) throws IOException {
aFloat.setValue(jacc);
floatSerde.serialize(aFloat, out);
}
}