blob: d981e835d26f18c6f849db8345fdf159d57e462d [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.datasketches.pig.frequencies;
import java.util.Iterator;
import org.apache.pig.EvalFunc;
import org.apache.pig.data.DataBag;
import org.apache.pig.data.DataByteArray;
import org.apache.pig.data.DataType;
import org.apache.pig.data.Tuple;
import org.apache.pig.data.TupleFactory;
import org.apache.pig.impl.logicalLayer.schema.Schema;
import org.testng.Assert;
import org.testng.annotations.Test;
import org.apache.datasketches.ArrayOfStringsSerDe;
import org.apache.datasketches.frequencies.ItemsSketch;
import org.apache.datasketches.pig.tuple.PigUtil;
/**
 * TestNG tests for {@link FrequentStringsSketchToEstimates}: handling of null and empty
 * input, conversion of serialized string sketches into a bag of result tuples, and the
 * declared output schema.
 */
@SuppressWarnings("javadoc")
public class FrequentStringsSketchToEstimatesTest {

  /**
   * Serializes the given sketch and wraps the bytes in a single-field input tuple,
   * which is the form passed to {@code exec} throughout these tests.
   *
   * @param sketch the sketch to serialize
   * @return a tuple holding one {@link DataByteArray} with the serialized sketch
   */
  private static Tuple toInputTuple(final ItemsSketch<String> sketch) throws Exception {
    final byte[] serialized = sketch.toByteArray(new ArrayOfStringsSerDe());
    return PigUtil.objectsToTuple(new DataByteArray(serialized));
  }

  /**
   * Asserts that a result row has four fields: the expected item in field 0 and the
   * same expected value in each of the three numeric fields that follow it.
   *
   * @param row the result tuple to check
   * @param expectedItem the expected string in field 0
   * @param expectedValue the value expected in fields 1, 2 and 3
   */
  private static void assertExactRow(final Tuple row, final String expectedItem,
      final long expectedValue) throws Exception {
    Assert.assertEquals(row.size(), 4);
    Assert.assertEquals((String) row.get(0), expectedItem);
    for (int col = 1; col <= 3; col++) {
      Assert.assertEquals((long) row.get(col), expectedValue);
    }
  }

  /** A null input tuple must produce a null result. */
  @Test
  public void nullInput() throws Exception {
    final EvalFunc<DataBag> udf = new FrequentStringsSketchToEstimates();
    final DataBag result = udf.exec(null);
    Assert.assertNull(result);
  }

  /** A tuple with no fields must produce a null result. */
  @Test
  public void emptyInput() throws Exception {
    final EvalFunc<DataBag> udf = new FrequentStringsSketchToEstimates();
    final Tuple noFields = TupleFactory.getInstance().newTuple();
    final DataBag result = udf.exec(noFields);
    Assert.assertNull(result);
  }

  /** A sketch that received no updates must produce a non-null bag with no rows. */
  @Test
  public void emptySketch() throws Exception {
    final EvalFunc<DataBag> udf = new FrequentStringsSketchToEstimates();
    final ItemsSketch<String> untouched = new ItemsSketch<>(8);
    final DataBag result = udf.exec(toInputTuple(untouched));
    Assert.assertNotNull(result);
    Assert.assertEquals(result.size(), 0);
  }

  /**
   * Two distinct items ("a" twice, "b" once) must come back as two rows, "a" first,
   * with all three numeric fields of each row equal to the number of updates.
   */
  @Test
  public void exact() throws Exception {
    final EvalFunc<DataBag> udf = new FrequentStringsSketchToEstimates();
    final ItemsSketch<String> sketch = new ItemsSketch<>(8);
    sketch.update("a");
    sketch.update("a");
    sketch.update("b");

    final DataBag result = udf.exec(toInputTuple(sketch));
    Assert.assertNotNull(result);
    Assert.assertEquals(result.size(), 2);

    // Rows are consumed in iteration order: "a" is expected before "b".
    final Iterator<Tuple> rows = result.iterator();
    assertExactRow(rows.next(), "a", 2L);
    assertExactRow(rows.next(), "b", 1L);
  }

  /**
   * Feeds ten distinct items with decreasing weights into a sketch constructed with
   * argument 8, then checks that both error-type settings return fewer than ten rows
   * and that NO_FALSE_POSITIVES returns strictly fewer rows than NO_FALSE_NEGATIVES.
   * NOTE(review): presumably ten items exceed what this sketch tracks exactly, which
   * is why the results are estimates - confirm against the ItemsSketch documentation.
   */
  @Test
  public void estimation() throws Exception {
    final ItemsSketch<String> sketch = new ItemsSketch<>(8);
    // Items "1".."9" are updated in this order with the matching weight.
    final long[] weights = {1000, 500, 200, 100, 50, 20, 10, 5, 2};
    for (int i = 0; i < weights.length; i++) {
      sketch.update(Integer.toString(i + 1), weights[i]);
    }
    sketch.update("10");
    final Tuple input = toInputTuple(sketch);

    final EvalFunc<DataBag> noFalsePositives =
        new FrequentStringsSketchToEstimates("NO_FALSE_POSITIVES");
    final DataBag strictRows = noFalsePositives.exec(input);
    Assert.assertNotNull(strictRows);
    Assert.assertTrue(strictRows.size() < 10);

    final EvalFunc<DataBag> noFalseNegatives =
        new FrequentStringsSketchToEstimates("NO_FALSE_NEGATIVES");
    final DataBag inclusiveRows = noFalseNegatives.exec(input);
    Assert.assertNotNull(inclusiveRows);
    Assert.assertTrue(inclusiveRows.size() < 10);

    Assert.assertTrue(strictRows.size() < inclusiveRows.size());
  }

  /**
   * The output schema must be a single bag field containing a single tuple field whose
   * four fields are typed chararray, long, long, long.
   */
  @Test
  public void schema() throws Exception {
    final EvalFunc<DataBag> udf = new FrequentStringsSketchToEstimates();
    final Schema outer = udf.outputSchema(null);
    Assert.assertNotNull(outer);
    Assert.assertEquals(outer.size(), 1);

    // Level 1: the only top-level field is a bag.
    final Schema.FieldSchema bagField = outer.getField(0);
    Assert.assertEquals(bagField.type, DataType.BAG);
    final Schema bagContents = bagField.schema;
    Assert.assertEquals(bagContents.size(), 1);

    // Level 2: the bag holds a single tuple field.
    final Schema.FieldSchema tupleField = bagContents.getField(0);
    Assert.assertEquals(tupleField.type, DataType.TUPLE);
    final Schema rowSchema = tupleField.schema;
    Assert.assertEquals(rowSchema.size(), 4);

    // Level 3: one chararray followed by three longs.
    Assert.assertEquals(rowSchema.getField(0).type, DataType.CHARARRAY);
    for (int col = 1; col < 4; col++) {
      Assert.assertEquals(rowSchema.getField(col).type, DataType.LONG);
    }
  }
}