/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.sysds.test.functions.transform;
import org.junit.Assert;
import org.junit.Test;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.common.Types.ExecMode;
import org.apache.sysds.common.Types.FileFormat;
import org.apache.sysds.runtime.io.MatrixReaderFactory;
import org.apache.sysds.runtime.util.DataConverter;
import org.apache.sysds.test.AutomatedTestBase;
import org.apache.sysds.test.TestConfiguration;
import org.apache.sysds.test.TestUtils;
import org.apache.sysds.utils.Statistics;
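/**
 * Tests the consistency of transformencode and transformapply: the DML
 * script encodes the input frame and re-applies the generated metadata,
 * and the two outputs (tfout1, tfout2) are expected to be identical for
 * all transform types, spec variants (IDs vs. column names), and
 * execution modes exercised below.
 */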
public class TransformFrameEncodeApplyTest extends AutomatedTestBase
{
private final static String TEST_NAME1 = "TransformFrameEncodeApply";
private final static String TEST_DIR = "functions/transform/";
private final static String TEST_CLASS_DIR = TEST_DIR + TransformFrameEncodeApplyTest.class.getSimpleName() + "/";
//dataset and transform tasks without missing values
private final static String DATASET1 = "homes3/homes.csv";
private final static String SPEC1 = "homes3/homes.tfspec_recode.json";
private final static String SPEC1b = "homes3/homes.tfspec_recode2.json";
private final static String SPEC2 = "homes3/homes.tfspec_dummy.json";
private final static String SPEC2b = "homes3/homes.tfspec_dummy2.json";
private final static String SPEC3 = "homes3/homes.tfspec_bin.json"; //binning
private final static String SPEC3b = "homes3/homes.tfspec_bin2.json"; //binning
private final static String SPEC6 = "homes3/homes.tfspec_recode_dummy.json";
private final static String SPEC6b = "homes3/homes.tfspec_recode_dummy2.json";
private final static String SPEC7 = "homes3/homes.tfspec_binDummy.json"; //binning+dummycode
private final static String SPEC7b = "homes3/homes.tfspec_binDummy2.json"; //binning+dummycode
private final static String SPEC8 = "homes3/homes.tfspec_hash.json";
private final static String SPEC8b = "homes3/homes.tfspec_hash2.json";
private final static String SPEC9 = "homes3/homes.tfspec_hash_recode.json";
private final static String SPEC9b = "homes3/homes.tfspec_hash_recode2.json";
//dataset and transform tasks with missing values
private final static String DATASET2 = "homes/homes.csv";
private final static String SPEC4 = "homes3/homes.tfspec_impute.json";
private final static String SPEC4b = "homes3/homes.tfspec_impute2.json";
private final static String SPEC5 = "homes3/homes.tfspec_omit.json";
private final static String SPEC5b = "homes3/homes.tfspec_omit2.json";
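//expected bin ids of columns 3 and 8 (first 7 rows), used to validate binning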
private static final int[] BIN_col3 = new int[]{1,4,2,3,3,2,4};
private static final int[] BIN_col8 = new int[]{1,2,2,2,2,2,3};
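//transform task types covered by this test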
public enum TransformType {
RECODE,
DUMMY,
RECODE_DUMMY,
BIN,
BIN_DUMMY,
IMPUTE,
OMIT,
HASH,
HASH_RECODE,
}
@Override
public void setUp() {
TestUtils.clearAssertionInformation();
addTestConfiguration(TEST_NAME1, new TestConfiguration(TEST_CLASS_DIR, TEST_NAME1, new String[] { "y" }) );
}
@Test
public void testHomesRecodeIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.RECODE, false);
}
@Test
public void testHomesRecodeIDsSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.RECODE, false);
}
@Test
public void testHomesRecodeIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.RECODE, false);
}
@Test
public void testHomesDummycodeIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.DUMMY, false);
}
@Test
public void testHomesDummycodeIDsSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.DUMMY, false);
}
@Test
public void testHomesDummycodeIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.DUMMY, false);
}
@Test
public void testHomesRecodeDummycodeIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.RECODE_DUMMY, false);
}
@Test
public void testHomesRecodeDummycodeIDsSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.RECODE_DUMMY, false);
}
@Test
public void testHomesRecodeDummycodeIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.RECODE_DUMMY, false);
}
@Test
public void testHomesBinningIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.BIN, false);
}
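//TODO fix distributed binning (Spark test disabled)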
// @Test
// public void testHomesBinningIDsSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.BIN, false);
// }
@Test
public void testHomesBinningIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.BIN, false);
}
@Test
public void testHomesBinningDummyIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.BIN_DUMMY, false);
}
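//TODO fix distributed binning (Spark test disabled)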
// @Test
// public void testHomesBinningDummyIDsSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.BIN_DUMMY, false);
// }
@Test
public void testHomesBinningDummyIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.BIN_DUMMY, false);
}
@Test
public void testHomesOmitIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.OMIT, false);
}
@Test
public void testHomesOmitIDsSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.OMIT, false);
}
@Test
public void testHomesOmitIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.OMIT, false);
}
@Test
public void testHomesImputeIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.IMPUTE, false);
}
@Test
public void testHomesImputeIDsSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.IMPUTE, false);
}
@Test
public void testHomesImputeIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.IMPUTE, false);
}
@Test
public void testHomesRecodeColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.RECODE, true);
}
@Test
public void testHomesRecodeColnamesSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.RECODE, true);
}
@Test
public void testHomesRecodeColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.RECODE, true);
}
@Test
public void testHomesDummycodeColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.DUMMY, true);
}
@Test
public void testHomesDummycodeColnamesSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.DUMMY, true);
}
@Test
public void testHomesDummycodeColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.DUMMY, true);
}
@Test
public void testHomesRecodeDummycodeColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.RECODE_DUMMY, true);
}
@Test
public void testHomesRecodeDummycodeColnamesSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.RECODE_DUMMY, true);
}
@Test
public void testHomesRecodeDummycodeColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.RECODE_DUMMY, true);
}
@Test
public void testHomesBinningColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.BIN, true);
}
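//TODO fix distributed binning (Spark test disabled)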
// @Test
// public void testHomesBinningColnamesSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.BIN, true);
// }
@Test
public void testHomesBinningColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.BIN, true);
}
@Test
public void testHomesBinningDummyColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.BIN_DUMMY, true);
}
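//TODO fix distributed binning (Spark test disabled)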
// @Test
// public void testHomesBinningDummyColnamesSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.BIN_DUMMY, true);
// }
@Test
public void testHomesBinningDummyColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.BIN_DUMMY, true);
}
@Test
public void testHomesOmitColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.OMIT, true);
}
@Test
public void testHomesOmitColnamesSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.OMIT, true);
}
@Test
public void testHomesOmitColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.OMIT, true);
}
@Test
public void testHomesImputeColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.IMPUTE, true);
}
@Test
public void testHomesImputeColnamesSparkCSV() {
runTransformTest(ExecMode.SPARK, "csv", TransformType.IMPUTE, true);
}
@Test
public void testHomesImputeColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.IMPUTE, true);
}
@Test
public void testHomesHashColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.HASH, true);
}
//TODO fix Spark implementation of feature hashing (w/o recode)
// @Test
// public void testHomesHashColnamesSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.HASH, true);
// }
@Test
public void testHomesHashColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.HASH, true);
}
@Test
public void testHomesHashIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.HASH, false);
}
//TODO fix Spark implementation of feature hashing (w/o recode)
// @Test
// public void testHomesHashIDsSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.HASH, false);
// }
@Test
public void testHomesHashIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.HASH, false);
}
@Test
public void testHomesHashRecodeColnamesSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.HASH_RECODE, true);
}
//TODO fix Spark implementation of feature hashing (w/o recode)
// @Test
// public void testHomesHashRecodeColnamesSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.HASH_RECODE, true);
// }
@Test
public void testHomesHashRecodeColnamesHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.HASH_RECODE, true);
}
@Test
public void testHomesHashRecodeIDsSingleNodeCSV() {
runTransformTest(ExecMode.SINGLE_NODE, "csv", TransformType.HASH_RECODE, false);
}
//TODO fix Spark implementation of feature hashing (w/o recode)
// @Test
// public void testHomesHashRecodeIDsSparkCSV() {
// runTransformTest(ExecMode.SPARK, "csv", TransformType.HASH_RECODE, false);
// }
@Test
public void testHomesHashRecodeIDsHybridCSV() {
runTransformTest(ExecMode.HYBRID, "csv", TransformType.HASH_RECODE, false);
}
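/**
 * Executes the TransformFrameEncodeApply DML script for the given
 * execution mode, output format, transform type, and spec variant
 * (column IDs vs. column names), then compares the encode and apply
 * outputs cell by cell. For HYBRID runs it additionally asserts that
 * no Spark instructions were executed, and for the binning-based
 * transforms it validates the expected bin assignments.
 */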
private void runTransformTest( ExecMode rt, String ofmt, TransformType type, boolean colnames )
{
boolean sparkConfigOld = DMLScript.USE_LOCAL_SPARK_CONFIG;
if( rt == ExecMode.SPARK || rt == ExecMode.HYBRID )
DMLScript.USE_LOCAL_SPARK_CONFIG = true;
ExecMode rtold = rtplatform;
rtplatform = rt;
//set transform specification
String SPEC = null; String DATASET = null;
switch( type ) {
case RECODE: SPEC = colnames?SPEC1b:SPEC1; DATASET = DATASET1; break;
case DUMMY: SPEC = colnames?SPEC2b:SPEC2; DATASET = DATASET1; break;
case BIN: SPEC = colnames?SPEC3b:SPEC3; DATASET = DATASET1; break;
case IMPUTE: SPEC = colnames?SPEC4b:SPEC4; DATASET = DATASET2; break;
case OMIT: SPEC = colnames?SPEC5b:SPEC5; DATASET = DATASET2; break;
case RECODE_DUMMY: SPEC = colnames?SPEC6b:SPEC6; DATASET = DATASET1; break;
case BIN_DUMMY: SPEC = colnames?SPEC7b:SPEC7; DATASET = DATASET1; break;
case HASH: SPEC = colnames?SPEC8b:SPEC8; DATASET = DATASET1; break;
case HASH_RECODE: SPEC = colnames?SPEC9b:SPEC9; DATASET = DATASET1; break;
}
if( !ofmt.equals("csv") )
throw new RuntimeException("Unsupported test output format");
try
{
getAndLoadTestConfiguration(TEST_NAME1);
String HOME = SCRIPT_DIR + TEST_DIR;
fullDMLScriptName = HOME + TEST_NAME1 + ".dml";
programArgs = new String[]{"-nvargs",
"DATA=" + HOME + "input/" + DATASET,
"TFSPEC=" + HOME + "input/" + SPEC,
"TFDATA1=" + output("tfout1"),
"TFDATA2=" + output("tfout2"),
"OFMT=" + ofmt };
runTest(true, false, null, -1);
//read input/output and compare
double[][] R1 = DataConverter.convertToDoubleMatrix(MatrixReaderFactory
.createMatrixReader(FileFormat.CSV)
.readMatrixFromHDFS(output("tfout1"), -1L, -1L, 1000, -1));
double[][] R2 = DataConverter.convertToDoubleMatrix(MatrixReaderFactory
.createMatrixReader(FileFormat.CSV)
.readMatrixFromHDFS(output("tfout2"), -1L, -1L, 1000, -1));
TestUtils.compareMatrices(R1, R2, R1.length, R1[0].length, 0);
if( rt == ExecMode.HYBRID ) {
Assert.assertEquals("Wrong number of executed Spark instructions: " +
Statistics.getNoOfExecutedSPInst(), new Long(0), new Long(Statistics.getNoOfExecutedSPInst()));
}
//additional checks for binning, since an encode-decode round trip is not possible
//TODO fix distributed binning as well
if( type == TransformType.BIN ) {
for(int i=0; i<7; i++) {
Assert.assertEquals(BIN_col3[i], R1[i][2], 1e-8);
Assert.assertEquals(BIN_col8[i], R1[i][7], 1e-8);
}
}
else if( type == TransformType.BIN_DUMMY ) {
Assert.assertEquals(14, R1[0].length);
for(int i=0; i<7; i++) {
for(int j=0; j<4; j++) { //check dummy coded
Assert.assertEquals((j==BIN_col3[i]-1)?
1:0, R1[i][2+j], 1e-8);
}
for(int j=0; j<3; j++) { //check dummy coded
Assert.assertEquals((j==BIN_col8[i]-1)?
1:0, R1[i][10+j], 1e-8);
}
}
}
}
catch(Exception ex) {
throw new RuntimeException(ex);
}
finally {
rtplatform = rtold;
DMLScript.USE_LOCAL_SPARK_CONFIG = sparkConfigOld;
}
}
}