SAMOA-26: Fix ArffLoader parsing of nominal and quoted attribute values (asp188)
Fix #24
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
index feb5702..dc22bb8 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
@@ -19,6 +19,7 @@
* limitations under the License.
* #L%
*/
+
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
@@ -30,7 +31,6 @@
import java.util.logging.Logger;
/**
- *
* @author abifet
*/
public class ArffLoader implements Serializable {
@@ -87,15 +87,16 @@
while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
// For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
- && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+ && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
// For each item
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
// System.out.println(streamTokenizer.nval + "Num ");
this.setValue(instance, numAttribute, streamTokenizer.nval, true);
- numAttribute++;
+ //numAttribute++;
- } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
- || streamTokenizer.ttype == 34)) {
+ } else if (streamTokenizer.sval != null && (
+ streamTokenizer.ttype == StreamTokenizer.TT_WORD
+ || streamTokenizer.ttype == 34 || streamTokenizer.ttype == 39)) {
// System.out.println(streamTokenizer.sval + "Str");
boolean isNumeric = attributes.get(numAttribute).isNumeric();
double value;
@@ -104,12 +105,14 @@
} else if (isNumeric == true) {
value = Double.valueOf(streamTokenizer.sval).doubleValue();
} else {
- value = this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval);
+ value = this.instanceInformation.attribute(numAttribute).indexOfValue(
+ streamTokenizer.sval);
}
this.setValue(instance, numAttribute, value, isNumeric);
- numAttribute++;
+ //numAttribute++;
}
+ numAttribute++;
streamTokenizer.nextToken();
}
streamTokenizer.nextToken();
@@ -119,13 +122,15 @@
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
+ //System.out.println(instance);
return (numAttribute > 0) ? instance : null;
}
private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
double valueAttribute;
- if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
- valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
+ if (this.instanceInformation.attribute(numAttribute).isNominal) {
+ valueAttribute = value;
+ //this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
// System.out.println(value +"/"+valueAttribute+" ");
} else {
@@ -144,7 +149,7 @@
private Instance readInstanceSparse() {
// Return a Sparse Instance
Instance instance = new SparseInstance(1.0, null); // (this.instanceInformation.numAttributes()
- // + 1);
+ // + 1);
// System.out.println(this.instanceInformation.numAttributes());
int numAttribute;
ArrayList<Double> attributeValues = new ArrayList<Double>();
@@ -154,7 +159,7 @@
streamTokenizer.nextToken(); // Remove the '{' char
// For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
- && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+ && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
while (streamTokenizer.ttype != '}') {
// For each item
// streamTokenizer.nextToken();
@@ -171,18 +176,22 @@
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
// System.out.print(streamTokenizer.nval + " ");
- this.setSparseValue(instance, indexValues, attributeValues, numAttribute, streamTokenizer.nval, true);
+ this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
+ streamTokenizer.nval, true);
// numAttribute++;
- } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
+ } else if (streamTokenizer.sval != null && (
+ streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
// System.out.print(streamTokenizer.sval + "-");
if (attributes.get(numAttribute).isNumeric()) {
this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
- Double.valueOf(streamTokenizer.sval).doubleValue(), true);
+ Double.valueOf(streamTokenizer.sval).doubleValue(), true);
} else {
- this.setSparseValue(instance, indexValues, attributeValues, numAttribute, this.instanceInformation
- .attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
+ this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
+ this.instanceInformation
+ .attribute(numAttribute).indexOfValue(streamTokenizer.sval),
+ false);
}
}
streamTokenizer.nextToken();
@@ -202,16 +211,19 @@
arrayIndexValues[i] = indexValues.get(i).intValue();
arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
}
- instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes());
+ instance.addSparseValues(arrayIndexValues, arrayAttributeValues,
+ this.instanceInformation.numAttributes());
return instance;
}
- private void setSparseValue(Instance instance, List<Integer> indexValues, List<Double> attributeValues,
- int numAttribute, double value, boolean isNumber) {
+ private void setSparseValue(Instance instance, List<Integer> indexValues,
+ List<Double> attributeValues,
+ int numAttribute, double value, boolean isNumber) {
double valueAttribute;
if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
- valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
+ valueAttribute =
+ this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
} else {
valueAttribute = value;
}
@@ -235,7 +247,7 @@
streamTokenizer.nextToken(); // Remove the '{' char
// For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
- && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+ && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
while (streamTokenizer.ttype != '}') {
// For each item
// streamTokenizer.nextToken();
@@ -249,15 +261,18 @@
this.setValue(instance, numAttribute, streamTokenizer.nval, true);
// numAttribute++;
- } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
+ } else if (streamTokenizer.sval != null && (
+ streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
// System.out.print(streamTokenizer.sval +
// "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" ");
if (attributes.get(numAttribute).isNumeric()) {
- this.setValue(instance, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true);
+ this.setValue(instance, numAttribute,
+ Double.valueOf(streamTokenizer.sval).doubleValue(), true);
} else {
this.setValue(instance, numAttribute,
- this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
+ this.instanceInformation.attribute(numAttribute)
+ .indexOfValue(streamTokenizer.sval), false);
// numAttribute++;
}
}
@@ -287,7 +302,8 @@
while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
// For each line
// if (streamTokenizer.ttype == '@') {
- if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@") == true) {
+ if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
+ && streamTokenizer.sval.startsWith("@") == true) {
// streamTokenizer.nextToken();
String token = streamTokenizer.sval.toUpperCase();
if (token.startsWith("@RELATION")) {
@@ -305,22 +321,12 @@
String type = streamTokenizer.sval;
// System.out.println("* " + name + ":" + type + " ");
if (streamTokenizer.ttype == '{') {
+ parseDoubleBrackests(name);
+ } else if (streamTokenizer.ttype == 10) {//for the buggy non-formal input arff file
streamTokenizer.nextToken();
- List<String> attributeLabels = new ArrayList<String>();
- while (streamTokenizer.ttype != '}') {
-
- if (streamTokenizer.sval != null) {
- attributeLabels.add(streamTokenizer.sval);
- // System.out.print(streamTokenizer.sval + ",");
- } else {
- attributeLabels.add(Double.toString(streamTokenizer.nval));
- // System.out.print(streamTokenizer.nval + ",");
- }
-
- streamTokenizer.nextToken();
+ if (streamTokenizer.ttype == '{') {
+ parseDoubleBrackests(name);
}
- // System.out.println();
- attributes.add(new Attribute(name, attributeLabels));
} else {
// Add attribute
attributes.add(new Attribute(name));
@@ -341,6 +347,27 @@
return new InstanceInformation(relation, attributes);
}
+  /**
+   * Reads attribute labels up to the closing '}' and appends the resulting attribute to
+   * {@code attributes}. The loop also ends at end of input, so an unterminated label list
+   * cannot spin forever.
+   *
+   * @param name name given to the attribute built from the collected labels
+   * @throws IOException if the tokenizer cannot read the next token
+   */
+  private void parseDoubleBrackests(String name) throws IOException {
+    streamTokenizer.nextToken(); // callers invoke this while positioned on '{'
+    List<String> attributeLabels = new ArrayList<String>();
+    while (streamTokenizer.ttype != '}' && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+      if (streamTokenizer.sval != null) {
+        attributeLabels.add(streamTokenizer.sval);
+      } else {
+        attributeLabels.add(Double.toString(streamTokenizer.nval));
+      }
+      streamTokenizer.nextToken();
+    }
+    attributes.add(new Attribute(name, attributeLabels));
+
private void initStreamTokenizer(Reader reader) {
BufferedReader br = new BufferedReader(reader);
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java
index 8609d6e..6ebd678 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java
@@ -32,37 +32,38 @@
import java.util.Map;
/**
- *
* @author abifet
*/
public class Attribute implements Serializable {
public static final String ARFF_ATTRIBUTE = "@attribute";
public static final String ARFF_ATTRIBUTE_NUMERIC = "NUMERIC";
+ public static final String ARFF_ATTRIBUTE_NOMINAL = "NOMINAL";
+ public static final String ARFF_ATTRIBUTE_DATE = "DATE";
/**
- *
- */
+ *
+ */
protected boolean isNominal;
/**
- *
- */
+ *
+ */
protected boolean isNumeric;
/**
- *
- */
+ *
+ */
protected boolean isDate;
/**
- *
- */
+ *
+ */
protected String name;
/**
- *
- */
+ *
+ */
protected List<String> attributeValues;
/**
- *
+ *
* @return
*/
public List<String> getAttributeValues() {
@@ -70,12 +71,12 @@
}
/**
- *
- */
+ *
+ */
protected int index;
/**
- *
+ *
* @param string
*/
public Attribute(String string) {
@@ -84,7 +85,7 @@
}
/**
- *
+ *
* @param attributeName
* @param attributeValues
*/
@@ -95,14 +96,14 @@
}
/**
- *
- */
+ *
+ */
public Attribute() {
this("");
}
/**
- *
+ *
* @return
*/
public boolean isNominal() {
@@ -110,7 +111,7 @@
}
/**
- *
+ *
* @return
*/
public String name() {
@@ -118,7 +119,7 @@
}
/**
- *
+ *
* @param value
* @return
*/
@@ -127,7 +128,7 @@
}
/**
- *
+ *
* @return
*/
public boolean isNumeric() {
@@ -135,20 +136,19 @@
}
/**
- *
+ *
* @return
*/
public int numValues() {
if (isNumeric()) {
return 0;
- }
- else {
+ } else {
return attributeValues.size();
}
}
/**
- *
+ *
* @return
*/
public int index() { // RuleClassifier
@@ -167,7 +167,7 @@
private Map<String, Integer> valuesStringAttribute;
/**
- *
+ *
* @param value
* @return
*/
@@ -198,7 +198,13 @@
text.append(ARFF_ATTRIBUTE).append(" ").append(Utils.quote(this.name)).append(" ");
- text.append(ARFF_ATTRIBUTE_NUMERIC);
+ if (isNominal) {
+ text.append(ARFF_ATTRIBUTE_NOMINAL);
+ } else if (isNumeric) {
+ text.append(ARFF_ATTRIBUTE_NUMERIC);
+ } else if (isDate) {
+ text.append(ARFF_ATTRIBUTE_DATE);
+ }
return text.toString();
}
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java
index 984675e..57d1bfd 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java
@@ -25,7 +25,6 @@
*/
/**
- *
* @author abifet
*/
public class DenseInstance extends SingleLabelInstance {
@@ -62,9 +61,10 @@
public String toString() {
StringBuffer text = new StringBuffer();
- for (int i = 0; i < this.instanceInformation.numAttributes(); i++) {
- if (i > 0)
+ for (int i = 0; i < this.instanceData.numAttributes(); i++) {
+ if (i > 0) {
text.append(",");
+ }
text.append(this.value(i));
}
text.append(",").append(this.weight());
diff --git a/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java
new file mode 100644
index 0000000..62fd7b7
--- /dev/null
+++ b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java
@@ -0,0 +1,108 @@
+package com.yahoo.labs.samoa.instances;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2014 - 2015 Apache Software Foundation
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.StringReader;
+
+import static org.junit.Assert.assertEquals;
+
+public class ArffLoaderTest {
+ // Loader under test; rebuilt in setUp() from the in-memory ARFF text below.
+ private ArffLoader loader;
+ // Reader over that ARFF text; also handed to readInstance() in testReadInstance().
+ private StringReader reader;
+ // Builds an ARFF document with ten @attribute lines and two data rows, then creates the loader.
+ @Before
+ public void setUp() {
+ String inputString = "@relation test.txt\n"
+ + "\n"
+ + "@attribute Dur numeric\n"
+ + "@attribute Proto {udp,tcp,icmp,arp,ipx/spx,ipv6-icmp,pim,esp,igmp,rtcp,rtp,ipv6,udt}\n"
+ + "@attribute Dir {' <->',' <?>',' ->',' ?>',' who',' <-',' <?'}\n"
+ + "@attribute State {CON,PA_PA,PA_FRA, ...}\n"
+ + "@attribute sTos numeric\n"
+ + "@attribute dTos numeric\n"
+ + "@attribute TotPkts numeric\n"
+ + "@attribute TotBytes numeric\n"
+ + "@attribute SrcBytes numeric\n"
+ + "@attribute class {Background,Normal,Botnet}\n"
+ + "\n"
+ + "@data\n"
+ + "\n"
+ + "1065.731934,udp,' <->',...,0,0,2,252,145,Background\n"
+ + "1471.787109,udp,' <->',CON,0,0,2,252,145,Background";
+ reader = new StringReader(inputString);
+ int size = 0;
+ int classAttribute = 10; // testGetHeader() expects classIndex() == 9 (presumably 1-based here - confirm)
+ loader = new ArffLoader(reader, size, classAttribute);
+
+ }
+ // Checks attribute count, class index, per-attribute numeric flags, and labels of attributes 2 and 9.
+ @Test
+ public void testGetHeader() {
+ InstanceInformation header = loader.getStructure();
+ assertEquals(10, header.numAttributes());
+ assertEquals(9, header.classIndex());
+ assertEquals(true, header.attribute(0).isNumeric());
+ assertEquals(false, header.attribute(1).isNumeric());
+ assertEquals(false, header.attribute(2).isNumeric());
+ assertEquals(false, header.attribute(3).isNumeric());
+ assertEquals(true, header.attribute(4).isNumeric());
+ assertEquals(true, header.attribute(5).isNumeric());
+ assertEquals(true, header.attribute(6).isNumeric());
+ assertEquals(true, header.attribute(7).isNumeric());
+ assertEquals(true, header.attribute(8).isNumeric());
+ assertEquals(false, header.attribute(9).isNumeric());
+ // Attribute 2 ("Dir"): the single-quoted header labels are expected with their leading space kept.
+ assertEquals(7, header.attribute(2).numValues());
+ assertEquals(" <->", header.attribute(2).value(0));
+ assertEquals(" <?>", header.attribute(2).value(1));
+ assertEquals(" ->", header.attribute(2).value(2));
+ assertEquals(" ?>", header.attribute(2).value(3));
+ assertEquals(" who", header.attribute(2).value(4));
+ assertEquals(" <-", header.attribute(2).value(5));
+ assertEquals(" <?", header.attribute(2).value(6));
+ // Attribute 9 ("class"): three unquoted labels in declaration order.
+ assertEquals(3, header.attribute(9).numValues());
+ assertEquals("Background", header.attribute(9).value(0));
+ assertEquals("Normal", header.attribute(9).value(1));
+ assertEquals("Botnet", header.attribute(9).value(2));
+
+ }
+ // Checks the first data row; label columns are compared as numbers (presumably label indices - confirm).
+ @Test
+ public void testReadInstance() {
+ Instance instance = loader.readInstance(reader);
+ assertEquals(1065.731934, instance.value(0), 0);
+ assertEquals(0, instance.value(1), 0);
+ assertEquals(0, instance.value(2), 0);
+ assertEquals(3, instance.value(3), 0);
+ assertEquals(0, instance.value(4), 0);
+ assertEquals(0, instance.value(5), 0);
+ assertEquals(2, instance.value(6), 0);
+ assertEquals(252, instance.value(7), 0);
+ assertEquals(145, instance.value(8), 0);
+ assertEquals(0, instance.value(9), 0);
+ }
+}
\ No newline at end of file