SAMOA-26: Fix the ArffLoader bug (asp188)
Fix #24
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
index feb5702..dc22bb8 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/ArffLoader.java
@@ -19,6 +19,7 @@
  * limitations under the License.
  * #L%
  */
+
 import java.io.BufferedReader;
 import java.io.IOException;
 import java.io.Reader;
@@ -30,7 +31,6 @@
 import java.util.logging.Logger;
 
 /**
- * 
  * @author abifet
  */
 public class ArffLoader implements Serializable {
@@ -87,15 +87,16 @@
       while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
         // For each line
         while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
-            && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+               && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
           // For each item
           if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
             // System.out.println(streamTokenizer.nval + "Num ");
             this.setValue(instance, numAttribute, streamTokenizer.nval, true);
-            numAttribute++;
+            //numAttribute++;
 
-          } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
-              || streamTokenizer.ttype == 34)) {
+          } else if (streamTokenizer.sval != null && (
+              streamTokenizer.ttype == StreamTokenizer.TT_WORD
+              || streamTokenizer.ttype == 34 || streamTokenizer.ttype == 39)) {
             // System.out.println(streamTokenizer.sval + "Str");
             boolean isNumeric = attributes.get(numAttribute).isNumeric();
             double value;
@@ -104,12 +105,14 @@
             } else if (isNumeric == true) {
               value = Double.valueOf(streamTokenizer.sval).doubleValue();
             } else {
-              value = this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval);
+              value = this.instanceInformation.attribute(numAttribute).indexOfValue(
+                  streamTokenizer.sval);
             }
 
             this.setValue(instance, numAttribute, value, isNumeric);
-            numAttribute++;
+            //numAttribute++;
           }
+          numAttribute++;
           streamTokenizer.nextToken();
         }
         streamTokenizer.nextToken();
@@ -119,13 +122,15 @@
     } catch (IOException ex) {
       Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
     }
+    //System.out.println(instance);
     return (numAttribute > 0) ? instance : null;
   }
 
   private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
     double valueAttribute;
-    if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
-      valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
+    if (this.instanceInformation.attribute(numAttribute).isNominal) {
+      valueAttribute = value;
+      //this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
       // System.out.println(value +"/"+valueAttribute+" ");
 
     } else {
@@ -144,7 +149,7 @@
   private Instance readInstanceSparse() {
     // Return a Sparse Instance
     Instance instance = new SparseInstance(1.0, null); // (this.instanceInformation.numAttributes()
-                                                       // + 1);
+    // + 1);
     // System.out.println(this.instanceInformation.numAttributes());
     int numAttribute;
     ArrayList<Double> attributeValues = new ArrayList<Double>();
@@ -154,7 +159,7 @@
       streamTokenizer.nextToken(); // Remove the '{' char
       // For each line
       while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
-          && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+             && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
         while (streamTokenizer.ttype != '}') {
           // For each item
           // streamTokenizer.nextToken();
@@ -171,18 +176,22 @@
 
           if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
             // System.out.print(streamTokenizer.nval + " ");
-            this.setSparseValue(instance, indexValues, attributeValues, numAttribute, streamTokenizer.nval, true);
+            this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
+                                streamTokenizer.nval, true);
             // numAttribute++;
 
-          } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
+          } else if (streamTokenizer.sval != null && (
+              streamTokenizer.ttype == StreamTokenizer.TT_WORD
               || streamTokenizer.ttype == 34)) {
             // System.out.print(streamTokenizer.sval + "-");
             if (attributes.get(numAttribute).isNumeric()) {
               this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
-                  Double.valueOf(streamTokenizer.sval).doubleValue(), true);
+                                  Double.valueOf(streamTokenizer.sval).doubleValue(), true);
             } else {
-              this.setSparseValue(instance, indexValues, attributeValues, numAttribute, this.instanceInformation
-                  .attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
+              this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
+                                  this.instanceInformation
+                                      .attribute(numAttribute).indexOfValue(streamTokenizer.sval),
+                                  false);
             }
           }
           streamTokenizer.nextToken();
@@ -202,16 +211,19 @@
       arrayIndexValues[i] = indexValues.get(i).intValue();
       arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
     }
-    instance.addSparseValues(arrayIndexValues, arrayAttributeValues, this.instanceInformation.numAttributes());
+    instance.addSparseValues(arrayIndexValues, arrayAttributeValues,
+                             this.instanceInformation.numAttributes());
     return instance;
 
   }
 
-  private void setSparseValue(Instance instance, List<Integer> indexValues, List<Double> attributeValues,
-      int numAttribute, double value, boolean isNumber) {
+  private void setSparseValue(Instance instance, List<Integer> indexValues,
+                              List<Double> attributeValues,
+                              int numAttribute, double value, boolean isNumber) {
     double valueAttribute;
     if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
-      valueAttribute = this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
+      valueAttribute =
+          this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
     } else {
       valueAttribute = value;
     }
@@ -235,7 +247,7 @@
       streamTokenizer.nextToken(); // Remove the '{' char
       // For each line
       while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
-          && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+             && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
         while (streamTokenizer.ttype != '}') {
           // For each item
           // streamTokenizer.nextToken();
@@ -249,15 +261,18 @@
             this.setValue(instance, numAttribute, streamTokenizer.nval, true);
             // numAttribute++;
 
-          } else if (streamTokenizer.sval != null && (streamTokenizer.ttype == StreamTokenizer.TT_WORD
+          } else if (streamTokenizer.sval != null && (
+              streamTokenizer.ttype == StreamTokenizer.TT_WORD
               || streamTokenizer.ttype == 34)) {
             // System.out.print(streamTokenizer.sval +
             // "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" ");
             if (attributes.get(numAttribute).isNumeric()) {
-              this.setValue(instance, numAttribute, Double.valueOf(streamTokenizer.sval).doubleValue(), true);
+              this.setValue(instance, numAttribute,
+                            Double.valueOf(streamTokenizer.sval).doubleValue(), true);
             } else {
               this.setValue(instance, numAttribute,
-                  this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval), false);
+                            this.instanceInformation.attribute(numAttribute)
+                                .indexOfValue(streamTokenizer.sval), false);
               // numAttribute++;
             }
           }
@@ -287,7 +302,8 @@
       while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
         // For each line
         // if (streamTokenizer.ttype == '@') {
-        if (streamTokenizer.ttype == StreamTokenizer.TT_WORD && streamTokenizer.sval.startsWith("@") == true) {
+        if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
+            && streamTokenizer.sval.startsWith("@") == true) {
           // streamTokenizer.nextToken();
           String token = streamTokenizer.sval.toUpperCase();
           if (token.startsWith("@RELATION")) {
@@ -305,22 +321,12 @@
             String type = streamTokenizer.sval;
             // System.out.println("* " + name + ":" + type + " ");
             if (streamTokenizer.ttype == '{') {
+              parseDoubleBrackests(name);
+            } else if (streamTokenizer.ttype == 10) {//for the buggy non-formal input arff file
               streamTokenizer.nextToken();
-              List<String> attributeLabels = new ArrayList<String>();
-              while (streamTokenizer.ttype != '}') {
-
-                if (streamTokenizer.sval != null) {
-                  attributeLabels.add(streamTokenizer.sval);
-                  // System.out.print(streamTokenizer.sval + ",");
-                } else {
-                  attributeLabels.add(Double.toString(streamTokenizer.nval));
-                  // System.out.print(streamTokenizer.nval + ",");
-                }
-
-                streamTokenizer.nextToken();
+              if (streamTokenizer.ttype == '{') {
+                parseDoubleBrackests(name);
               }
-              // System.out.println();
-              attributes.add(new Attribute(name, attributeLabels));
             } else {
               // Add attribute
               attributes.add(new Attribute(name));
@@ -341,6 +347,27 @@
     return new InstanceInformation(relation, attributes);
   }
 
+  private void parseDoubleBrackests(String name) throws IOException {
+    // Reads a brace-delimited label list: steps past the current token, gathers up to '}'.
+    streamTokenizer.nextToken();
+    List<String> attributeLabels = new ArrayList<String>();
+    // Stop at end of input too: nextToken() keeps returning TT_EOF, so a missing '}' would spin.
+    while (streamTokenizer.ttype != '}' && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
+      if (streamTokenizer.sval != null) {
+        // Word or quoted-string token: its text is the label.
+        attributeLabels.add(streamTokenizer.sval);
+      } else {
+        // No string payload: store the numeric value as text (e.g. 1 becomes "1.0").
+        attributeLabels.add(Double.toString(streamTokenizer.nval));
+      }
+
+      streamTokenizer.nextToken();
+    }
+    // Register the attribute together with the labels collected so far.
+    attributes.add(new Attribute(name, attributeLabels));
+
+  }
+
   private void initStreamTokenizer(Reader reader) {
     BufferedReader br = new BufferedReader(reader);
 
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java
index 8609d6e..6ebd678 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/Attribute.java
@@ -32,37 +32,38 @@
 import java.util.Map;
 
 /**
- * 
  * @author abifet
  */
 public class Attribute implements Serializable {
 
   public static final String ARFF_ATTRIBUTE = "@attribute";
   public static final String ARFF_ATTRIBUTE_NUMERIC = "NUMERIC";
+  public static final String ARFF_ATTRIBUTE_NOMINAL = "NOMINAL";
+  public static final String ARFF_ATTRIBUTE_DATE = "DATE";
 
   /**
-     *
-     */
+   *
+   */
   protected boolean isNominal;
   /**
-     *
-     */
+   *
+   */
   protected boolean isNumeric;
   /**
-     *
-     */
+   *
+   */
   protected boolean isDate;
   /**
-     *
-     */
+   *
+   */
   protected String name;
   /**
-     *
-     */
+   *
+   */
   protected List<String> attributeValues;
 
   /**
-   * 
+   *
    * @return
    */
   public List<String> getAttributeValues() {
@@ -70,12 +71,12 @@
   }
 
   /**
-     *
-     */
+   *
+   */
   protected int index;
 
   /**
-   * 
+   *
    * @param string
    */
   public Attribute(String string) {
@@ -84,7 +85,7 @@
   }
 
   /**
-   * 
+   *
    * @param attributeName
    * @param attributeValues
    */
@@ -95,14 +96,14 @@
   }
 
   /**
-     *
-     */
+   *
+   */
   public Attribute() {
     this("");
   }
 
   /**
-   * 
+   *
    * @return
    */
   public boolean isNominal() {
@@ -110,7 +111,7 @@
   }
 
   /**
-   * 
+   *
    * @return
    */
   public String name() {
@@ -118,7 +119,7 @@
   }
 
   /**
-   * 
+   *
    * @param value
    * @return
    */
@@ -127,7 +128,7 @@
   }
 
   /**
-   * 
+   *
    * @return
    */
   public boolean isNumeric() {
@@ -135,20 +136,19 @@
   }
 
   /**
-   * 
+   *
    * @return
    */
   public int numValues() {
     if (isNumeric()) {
       return 0;
-    }
-    else {
+    } else {
       return attributeValues.size();
     }
   }
 
   /**
-   * 
+   *
    * @return
    */
   public int index() { // RuleClassifier
@@ -167,7 +167,7 @@
   private Map<String, Integer> valuesStringAttribute;
 
   /**
-   * 
+   *
    * @param value
    * @return
    */
@@ -198,7 +198,13 @@
 
     text.append(ARFF_ATTRIBUTE).append(" ").append(Utils.quote(this.name)).append(" ");
 
-    text.append(ARFF_ATTRIBUTE_NUMERIC);
+    if (isNominal) {
+      text.append(ARFF_ATTRIBUTE_NOMINAL);
+    } else if (isNumeric) {
+      text.append(ARFF_ATTRIBUTE_NUMERIC);
+    } else if (isDate) {
+      text.append(ARFF_ATTRIBUTE_DATE);
+    }
 
     return text.toString();
   }
diff --git a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java
index 984675e..57d1bfd 100644
--- a/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java
+++ b/samoa-instances/src/main/java/com/yahoo/labs/samoa/instances/DenseInstance.java
@@ -25,7 +25,6 @@
  */
 
 /**
- * 
  * @author abifet
  */
 public class DenseInstance extends SingleLabelInstance {
@@ -62,9 +61,10 @@
   public String toString() {
     StringBuffer text = new StringBuffer();
 
-    for (int i = 0; i < this.instanceInformation.numAttributes(); i++) {
-      if (i > 0)
+    for (int i = 0; i < this.instanceData.numAttributes(); i++) {
+      if (i > 0) {
         text.append(",");
+      }
       text.append(this.value(i));
     }
     text.append(",").append(this.weight());
diff --git a/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java
new file mode 100644
index 0000000..62fd7b7
--- /dev/null
+++ b/samoa-instances/src/test/java/com/yahoo/labs/samoa/instances/ArffLoaderTest.java
@@ -0,0 +1,108 @@
+package com.yahoo.labs.samoa.instances;
+
+/*
+ * #%L
+ * SAMOA
+ * %%
+ * Copyright (C) 2014 - 2015 Apache Software Foundation
+ * %%
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ * 
+ *      http://www.apache.org/licenses/LICENSE-2.0
+ * 
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ * #L%
+ */
+
+import org.junit.Before;
+import org.junit.Test;
+
+import java.io.StringReader;
+
+import static org.junit.Assert.assertEquals;
+
+public class ArffLoaderTest {
+  private ArffLoader loader;
+  private StringReader reader;
+
+  /** Builds an in-memory ARFF document (ten attributes, two data rows) and a loader over it. */
+  @Before
+  public void setUp() {
+    String inputString = "@relation test.txt\n"
+                         + "\n"
+                         + "@attribute Dur numeric\n"
+                         + "@attribute Proto {udp,tcp,icmp,arp,ipx/spx,ipv6-icmp,pim,esp,igmp,rtcp,rtp,ipv6,udt}\n"
+                         + "@attribute Dir {' <->',' <?>',' ->',' ?>',' who',' <-',' <?'}\n"
+                         + "@attribute State {CON,PA_PA,PA_FRA, ...}\n"
+                         + "@attribute sTos numeric\n"
+                         + "@attribute dTos numeric\n"
+                         + "@attribute TotPkts numeric\n"
+                         + "@attribute TotBytes numeric\n"
+                         + "@attribute SrcBytes numeric\n"
+                         + "@attribute class {Background,Normal,Botnet}\n"
+                         + "\n"
+                         + "@data\n"
+                         + "\n"
+                         + "1065.731934,udp,' <->',...,0,0,2,252,145,Background\n"
+                         + "1471.787109,udp,' <->',CON,0,0,2,252,145,Background";
+    reader = new StringReader(inputString);
+    int size = 0;
+    // NOTE(review): 10 here vs. classIndex() == 9 in testGetHeader - presumably 1-based; confirm.
+    int classAttribute = 10;
+    loader = new ArffLoader(reader, size, classAttribute);
+  }
+
+  /** Verifies attribute count, class index, numeric flags and labels read from the header. */
+  @Test
+  public void testGetHeader() {
+    InstanceInformation header = loader.getStructure();
+    assertEquals(10, header.numAttributes());
+    assertEquals(9, header.classIndex());
+    assertEquals(true, header.attribute(0).isNumeric());
+    assertEquals(false, header.attribute(1).isNumeric());
+    assertEquals(false, header.attribute(2).isNumeric());
+    assertEquals(false, header.attribute(3).isNumeric());
+    assertEquals(true, header.attribute(4).isNumeric());
+    assertEquals(true, header.attribute(5).isNumeric());
+    assertEquals(true, header.attribute(6).isNumeric());
+    assertEquals(true, header.attribute(7).isNumeric());
+    assertEquals(true, header.attribute(8).isNumeric());
+    assertEquals(false, header.attribute(9).isNumeric());
+    // Labels of the single-quoted "Dir" attribute; each is expected to keep its leading space.
+    assertEquals(7, header.attribute(2).numValues());
+    assertEquals(" <->", header.attribute(2).value(0));
+    assertEquals(" <?>", header.attribute(2).value(1));
+    assertEquals(" ->", header.attribute(2).value(2));
+    assertEquals(" ?>", header.attribute(2).value(3));
+    assertEquals(" who", header.attribute(2).value(4));
+    assertEquals(" <-", header.attribute(2).value(5));
+    assertEquals(" <?", header.attribute(2).value(6));
+    // Labels of the "class" attribute, in declaration order.
+    assertEquals(3, header.attribute(9).numValues());
+    assertEquals("Background", header.attribute(9).value(0));
+    assertEquals("Normal", header.attribute(9).value(1));
+    assertEquals("Botnet", header.attribute(9).value(2));
+  }
+
+  /** Verifies the first data row; nominal fields are compared to (presumably) label indices. */
+  @Test
+  public void testReadInstance() {
+    Instance instance = loader.readInstance(reader);
+    assertEquals(1065.731934, instance.value(0), 0);
+    assertEquals(0, instance.value(1), 0);
+    assertEquals(0, instance.value(2), 0);
+    assertEquals(3, instance.value(3), 0);
+    assertEquals(0, instance.value(4), 0);
+    assertEquals(0, instance.value(5), 0);
+    assertEquals(2, instance.value(6), 0);
+    assertEquals(252, instance.value(7), 0);
+    assertEquals(145, instance.value(8), 0);
+    assertEquals(0, instance.value(9), 0);
+  }
+}
\ No newline at end of file