blob: 3d314f0650fef2961515b6380c22b7d03d7b068f [file] [log] [blame]
package org.apache.samoa.instances;
/*
* #%L
* SAMOA
* %%
* Copyright (C) 2014 - 2015 Apache Software Foundation
* %%
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
* #L%
*/
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.Serializable;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
* @author abifet
*/
public class ArffLoader implements Serializable {
protected InstanceInformation instanceInformation;
transient protected StreamTokenizer streamTokenizer;
protected Reader reader;
protected int size;
protected int classAttribute;
public ArffLoader() {
}
public ArffLoader(Reader reader, int size, int classAttribute) {
this.reader = reader;
this.size = size;
this.classAttribute = classAttribute;
initStreamTokenizer(reader);
}
public InstanceInformation getStructure() {
return this.instanceInformation;
}
public Instance readInstance(Reader reader) {
if (streamTokenizer == null) {
initStreamTokenizer(reader);
}
while (streamTokenizer.ttype == StreamTokenizer.TT_EOL) {
try {
streamTokenizer.nextToken();
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
}
if (streamTokenizer.ttype == '{') {
return readInstanceSparse();
// return readDenseInstanceSparse();
} else {
return readInstanceDense();
}
}
public Instance readInstanceDense() {
Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1);
// System.out.println(this.instanceInformation.numAttributes());
int numAttribute = 0;
try {
while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
// For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
&& streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
// For each item
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
// System.out.println(streamTokenizer.nval + "Num ");
this.setValue(instance, numAttribute, streamTokenizer.nval, true);
//numAttribute++;
} else if (streamTokenizer.sval != null && (
streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34 || streamTokenizer.ttype == 39)) {
// System.out.println(streamTokenizer.sval + "Str");
boolean isNumeric = attributes.get(numAttribute).isNumeric();
double value;
if ("?".equals(streamTokenizer.sval)) {
value = Double.NaN; // Utils.missingValue();
} else if (isNumeric == true) {
value = Double.valueOf(streamTokenizer.sval).doubleValue();
} else {
value = this.instanceInformation.attribute(numAttribute).indexOfValue(
streamTokenizer.sval);
}
this.setValue(instance, numAttribute, value, isNumeric);
//numAttribute++;
}
numAttribute++;
streamTokenizer.nextToken();
}
streamTokenizer.nextToken();
// System.out.println("EOL");
}
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
//System.out.println(instance);
return (numAttribute > 0) ? instance : null;
}
private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
double valueAttribute;
if (this.instanceInformation.attribute(numAttribute).isNominal) {
valueAttribute = value;
//this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
// System.out.println(value +"/"+valueAttribute+" ");
} else {
valueAttribute = value;
// System.out.println(value +"/"+valueAttribute+" ");
}
if (this.instanceInformation.classIndex() == numAttribute) {
instance.setClassValue(valueAttribute);
// System.out.println(value
// +"<"+this.instanceInformation.classIndex()+">");
} else {
instance.setValue(numAttribute, valueAttribute);
}
}
private Instance readInstanceSparse() {
// Return a Sparse Instance
Instance instance = new SparseInstance(1.0, null); // (this.instanceInformation.numAttributes()
// + 1);
// System.out.println(this.instanceInformation.numAttributes());
int numAttribute;
ArrayList<Double> attributeValues = new ArrayList<Double>();
List<Integer> indexValues = new ArrayList<Integer>();
try {
// while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
streamTokenizer.nextToken(); // Remove the '{' char
// For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
&& streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
while (streamTokenizer.ttype != '}') {
// For each item
// streamTokenizer.nextToken();
// while (streamTokenizer.ttype != '}'){
// System.out.println(streamTokenizer.nval +"-"+
// streamTokenizer.sval);
// numAttribute = (int) streamTokenizer.nval;
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
numAttribute = (int) streamTokenizer.nval;
} else {
numAttribute = Integer.parseInt(streamTokenizer.sval);
}
streamTokenizer.nextToken();
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
// System.out.print(streamTokenizer.nval + " ");
this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
streamTokenizer.nval, true);
// numAttribute++;
} else if (streamTokenizer.sval != null && (
streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
// System.out.print(streamTokenizer.sval + "-");
if (attributes.get(numAttribute).isNumeric()) {
this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
Double.valueOf(streamTokenizer.sval).doubleValue(), true);
} else {
this.setSparseValue(instance, indexValues, attributeValues, numAttribute,
this.instanceInformation
.attribute(numAttribute).indexOfValue(streamTokenizer.sval),
false);
}
}
streamTokenizer.nextToken();
}
streamTokenizer.nextToken(); // Remove the '}' char
}
streamTokenizer.nextToken();
// System.out.println("EOL");
// }
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
int[] arrayIndexValues = new int[attributeValues.size()];
double[] arrayAttributeValues = new double[attributeValues.size()];
for (int i = 0; i < arrayIndexValues.length; i++) {
arrayIndexValues[i] = indexValues.get(i).intValue();
arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
}
instance.addSparseValues(arrayIndexValues, arrayAttributeValues,
this.instanceInformation.numAttributes());
return instance;
}
private void setSparseValue(Instance instance, List<Integer> indexValues,
List<Double> attributeValues,
int numAttribute, double value, boolean isNumber) {
double valueAttribute;
if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) {
valueAttribute =
this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value));
} else {
valueAttribute = value;
}
if (this.instanceInformation.classIndex() == numAttribute) {
instance.setClassValue(valueAttribute);
} else {
// instance.setValue(numAttribute, valueAttribute);
indexValues.add(numAttribute);
attributeValues.add(valueAttribute);
}
// System.out.println(numAttribute+":"+valueAttribute+","+this.instanceInformation.classIndex()+","+value);
}
private Instance readDenseInstanceSparse() {
// Returns a dense instance
Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1);
// System.out.println(this.instanceInformation.numAttributes());
int numAttribute;
try {
// while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
streamTokenizer.nextToken(); // Remove the '{' char
// For each line
while (streamTokenizer.ttype != StreamTokenizer.TT_EOL
&& streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
while (streamTokenizer.ttype != '}') {
// For each item
// streamTokenizer.nextToken();
// while (streamTokenizer.ttype != '}'){
// System.out.print(streamTokenizer.nval+":");
numAttribute = (int) streamTokenizer.nval;
streamTokenizer.nextToken();
if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
// System.out.print(streamTokenizer.nval + " ");
this.setValue(instance, numAttribute, streamTokenizer.nval, true);
// numAttribute++;
} else if (streamTokenizer.sval != null && (
streamTokenizer.ttype == StreamTokenizer.TT_WORD
|| streamTokenizer.ttype == 34)) {
// System.out.print(streamTokenizer.sval +
// "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" ");
if (attributes.get(numAttribute).isNumeric()) {
this.setValue(instance, numAttribute,
Double.valueOf(streamTokenizer.sval).doubleValue(), true);
} else {
this.setValue(instance, numAttribute,
this.instanceInformation.attribute(numAttribute)
.indexOfValue(streamTokenizer.sval), false);
// numAttribute++;
}
}
streamTokenizer.nextToken();
}
streamTokenizer.nextToken(); // Remove the '}' char
}
streamTokenizer.nextToken();
// System.out.println("EOL");
// }
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
return instance;
}
protected List<Attribute> attributes;
private InstanceInformation getHeader() {
String relation = "file stream";
// System.out.println("RELATION " + relation);
attributes = new ArrayList<Attribute>();
try {
streamTokenizer.nextToken();
while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
// For each line
// if (streamTokenizer.ttype == '@') {
if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
&& streamTokenizer.sval.startsWith("@") == true) {
// streamTokenizer.nextToken();
String token = streamTokenizer.sval.toUpperCase();
if (token.startsWith("@RELATION")) {
streamTokenizer.nextToken();
relation = streamTokenizer.sval;
// System.out.println("RELATION " + relation);
} else if (token.startsWith("@ATTRIBUTE")) {
streamTokenizer.nextToken();
String name = streamTokenizer.sval;
// System.out.println("* " + name);
if (name == null) {
name = Double.toString(streamTokenizer.nval);
}
streamTokenizer.nextToken();
String type = streamTokenizer.sval;
// System.out.println("* " + name + ":" + type + " ");
if (streamTokenizer.ttype == '{') {
parseDoubleBrackests(name);
} else if (streamTokenizer.ttype == 10) {//for the buggy non-formal input arff file
streamTokenizer.nextToken();
if (streamTokenizer.ttype == '{') {
parseDoubleBrackests(name);
}
} else {
// Add attribute
attributes.add(new Attribute(name));
}
} else if (token.startsWith("@DATA")) {
// System.out.print("END");
streamTokenizer.nextToken();
break;
}
}
streamTokenizer.nextToken();
}
} catch (IOException ex) {
Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
}
return new InstanceInformation(relation, attributes);
}
private void parseDoubleBrackests(String name) throws IOException {
streamTokenizer.nextToken();
List<String> attributeLabels = new ArrayList<String>();
while (streamTokenizer.ttype != '}') {
if (streamTokenizer.sval != null) {
attributeLabels.add(streamTokenizer.sval);
// System.out.print(streamTokenizer.sval + ",");
} else {
attributeLabels.add(Double.toString(streamTokenizer.nval));
// System.out.print(streamTokenizer.nval + ",");
}
streamTokenizer.nextToken();
}
// System.out.println();
attributes.add(new Attribute(name, attributeLabels));
}
private void initStreamTokenizer(Reader reader) {
BufferedReader br = new BufferedReader(reader);
// Init streamTokenizer
streamTokenizer = new StreamTokenizer(br);
streamTokenizer.resetSyntax();
streamTokenizer.whitespaceChars(0, ' ');
streamTokenizer.wordChars(' ' + 1, '\u00FF');
streamTokenizer.whitespaceChars(',', ',');
streamTokenizer.commentChar('%');
streamTokenizer.quoteChar('"');
streamTokenizer.quoteChar('\'');
streamTokenizer.ordinaryChar('{');
streamTokenizer.ordinaryChar('}');
streamTokenizer.eolIsSignificant(true);
this.instanceInformation = this.getHeader();
if (classAttribute < 0) {
this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
// System.out.print(this.instanceInformation.classIndex());
} else if (classAttribute > 0) {
this.instanceInformation.setClassIndex(classAttribute - 1);
}
}
}