// blob: 8df657b128eaf37642326326025be1b5e82edb65 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
*/
package org.apache.samoa.instances;
import java.io.BufferedReader;
import java.io.IOException;
import java.io.Reader;
import java.io.StreamTokenizer;
import java.util.ArrayList;
import java.util.List;
import java.util.logging.Level;
import java.util.logging.Logger;
/**
 * Loads instances from a character stream in ARFF format.
 *
 * <p>The header ({@code @relation}, {@code @attribute}, {@code @data}) is parsed when the loader
 * is constructed. Data rows are then read one at a time and may use either the dense notation
 * ({@code v1,v2,...}) or the sparse notation ({@code {index value, index value, ...}}).
 *
 * <p>I/O errors are logged at {@link Level#SEVERE} and are not propagated to the caller.
 */
public class ArffLoader implements Loader {

    /**
     * The instance information built from the ARFF header.
     */
    protected InstanceInformation instanceInformation;

    /**
     * Not read or written by this class.
     */
    protected InstancesHeader streamHeader;

    /**
     * The stream tokenizer over the ARFF input.
     */
    protected transient StreamTokenizer streamTokenizer;

    /**
     * Range passed at construction (output attributes of multi-label data), or {@code null}.
     */
    protected Range range;

    /**
     * The attributes declared in the header, in declaration order.
     */
    protected List<Attribute> auxAttributes;

    /**
     * Instantiates a new arff loader and selects the class attribute.
     *
     * @param reader the reader to load the ARFF data from
     * @param size not used
     * @param classAttribute negative to use the last attribute as class, positive for the
     *        1-based position of the class attribute, {@code 0} to leave the class index as
     *        created by the header parser
     */
    public ArffLoader(Reader reader, int size, int classAttribute) {
        // size is not used
        this(reader);
        if (classAttribute < 0) {
            this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1);
        } else if (classAttribute > 0) {
            this.instanceInformation.setClassIndex(classAttribute - 1);
        }
    }

    /**
     * Instantiates a new arff loader without an output range.
     *
     * @param reader the reader to load the ARFF data from
     */
    public ArffLoader(Reader reader) {
        this(reader, null);
    }

    /**
     * Instantiates a new arff loader: configures the tokenizer and parses the header.
     *
     * @param reader the reader to load the ARFF data from
     * @param range the range handed to {@code setRangeOutputIndices} once the header has been
     *        read; may be {@code null}
     */
    public ArffLoader(Reader reader, Range range) {
        this.range = range;
        BufferedReader br = new BufferedReader(reader);

        // Tokenizer syntax: everything above ' ' is a word character, commas separate values,
        // '%' starts a comment, both quote styles are supported and braces are reported as
        // single-character tokens. Line ends are significant because one line is one instance.
        streamTokenizer = new StreamTokenizer(br);
        streamTokenizer.resetSyntax();
        streamTokenizer.whitespaceChars(0, ' ');
        streamTokenizer.wordChars(' ' + 1, '\u00FF');
        streamTokenizer.whitespaceChars(',', ',');
        streamTokenizer.commentChar('%');
        streamTokenizer.quoteChar('"');
        streamTokenizer.quoteChar('\'');
        streamTokenizer.ordinaryChar('{');
        streamTokenizer.ordinaryChar('}');
        streamTokenizer.eolIsSignificant(true);

        this.instanceInformation = this.getHeader();
        if (range != null) { //is MultiLabel
            this.instanceInformation.setRangeOutputIndices(range);
        }
    }

    /**
     * Gets the structure.
     *
     * @return the instance information built from the header
     */
    public InstanceInformation getStructure() {
        return this.instanceInformation;
    }

    /**
     * Reads the next instance. It detects if it is dense or sparse.
     *
     * @return the instance, or {@code null} if the input could not be read or a dense row could
     *         not be found before the end of the input
     */
    public Instance readInstance() {
        return readNextInstance();
    }

    /**
     * Reads the next instance. It detects if it is dense or sparse.
     *
     * @param reader ignored; instances are always read from the reader given at construction
     * @return the instance, or {@code null} if the input could not be read or a dense row could
     *         not be found before the end of the input
     */
    public Instance readInstance(Reader reader) {
        return readNextInstance();
    }

    /**
     * Skips pending end-of-line tokens and dispatches on the first token of the row.
     */
    private Instance readNextInstance() {
        while (streamTokenizer.ttype == StreamTokenizer.TT_EOL) {
            try {
                streamTokenizer.nextToken();
            } catch (IOException ex) {
                Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
                // A failed read leaves ttype at TT_EOL; returning here avoids retrying forever.
                return null;
            }
        }
        if (streamTokenizer.ttype == '{') {
            return readInstanceSparse();
        }
        return readInstanceDense();
    }

    /**
     * Reads a dense instance from the file. Empty lines are skipped. The value {@code ?} is
     * stored as {@code Double.NaN}; values of numeric attributes are parsed as doubles and all
     * other values are stored as the index returned by the attribute's {@code indexOfValue}.
     *
     * @return the instance, or {@code null} if no value was read before the end of the input
     */
    public Instance readInstanceDense() {
        Instance instance = newDenseInstance(this.instanceInformation.numAttributes());
        int numAttribute = 0;
        try {
            // Keep consuming lines until one of them yields at least one value.
            while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                while (!atLineEnd()) {
                    if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                        // Only reachable if the tokenizer has been reconfigured to parse numbers.
                        instance.setValue(numAttribute, streamTokenizer.nval);
                        ++numAttribute;
                    } else if (isStringToken()) {
                        Attribute attribute = this.auxAttributes.get(numAttribute);
                        String text = streamTokenizer.sval;
                        double value;
                        if ("?".equals(text)) {
                            value = Double.NaN; // missing value
                        } else if (attribute.isNumeric()) {
                            value = Double.parseDouble(text);
                        } else {
                            value = attribute.indexOfValue(text);
                        }
                        instance.setValue(numAttribute, value);
                        ++numAttribute;
                    }
                    streamTokenizer.nextToken();
                }
                streamTokenizer.nextToken(); // step over the end of the line
            }
        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        return (numAttribute > 0) ? instance : null;
    }

    /**
     * Stores a value in an instance, going through {@code setClassValue} when the attribute is
     * the class attribute.
     *
     * @param instance the instance to modify
     * @param numAttribute the attribute index
     * @param value the value to store
     * @param isNumber not used; the value is stored unchanged in either case
     */
    protected void setValue(Instance instance, int numAttribute, double value, boolean isNumber) {
        if (this.instanceInformation.classIndex() == numAttribute) {
            setClassValue(instance, value);
        } else {
            instance.setValue(numAttribute, value);
        }
    }

    /**
     * Reads a sparse instance. Must be called while the current token is the opening brace.
     *
     * @return the instance holding the values read from the row
     */
    private Instance readInstanceSparse() {
        Instance instance = newSparseInstance(1.0);
        List<Integer> indexValues = new ArrayList<Integer>();
        List<Double> attributeValues = new ArrayList<Double>();
        try {
            readSparseEntries(indexValues, attributeValues);
        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        int[] arrayIndexValues = new int[attributeValues.size()];
        double[] arrayAttributeValues = new double[attributeValues.size()];
        for (int i = 0; i < arrayIndexValues.length; i++) {
            arrayIndexValues[i] = indexValues.get(i).intValue();
            arrayAttributeValues[i] = attributeValues.get(i).doubleValue();
        }
        instance.addSparseValues(arrayIndexValues, arrayAttributeValues,
                this.instanceInformation.numAttributes());
        return instance;
    }

    /**
     * Parses the {@code index value} pairs of a sparse row into the two lists and leaves the
     * tokenizer on the first token after the row's end of line. Must be called while the current
     * token is the opening brace.
     *
     * <p>Parsing of the pairs stops at the closing brace, at the end of the line or at the end
     * of the input, so a row without a closing brace cannot run into the next row. The value
     * {@code ?} is recorded as {@code Double.NaN}, as in the dense reader.
     *
     * @param indexValues receives the attribute indices, in reading order
     * @param attributeValues receives the matching values
     * @throws IOException if the tokenizer fails to read; pairs read so far stay in the lists
     * @throws NumberFormatException if an attribute index or a numeric value cannot be parsed
     */
    private void readSparseEntries(List<Integer> indexValues, List<Double> attributeValues)
            throws IOException {
        streamTokenizer.nextToken(); // Remove the '{' char
        while (!atLineEnd()) {
            while (!atLineEnd() && streamTokenizer.ttype != '}') {
                // With the syntax set up in the constructor the index arrives as a word, so it
                // must be parsed from sval; nval is only meaningful for TT_NUMBER tokens.
                int numAttribute;
                if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                    numAttribute = (int) streamTokenizer.nval;
                } else {
                    numAttribute = Integer.parseInt(streamTokenizer.sval);
                }
                streamTokenizer.nextToken();
                if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) {
                    setSparseValue(indexValues, attributeValues, numAttribute,
                            streamTokenizer.nval, true);
                } else if (isStringToken()) {
                    Attribute attribute = this.auxAttributes.get(numAttribute);
                    String text = streamTokenizer.sval;
                    if ("?".equals(text)) {
                        setSparseValue(indexValues, attributeValues, numAttribute,
                                Double.NaN, false); // missing value
                    } else if (attribute.isNumeric()) {
                        setSparseValue(indexValues, attributeValues, numAttribute,
                                Double.parseDouble(text), true);
                    } else {
                        setSparseValue(indexValues, attributeValues, numAttribute,
                                attribute.indexOfValue(text), false);
                    }
                }
                // Do not step over a structural token; the loop conditions need to see it.
                if (!atLineEnd() && streamTokenizer.ttype != '}') {
                    streamTokenizer.nextToken();
                }
            }
            if (streamTokenizer.ttype == '}') {
                streamTokenizer.nextToken(); //Remove the '}' char
            }
        }
        streamTokenizer.nextToken(); // step over the end of the line
    }

    /**
     * Appends one sparse entry to the index and value lists. When the value was read as a number
     * and the attribute's {@code isNominal} flag is set, the value is replaced by the index the
     * attribute reports for its {@code Double.toString} form.
     */
    private void setSparseValue(List<Integer> indexValues, List<Double> attributeValues,
            int numAttribute, double value, boolean isNumber) {
        double valueAttribute = value;
        if (isNumber && this.auxAttributes.get(numAttribute).isNominal) {
            valueAttribute = this.auxAttributes.get(numAttribute)
                    .indexOfValue(Double.toString(value));
        }
        indexValues.add(numAttribute);
        attributeValues.add(valueAttribute);
    }

    /**
     * Reads an instance sparse and returns a dense one. Not called by this class at present.
     * Must be called while the current token is the opening brace.
     *
     * @return the instance
     */
    private Instance readDenseInstanceSparse() {
        Instance instance = newDenseInstance(this.instanceInformation.numAttributes());
        List<Integer> indexValues = new ArrayList<Integer>();
        List<Double> attributeValues = new ArrayList<Double>();
        try {
            readSparseEntries(indexValues, attributeValues);
        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        for (int i = 0; i < indexValues.size(); i++) {
            instance.setValue(indexValues.get(i).intValue(), attributeValues.get(i).doubleValue());
        }
        return instance;
    }

    /**
     * Parses the ARFF header up to and including the {@code @data} keyword, filling
     * {@link #auxAttributes}. Keywords are matched case-insensitively. When a range was given,
     * its upper bound is set to the number of attributes read.
     *
     * @return the instance information for the relation; the relation name defaults to
     *         {@code "file stream"} when the header does not provide one
     */
    private InstanceInformation getHeader() {
        String relation = "file stream";
        auxAttributes = new ArrayList<Attribute>();
        try {
            streamTokenizer.nextToken();
            while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
                if (streamTokenizer.ttype == StreamTokenizer.TT_WORD
                        && streamTokenizer.sval.startsWith("@")) {
                    // Locale.ROOT: the default locale may map 'i' to a dotted capital I
                    // (Turkish), which would make "@attribute" and "@relation" unrecognisable.
                    String token = streamTokenizer.sval.toUpperCase(java.util.Locale.ROOT);
                    if (token.startsWith("@RELATION")) {
                        streamTokenizer.nextToken();
                        if (streamTokenizer.sval != null) {
                            relation = streamTokenizer.sval;
                        }
                    } else if (token.startsWith("@ATTRIBUTE")) {
                        auxAttributes.add(readAttributeDeclaration());
                    } else if (token.startsWith("@DATA")) {
                        streamTokenizer.nextToken();
                        break;
                    }
                }
                streamTokenizer.nextToken();
            }
            if (range != null) {
                this.range.setUpper(auxAttributes.size());
            }
        } catch (IOException ex) {
            Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex);
        }
        return new InstanceInformation(relation, auxAttributes);
    }

    /**
     * Reads the name and type that follow an {@code @attribute} keyword. A type that starts
     * with an opening brace produces an attribute with the listed labels; any other type
     * produces an attribute created from the name alone.
     *
     * @return the declared attribute
     * @throws IOException if the tokenizer fails to read
     */
    private Attribute readAttributeDeclaration() throws IOException {
        streamTokenizer.nextToken();
        String name = streamTokenizer.sval;
        if (name == null) {
            name = Double.toString(streamTokenizer.nval);
        }
        streamTokenizer.nextToken();
        if (streamTokenizer.ttype != '{') {
            return new Attribute(name);
        }
        streamTokenizer.nextToken();
        List<String> attributeLabels = new ArrayList<String>();
        // Also stop at the end of the input so that a missing '}' cannot loop forever.
        while (streamTokenizer.ttype != '}' && streamTokenizer.ttype != StreamTokenizer.TT_EOF) {
            if (streamTokenizer.sval != null) {
                attributeLabels.add(streamTokenizer.sval);
            } else if (streamTokenizer.ttype != StreamTokenizer.TT_EOL) {
                // A line break inside the braces is not a label and is skipped.
                attributeLabels.add(Double.toString(streamTokenizer.nval));
            }
            streamTokenizer.nextToken();
        }
        return new Attribute(name, attributeLabels);
    }

    /**
     * Tells whether the current token ends the current line or the whole input.
     */
    private boolean atLineEnd() {
        return streamTokenizer.ttype == StreamTokenizer.TT_EOL
                || streamTokenizer.ttype == StreamTokenizer.TT_EOF;
    }

    /**
     * Tells whether the current token is a word or a quoted string with text available.
     */
    private boolean isStringToken() {
        int type = streamTokenizer.ttype;
        return streamTokenizer.sval != null
                && (type == StreamTokenizer.TT_WORD || type == '"' || type == '\'');
    }

    /**
     * Creates a sparse instance from the two given arguments.
     *
     * @param d first argument of the {@code SparseInstance} constructor
     * @param res second argument of the {@code SparseInstance} constructor
     * @return the new instance
     */
    protected Instance newSparseInstance(double d, double[] res) {
        return new SparseInstance(d, res);
    }

    /**
     * Creates a sparse instance from the given argument.
     *
     * @param d argument of the {@code SparseInstance} constructor
     * @return the new instance
     */
    protected Instance newSparseInstance(double d) {
        return new SparseInstance(d);
    }

    /**
     * Creates a dense instance.
     *
     * @param numberAttributes argument of the {@code DenseInstance} constructor
     * @return the new instance
     */
    protected Instance newDenseInstance(int numberAttributes) {
        return new DenseInstance(numberAttributes);
    }

    /**
     * Stores a value at the class index of the instance.
     */
    private void setClassValue(Instance instance, double valueAttribute) {
        instance.setValue(this.instanceInformation.classIndex(), valueAttribute);
    }
}