| package org.apache.samoa.instances; |
| |
| /* |
| * #%L |
| * SAMOA |
| * %% |
| * Copyright (C) 2014 - 2015 Apache Software Foundation |
| * %% |
| * Licensed under the Apache License, Version 2.0 (the "License"); |
| * you may not use this file except in compliance with the License. |
| * You may obtain a copy of the License at |
| * |
| * http://www.apache.org/licenses/LICENSE-2.0 |
| * |
| * Unless required by applicable law or agreed to in writing, software |
| * distributed under the License is distributed on an "AS IS" BASIS, |
| * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. |
| * See the License for the specific language governing permissions and |
| * limitations under the License. |
| * #L% |
| */ |
| |
| import java.io.BufferedReader; |
| import java.io.IOException; |
| import java.io.Reader; |
| import java.io.Serializable; |
| import java.io.StreamTokenizer; |
| import java.util.ArrayList; |
| import java.util.List; |
| import java.util.logging.Level; |
| import java.util.logging.Logger; |
| |
| /** |
| * @author abifet |
| */ |
| public class ArffLoader implements Serializable { |
| |
| protected InstanceInformation instanceInformation; |
| |
| transient protected StreamTokenizer streamTokenizer; |
| |
| protected Reader reader; |
| |
| protected int size; |
| |
| protected int classAttribute; |
| |
| public ArffLoader() { |
| } |
| |
| public ArffLoader(Reader reader, int size, int classAttribute) { |
| this.reader = reader; |
| this.size = size; |
| this.classAttribute = classAttribute; |
| initStreamTokenizer(reader); |
| } |
| |
| public InstanceInformation getStructure() { |
| return this.instanceInformation; |
| } |
| |
| public Instance readInstance(Reader reader) { |
| if (streamTokenizer == null) { |
| initStreamTokenizer(reader); |
| } |
| while (streamTokenizer.ttype == StreamTokenizer.TT_EOL) { |
| try { |
| streamTokenizer.nextToken(); |
| } catch (IOException ex) { |
| Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); |
| } |
| } |
| if (streamTokenizer.ttype == '{') { |
| return readInstanceSparse(); |
| // return readDenseInstanceSparse(); |
| } else { |
| return readInstanceDense(); |
| } |
| |
| } |
| |
| public Instance readInstanceDense() { |
| Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1); |
| // System.out.println(this.instanceInformation.numAttributes()); |
| int numAttribute = 0; |
| try { |
| while (numAttribute == 0 && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| // For each line |
| while (streamTokenizer.ttype != StreamTokenizer.TT_EOL |
| && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| // For each item |
| if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { |
| // System.out.println(streamTokenizer.nval + "Num "); |
| this.setValue(instance, numAttribute, streamTokenizer.nval, true); |
| //numAttribute++; |
| |
| } else if (streamTokenizer.sval != null && ( |
| streamTokenizer.ttype == StreamTokenizer.TT_WORD |
| || streamTokenizer.ttype == 34 || streamTokenizer.ttype == 39)) { |
| // System.out.println(streamTokenizer.sval + "Str"); |
| boolean isNumeric = attributes.get(numAttribute).isNumeric(); |
| double value; |
| if ("?".equals(streamTokenizer.sval)) { |
| value = Double.NaN; // Utils.missingValue(); |
| } else if (isNumeric == true) { |
| value = Double.valueOf(streamTokenizer.sval).doubleValue(); |
| } else { |
| value = this.instanceInformation.attribute(numAttribute).indexOfValue( |
| streamTokenizer.sval); |
| } |
| |
| this.setValue(instance, numAttribute, value, isNumeric); |
| //numAttribute++; |
| } |
| numAttribute++; |
| streamTokenizer.nextToken(); |
| } |
| streamTokenizer.nextToken(); |
| // System.out.println("EOL"); |
| } |
| |
| } catch (IOException ex) { |
| Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); |
| } |
| //System.out.println(instance); |
| return (numAttribute > 0) ? instance : null; |
| } |
| |
| private void setValue(Instance instance, int numAttribute, double value, boolean isNumber) { |
| double valueAttribute; |
| if (this.instanceInformation.attribute(numAttribute).isNominal) { |
| valueAttribute = value; |
| //this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); |
| // System.out.println(value +"/"+valueAttribute+" "); |
| |
| } else { |
| valueAttribute = value; |
| // System.out.println(value +"/"+valueAttribute+" "); |
| } |
| if (this.instanceInformation.classIndex() == numAttribute) { |
| instance.setClassValue(valueAttribute); |
| // System.out.println(value |
| // +"<"+this.instanceInformation.classIndex()+">"); |
| } else { |
| instance.setValue(numAttribute, valueAttribute); |
| } |
| } |
| |
| private Instance readInstanceSparse() { |
| // Return a Sparse Instance |
| Instance instance = new SparseInstance(1.0, null); // (this.instanceInformation.numAttributes() |
| // + 1); |
| // System.out.println(this.instanceInformation.numAttributes()); |
| int numAttribute; |
| ArrayList<Double> attributeValues = new ArrayList<Double>(); |
| List<Integer> indexValues = new ArrayList<Integer>(); |
| try { |
| // while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| streamTokenizer.nextToken(); // Remove the '{' char |
| // For each line |
| while (streamTokenizer.ttype != StreamTokenizer.TT_EOL |
| && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| while (streamTokenizer.ttype != '}') { |
| // For each item |
| // streamTokenizer.nextToken(); |
| // while (streamTokenizer.ttype != '}'){ |
| // System.out.println(streamTokenizer.nval +"-"+ |
| // streamTokenizer.sval); |
| // numAttribute = (int) streamTokenizer.nval; |
| if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { |
| numAttribute = (int) streamTokenizer.nval; |
| } else { |
| numAttribute = Integer.parseInt(streamTokenizer.sval); |
| } |
| streamTokenizer.nextToken(); |
| |
| if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { |
| // System.out.print(streamTokenizer.nval + " "); |
| this.setSparseValue(instance, indexValues, attributeValues, numAttribute, |
| streamTokenizer.nval, true); |
| // numAttribute++; |
| |
| } else if (streamTokenizer.sval != null && ( |
| streamTokenizer.ttype == StreamTokenizer.TT_WORD |
| || streamTokenizer.ttype == 34)) { |
| // System.out.print(streamTokenizer.sval + "-"); |
| if (attributes.get(numAttribute).isNumeric()) { |
| this.setSparseValue(instance, indexValues, attributeValues, numAttribute, |
| Double.valueOf(streamTokenizer.sval).doubleValue(), true); |
| } else { |
| this.setSparseValue(instance, indexValues, attributeValues, numAttribute, |
| this.instanceInformation |
| .attribute(numAttribute).indexOfValue(streamTokenizer.sval), |
| false); |
| } |
| } |
| streamTokenizer.nextToken(); |
| } |
| streamTokenizer.nextToken(); // Remove the '}' char |
| } |
| streamTokenizer.nextToken(); |
| // System.out.println("EOL"); |
| // } |
| |
| } catch (IOException ex) { |
| Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); |
| } |
| int[] arrayIndexValues = new int[attributeValues.size()]; |
| double[] arrayAttributeValues = new double[attributeValues.size()]; |
| for (int i = 0; i < arrayIndexValues.length; i++) { |
| arrayIndexValues[i] = indexValues.get(i).intValue(); |
| arrayAttributeValues[i] = attributeValues.get(i).doubleValue(); |
| } |
| instance.addSparseValues(arrayIndexValues, arrayAttributeValues, |
| this.instanceInformation.numAttributes()); |
| return instance; |
| |
| } |
| |
| private void setSparseValue(Instance instance, List<Integer> indexValues, |
| List<Double> attributeValues, |
| int numAttribute, double value, boolean isNumber) { |
| double valueAttribute; |
| if (isNumber && this.instanceInformation.attribute(numAttribute).isNominal) { |
| valueAttribute = |
| this.instanceInformation.attribute(numAttribute).indexOfValue(Double.toString(value)); |
| } else { |
| valueAttribute = value; |
| } |
| if (this.instanceInformation.classIndex() == numAttribute) { |
| instance.setClassValue(valueAttribute); |
| } else { |
| // instance.setValue(numAttribute, valueAttribute); |
| indexValues.add(numAttribute); |
| attributeValues.add(valueAttribute); |
| } |
| // System.out.println(numAttribute+":"+valueAttribute+","+this.instanceInformation.classIndex()+","+value); |
| } |
| |
| private Instance readDenseInstanceSparse() { |
| // Returns a dense instance |
| Instance instance = new DenseInstance(this.instanceInformation.numAttributes() + 1); |
| // System.out.println(this.instanceInformation.numAttributes()); |
| int numAttribute; |
| try { |
| // while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| streamTokenizer.nextToken(); // Remove the '{' char |
| // For each line |
| while (streamTokenizer.ttype != StreamTokenizer.TT_EOL |
| && streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| while (streamTokenizer.ttype != '}') { |
| // For each item |
| // streamTokenizer.nextToken(); |
| // while (streamTokenizer.ttype != '}'){ |
| // System.out.print(streamTokenizer.nval+":"); |
| numAttribute = (int) streamTokenizer.nval; |
| streamTokenizer.nextToken(); |
| |
| if (streamTokenizer.ttype == StreamTokenizer.TT_NUMBER) { |
| // System.out.print(streamTokenizer.nval + " "); |
| this.setValue(instance, numAttribute, streamTokenizer.nval, true); |
| // numAttribute++; |
| |
| } else if (streamTokenizer.sval != null && ( |
| streamTokenizer.ttype == StreamTokenizer.TT_WORD |
| || streamTokenizer.ttype == 34)) { |
| // System.out.print(streamTokenizer.sval + |
| // "/"+this.instanceInformation.attribute(numAttribute).indexOfValue(streamTokenizer.sval)+" "); |
| if (attributes.get(numAttribute).isNumeric()) { |
| this.setValue(instance, numAttribute, |
| Double.valueOf(streamTokenizer.sval).doubleValue(), true); |
| } else { |
| this.setValue(instance, numAttribute, |
| this.instanceInformation.attribute(numAttribute) |
| .indexOfValue(streamTokenizer.sval), false); |
| // numAttribute++; |
| } |
| } |
| streamTokenizer.nextToken(); |
| } |
| streamTokenizer.nextToken(); // Remove the '}' char |
| } |
| streamTokenizer.nextToken(); |
| // System.out.println("EOL"); |
| // } |
| |
| } catch (IOException ex) { |
| Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); |
| } |
| return instance; |
| } |
| |
| protected List<Attribute> attributes; |
| |
| private InstanceInformation getHeader() { |
| |
| String relation = "file stream"; |
| // System.out.println("RELATION " + relation); |
| attributes = new ArrayList<Attribute>(); |
| try { |
| streamTokenizer.nextToken(); |
| while (streamTokenizer.ttype != StreamTokenizer.TT_EOF) { |
| // For each line |
| // if (streamTokenizer.ttype == '@') { |
| if (streamTokenizer.ttype == StreamTokenizer.TT_WORD |
| && streamTokenizer.sval.startsWith("@") == true) { |
| // streamTokenizer.nextToken(); |
| String token = streamTokenizer.sval.toUpperCase(); |
| if (token.startsWith("@RELATION")) { |
| streamTokenizer.nextToken(); |
| relation = streamTokenizer.sval; |
| // System.out.println("RELATION " + relation); |
| } else if (token.startsWith("@ATTRIBUTE")) { |
| streamTokenizer.nextToken(); |
| String name = streamTokenizer.sval; |
| // System.out.println("* " + name); |
| if (name == null) { |
| name = Double.toString(streamTokenizer.nval); |
| } |
| streamTokenizer.nextToken(); |
| String type = streamTokenizer.sval; |
| // System.out.println("* " + name + ":" + type + " "); |
| if (streamTokenizer.ttype == '{') { |
| parseDoubleBrackests(name); |
| } else if (streamTokenizer.ttype == 10) {//for the buggy non-formal input arff file |
| streamTokenizer.nextToken(); |
| if (streamTokenizer.ttype == '{') { |
| parseDoubleBrackests(name); |
| } |
| } else { |
| // Add attribute |
| attributes.add(new Attribute(name)); |
| } |
| |
| } else if (token.startsWith("@DATA")) { |
| // System.out.print("END"); |
| streamTokenizer.nextToken(); |
| break; |
| } |
| } |
| streamTokenizer.nextToken(); |
| } |
| |
| } catch (IOException ex) { |
| Logger.getLogger(ArffLoader.class.getName()).log(Level.SEVERE, null, ex); |
| } |
| return new InstanceInformation(relation, attributes); |
| } |
| |
| private void parseDoubleBrackests(String name) throws IOException { |
| |
| streamTokenizer.nextToken(); |
| List<String> attributeLabels = new ArrayList<String>(); |
| while (streamTokenizer.ttype != '}') { |
| |
| if (streamTokenizer.sval != null) { |
| attributeLabels.add(streamTokenizer.sval); |
| // System.out.print(streamTokenizer.sval + ","); |
| } else { |
| attributeLabels.add(Double.toString(streamTokenizer.nval)); |
| // System.out.print(streamTokenizer.nval + ","); |
| } |
| |
| streamTokenizer.nextToken(); |
| } |
| // System.out.println(); |
| attributes.add(new Attribute(name, attributeLabels)); |
| |
| } |
| |
| private void initStreamTokenizer(Reader reader) { |
| BufferedReader br = new BufferedReader(reader); |
| |
| // Init streamTokenizer |
| streamTokenizer = new StreamTokenizer(br); |
| |
| streamTokenizer.resetSyntax(); |
| streamTokenizer.whitespaceChars(0, ' '); |
| streamTokenizer.wordChars(' ' + 1, '\u00FF'); |
| streamTokenizer.whitespaceChars(',', ','); |
| streamTokenizer.commentChar('%'); |
| streamTokenizer.quoteChar('"'); |
| streamTokenizer.quoteChar('\''); |
| streamTokenizer.ordinaryChar('{'); |
| streamTokenizer.ordinaryChar('}'); |
| streamTokenizer.eolIsSignificant(true); |
| |
| this.instanceInformation = this.getHeader(); |
| if (classAttribute < 0) { |
| this.instanceInformation.setClassIndex(this.instanceInformation.numAttributes() - 1); |
| // System.out.print(this.instanceInformation.classIndex()); |
| } else if (classAttribute > 0) { |
| this.instanceInformation.setClassIndex(classAttribute - 1); |
| } |
| } |
| } |