// blob: 7021e7dde45875e373060c38ae4121dd8ecff14a [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed with
* this work for additional information regarding copyright ownership.
* The ASF licenses this file to You under the Apache License, Version 2.0
* (the "License"); you may not use this file except in compliance with
* the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
package org.apache.ignite.examples.ml.util;
/**
 * Catalogue of the well-known datasets referenced by the examples.
 * <p>
 * Every constant carries three pieces of information: the path of the dataset file, a flag telling
 * whether the file has a header, and the separator placed between the words of a line.
 */
public enum MLSandboxDatasets {
    /** Movielens dataset with ratings. */
    MOVIELENS("examples/src/main/resources/datasets/ratings.csv", true, ","),

    /** The full Iris dataset from Machine Learning Repository. */
    IRIS("examples/src/main/resources/datasets/iris.txt", false, "\t"),

    /** The Titanic dataset from Kaggle competition. */
    TITANIC("examples/src/main/resources/datasets/titanic.csv", true, ";"),

    /** The 1st and 2nd classes from the Iris dataset. */
    TWO_CLASSED_IRIS("examples/src/main/resources/datasets/two_classed_iris.csv", false, "\t"),

    /**
     * Properties of different computers, based on
     * https://archive.ics.uci.edu/ml/datasets/Computer+Hardware.
     */
    CLEARED_MACHINES("examples/src/main/resources/datasets/cleared_machines.csv", false, ";"),

    /**
     * Health data relating the death rate to doctor availability, hospital availability,
     * annual per capita income, and population density (people per square mile).
     */
    MORTALITY_DATA("examples/src/main/resources/datasets/mortalitydata.csv", false, ";"),

    /**
     * The preprocessed Glass dataset from the Machine Learning Repository
     * https://archive.ics.uci.edu/ml/datasets/Glass+Identification
     * <p>
     * There are 3 classes with labels: 1 {building_windows_float_processed},
     * 3 {vehicle_windows_float_processed}, 7 {headlamps}.
     * Feature names: 'Na-Sodium', 'Mg-Magnesium', 'Al-Aluminum', 'Ba-Barium', 'Fe-Iron'.
     */
    GLASS_IDENTIFICATION("examples/src/main/resources/datasets/glass_identification.csv", false, ";"),

    /**
     * The Wine recognition data. Could be found
     * <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/wine/">here</a>.
     */
    WINE_RECOGNITION("examples/src/main/resources/datasets/wine.txt", false, ","),

    /**
     * The Boston house-prices dataset. Could be found
     * <a href="https://archive.ics.uci.edu/ml/machine-learning-databases/housing/">here</a>.
     */
    BOSTON_HOUSE_PRICES("examples/src/main/resources/datasets/boston_housing_dataset.txt", false, ","),

    /** Example from the book: Barber D. Bayesian reasoning and machine learning. Chapter 10. */
    ENGLISH_VS_SCOTTISH("examples/src/main/resources/datasets/english_vs_scottish_binary_dataset.csv", true, ","),

    /**
     * Wholesale customers dataset. Could be found
     * <a href="https://archive.ics.uci.edu/ml/datasets/Wholesale+customers">here</a>.
     */
    WHOLESALE_CUSTOMERS("examples/src/main/resources/datasets/wholesale_customers.csv", true, ","),

    /**
     * Fraud detection problem [part of whole dataset]. Could be found
     * <a href="https://www.kaggle.com/mlg-ulb/creditcardfraud/">here</a>.
     */
    FRAUD_DETECTION("examples/src/main/resources/datasets/fraud_detection.csv", false, ","),

    /** A dataset with discrete and continuous features. */
    MIXED_DATASET("examples/src/main/resources/datasets/mixed_dataset.csv", true, ","),

    /** A dataset with categorical features and labels. */
    MUSHROOMS("examples/src/main/resources/datasets/mushrooms.csv", true, ",");

    /** Path of the dataset file, exactly as declared on the constant. */
    private final String path;

    /** Whether the dataset file has a header. */
    private final boolean headerPresent;

    /** Separator placed between the words of a line. */
    private final String delimiter;

    /**
     * Binds a constant to its file and to the details needed to split that file's lines.
     *
     * @param path Filename.
     * @param headerPresent {@code true} if the file has a header.
     * @param delimiter The special sign used to separate a line into words.
     */
    MLSandboxDatasets(String path, boolean headerPresent, String delimiter) {
        this.path = path;
        this.headerPresent = headerPresent;
        this.delimiter = delimiter;
    }

    /**
     * Gets the filename of this dataset.
     *
     * @return The filename passed to the constant's declaration.
     */
    public String getFileName() {
        return path;
    }

    /**
     * Tells whether the dataset file has a header.
     *
     * @return {@code true} if the file has a header, {@code false} otherwise.
     */
    public boolean hasHeader() {
        return headerPresent;
    }

    /**
     * Gets the separator of this dataset.
     *
     * @return The sign used to separate a line into words.
     */
    public String getSeparator() {
        return delimiter;
    }
}