blob: 7f0cb93e3bcba9f269a4a3d39154eaee490ba9d6 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one or more
// contributor license agreements. See the NOTICE file distributed with
// this work for additional information regarding copyright ownership.
// The ASF licenses this file to You under the Apache License, Version 2.0
// (the "License"); you may not use this file except in compliance with
// the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
= Pipelines API
Apache Ignite ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Pipelines API, where the pipeline concept is mostly inspired by the scikit-learn and Apache Spark projects.
* **Preprocessor Model** - This is an algorithm which can transform one DataSet into another DataSet.
* **Preprocessor Trainer** - This is an algorithm which can be fit on a DataSet to produce a Preprocessor Model.
* **Pipeline** - A Pipeline chains multiple Trainers and Preprocessors together to specify an ML workflow.
* **Parameter** - All ML Trainers and Preprocessor Trainers now share a common API for specifying parameters.
CAUTION: The Pipelines API is experimental and may change in future releases.
A Pipeline can replace a sequence of separate `.fit()` method calls, as shown in the following examples:
[tabs]
--
tab:Without Pipeline API[]
[source, java]
----
final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 3, 4, 5, 6, 8, 10).labeled(1);
TrainTestSplit<Integer, Vector> split = new TrainTestDatasetSplitter<Integer, Vector>()
.split(0.75);
Preprocessor<Integer, Vector> imputingPreprocessor = new ImputerTrainer<Integer, Vector>()
.fit(ignite,
dataCache,
vectorizer
);
Preprocessor<Integer, Vector> minMaxScalerPreprocessor = new MinMaxScalerTrainer<Integer, Vector>()
.fit(ignite,
dataCache,
imputingPreprocessor
);
Preprocessor<Integer, Vector> normalizationPreprocessor = new NormalizationTrainer<Integer, Vector>()
.withP(1)
.fit(ignite,
dataCache,
minMaxScalerPreprocessor
);
// Tune hyper-parameters with K-fold Cross-Validation on the split training set.
DecisionTreeClassificationTrainer trainerCV = new DecisionTreeClassificationTrainer();
CrossValidation<DecisionTreeNode, Integer, Vector> scoreCalculator = new CrossValidation<>();
ParamGrid paramGrid = new ParamGrid()
.addHyperParam("maxDeep", trainerCV::withMaxDeep, new Double[] {1.0, 2.0, 3.0, 4.0, 5.0, 10.0})
.addHyperParam("minImpurityDecrease", trainerCV::withMinImpurityDecrease, new Double[] {0.0, 0.25, 0.5});
scoreCalculator
.withIgnite(ignite)
.withUpstreamCache(dataCache)
.withTrainer(trainerCV)
.withMetric(MetricName.ACCURACY)
.withFilter(split.getTrainFilter())
.isRunningOnPipeline(false)
.withPreprocessor(normalizationPreprocessor)
.withAmountOfFolds(3)
.withParamGrid(paramGrid);
CrossValidationResult crossValidationRes = scoreCalculator.tuneHyperParameters();
----
tab:With Pipeline API[]
[source, java]
----
final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 4, 5, 6, 8).labeled(1);
TrainTestSplit<Integer, Vector> split = new TrainTestDatasetSplitter<Integer, Vector>()
.split(0.75);
DecisionTreeClassificationTrainer trainer = new DecisionTreeClassificationTrainer();
Pipeline<Integer, Vector, Integer, Double> pipeline = new Pipeline<Integer, Vector, Integer, Double>()
.addVectorizer(vectorizer)
.addPreprocessingTrainer(new ImputerTrainer<Integer, Vector>())
.addPreprocessingTrainer(new MinMaxScalerTrainer<Integer, Vector>())
.addTrainer(trainer);
CrossValidation<DecisionTreeNode, Integer, Vector> scoreCalculator = new CrossValidation<>();
ParamGrid paramGrid = new ParamGrid()
.addHyperParam("maxDeep", trainer::withMaxDeep, new Double[] {1.0, 2.0, 3.0, 4.0, 5.0, 10.0})
.addHyperParam("minImpurityDecrease", trainer::withMinImpurityDecrease, new Double[] {0.0, 0.25, 0.5});
scoreCalculator
.withIgnite(ignite)
.withUpstreamCache(dataCache)
.withPipeline(pipeline)
.withMetric(MetricName.ACCURACY)
.withFilter(split.getTrainFilter())
.withAmountOfFolds(3)
.withParamGrid(paramGrid);
CrossValidationResult crossValidationRes = scoreCalculator.tuneHyperParameters();
----
--
The full code can be found in the https://github.com/apache/ignite/blob/master/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid_and_pipeline.java[Titanic tutorial].