docs/_docs/machine-learning/model-selection/pipeline-api.adoc - ignite - Git at Google

 // Licensed to the Apache Software Foundation (ASF) under one or more
 // contributor license agreements.  See the NOTICE file distributed with
 // this work for additional information regarding copyright ownership.
 // The ASF licenses this file to You under the Apache License, Version 2.0
 // (the "License"); you may not use this file except in compliance with
 // the License.  You may obtain a copy of the License at
 //
 // http://www.apache.org/licenses/LICENSE-2.0
 //
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 = Pipelines API

 Apache Ignite ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Pipelines API, where the pipeline concept is mostly inspired by the scikit-learn and Apache Spark projects.

 * **Preprocessor Model** - An algorithm which can transform one DataSet into another DataSet.

 * **Preprocessor Trainer** - An algorithm which can be fit on a DataSet to produce a Preprocessor Model.

 * **Pipeline** - A Pipeline chains multiple Trainers and Preprocessors together to specify an ML workflow.

 * **Parameter** - All ML Trainers and Preprocessor Trainers now share a common API for specifying parameters.

 CAUTION: The Pipeline API is experimental and may change in future releases.


 A Pipeline can replace code that chains explicit `.fit()` method calls, as shown in the following examples:


 [tabs]
 --
 tab:Without Pipeline API[]

 [source, java]
 ----
 final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 3, 4, 5, 6, 8, 10).labeled(1);

 TrainTestSplit<Integer, Vector> split = new TrainTestDatasetSplitter<Integer, Vector>()
   .split(0.75);

 Preprocessor<Integer, Vector> imputingPreprocessor = new ImputerTrainer<Integer, Vector>()
   .fit(ignite,
        dataCache,
        vectorizer
       );

 Preprocessor<Integer, Vector> minMaxScalerPreprocessor = new MinMaxScalerTrainer<Integer, Vector>()
   .fit(ignite,
        dataCache,
        imputingPreprocessor
       );

 Preprocessor<Integer, Vector> normalizationPreprocessor = new NormalizationTrainer<Integer, Vector>()
   .withP(1)
   .fit(ignite,
        dataCache,
        minMaxScalerPreprocessor
       );

 // Tune hyper-parameters with K-fold Cross-Validation on the split training set.

 DecisionTreeClassificationTrainer trainerCV = new DecisionTreeClassificationTrainer();

 CrossValidation<DecisionTreeNode, Integer, Vector> scoreCalculator = new CrossValidation<>();

 ParamGrid paramGrid = new ParamGrid()
   .addHyperParam("maxDeep", trainerCV::withMaxDeep, new Double[] {1.0, 2.0, 3.0, 4.0, 5.0, 10.0})
   .addHyperParam("minImpurityDecrease", trainerCV::withMinImpurityDecrease, new Double[] {0.0, 0.25, 0.5});

 scoreCalculator
   .withIgnite(ignite)
   .withUpstreamCache(dataCache)
   .withTrainer(trainerCV)
   .withMetric(MetricName.ACCURACY)
   .withFilter(split.getTrainFilter())
   .isRunningOnPipeline(false)
   .withPreprocessor(normalizationPreprocessor)
   .withAmountOfFolds(3)
   .withParamGrid(paramGrid);

 CrossValidationResult crossValidationRes = scoreCalculator.tuneHyperParameters();
 ----

 tab:With Pipeline API[]

 [source, java]
 ----
 final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 4, 5, 6, 8).labeled(1);

 TrainTestSplit<Integer, Vector> split = new TrainTestDatasetSplitter<Integer, Vector>()
   .split(0.75);

 DecisionTreeClassificationTrainer trainer = new DecisionTreeClassificationTrainer();

 Pipeline<Integer, Vector, Integer, Double> pipeline = new Pipeline<Integer, Vector, Integer, Double>()
   .addVectorizer(vectorizer)
   .addPreprocessingTrainer(new ImputerTrainer<Integer, Vector>())
   .addPreprocessingTrainer(new MinMaxScalerTrainer<Integer, Vector>())
   .addTrainer(trainer);

 CrossValidation<DecisionTreeNode, Integer, Vector> scoreCalculator = new CrossValidation<>();

 ParamGrid paramGrid = new ParamGrid()
   .addHyperParam("maxDeep", trainer::withMaxDeep, new Double[] {1.0, 2.0, 3.0, 4.0, 5.0, 10.0})
   .addHyperParam("minImpurityDecrease", trainer::withMinImpurityDecrease, new Double[] {0.0, 0.25, 0.5});

 scoreCalculator
   .withIgnite(ignite)
   .withUpstreamCache(dataCache)
   .withPipeline(pipeline)
   .withMetric(MetricName.ACCURACY)
   .withFilter(split.getTrainFilter())
   .withAmountOfFolds(3)
   .withParamGrid(paramGrid);


 CrossValidationResult crossValidationRes = scoreCalculator.tuneHyperParameters();
 ----
 --

 The full code can be found in the https://github.com/apache/ignite/blob/master/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid_and_pipeline.java[Titanic tutorial].
	// Licensed to the Apache Software Foundation (ASF) under one or more
	// contributor license agreements. See the NOTICE file distributed with
	// this work for additional information regarding copyright ownership.
	// The ASF licenses this file to You under the Apache License, Version 2.0
	// (the "License"); you may not use this file except in compliance with
	// the License. You may obtain a copy of the License at
	//
	// http://www.apache.org/licenses/LICENSE-2.0
	//
	// Unless required by applicable law or agreed to in writing, software
	// distributed under the License is distributed on an "AS IS" BASIS,
	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	// See the License for the specific language governing permissions and
	// limitations under the License.
	= Pipelines API

	Apache Ignite ML standardizes APIs for machine learning algorithms to make it easier to combine multiple algorithms into a single pipeline, or workflow. This section covers the key concepts introduced by the Pipelines API, where the pipeline concept is mostly inspired by the scikit-learn and Apache Spark projects.

	* Preprocessor Model - This is an algorithm which can transform one DataSet into another DataSet.

	* Preprocessor Trainer - This is an algorithm which can be fit on a DataSet to produce a Preprocessor Model.

	* Pipeline - A Pipeline chains multiple Trainers and Preprocessors together to specify an ML workflow.

	* Parameter - All ML Trainers and Preprocessor Trainers now share a common API for specifying parameters.

	CAUTION: The Pipeline API is experimental and may change in future releases.


	A Pipeline can replace code that chains explicit `.fit()` method calls, as shown in the following examples:


	[tabs]
	--
	tab:Without Pipeline API[]

	[source, java]
	----
	final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 3, 4, 5, 6, 8, 10).labeled(1);

	TrainTestSplit<Integer, Vector> split = new TrainTestDatasetSplitter<Integer, Vector>()
	.split(0.75);

	Preprocessor<Integer, Vector> imputingPreprocessor = new ImputerTrainer<Integer, Vector>()
	.fit(ignite,
	dataCache,
	vectorizer
	);

	Preprocessor<Integer, Vector> minMaxScalerPreprocessor = new MinMaxScalerTrainer<Integer, Vector>()
	.fit(ignite,
	dataCache,
	imputingPreprocessor
	);

	Preprocessor<Integer, Vector> normalizationPreprocessor = new NormalizationTrainer<Integer, Vector>()
	.withP(1)
	.fit(ignite,
	dataCache,
	minMaxScalerPreprocessor
	);

	// Tune hyper-parameters with K-fold Cross-Validation on the split training set.

	DecisionTreeClassificationTrainer trainerCV = new DecisionTreeClassificationTrainer();

	CrossValidation<DecisionTreeNode, Integer, Vector> scoreCalculator = new CrossValidation<>();

	ParamGrid paramGrid = new ParamGrid()
	.addHyperParam("maxDeep", trainerCV::withMaxDeep, new Double[] {1.0, 2.0, 3.0, 4.0, 5.0, 10.0})
	.addHyperParam("minImpurityDecrease", trainerCV::withMinImpurityDecrease, new Double[] {0.0, 0.25, 0.5});

	scoreCalculator
	.withIgnite(ignite)
	.withUpstreamCache(dataCache)
	.withTrainer(trainerCV)
	.withMetric(MetricName.ACCURACY)
	.withFilter(split.getTrainFilter())
	.isRunningOnPipeline(false)
	.withPreprocessor(normalizationPreprocessor)
	.withAmountOfFolds(3)
	.withParamGrid(paramGrid);

	CrossValidationResult crossValidationRes = scoreCalculator.tuneHyperParameters();
	----

	tab:With Pipeline API[]

	[source, java]
	----
	final Vectorizer<Integer, Vector, Integer, Double> vectorizer = new DummyVectorizer<Integer>(0, 4, 5, 6, 8).labeled(1);

	TrainTestSplit<Integer, Vector> split = new TrainTestDatasetSplitter<Integer, Vector>()
	.split(0.75);

	DecisionTreeClassificationTrainer trainer = new DecisionTreeClassificationTrainer();

	Pipeline<Integer, Vector, Integer, Double> pipeline = new Pipeline<Integer, Vector, Integer, Double>()
	.addVectorizer(vectorizer)
	.addPreprocessingTrainer(new ImputerTrainer<Integer, Vector>())
	.addPreprocessingTrainer(new MinMaxScalerTrainer<Integer, Vector>())
	.addTrainer(trainer);

	CrossValidation<DecisionTreeNode, Integer, Vector> scoreCalculator = new CrossValidation<>();

	ParamGrid paramGrid = new ParamGrid()
	.addHyperParam("maxDeep", trainer::withMaxDeep, new Double[] {1.0, 2.0, 3.0, 4.0, 5.0, 10.0})
	.addHyperParam("minImpurityDecrease", trainer::withMinImpurityDecrease, new Double[] {0.0, 0.25, 0.5});

	scoreCalculator
	.withIgnite(ignite)
	.withUpstreamCache(dataCache)
	.withPipeline(pipeline)
	.withMetric(MetricName.ACCURACY)
	.withFilter(split.getTrainFilter())
	.withAmountOfFolds(3)
	.withParamGrid(paramGrid);


	CrossValidationResult crossValidationRes = scoreCalculator.tuneHyperParameters();
	----
	--

	The full code can be found in the https://github.com/apache/ignite/blob/master/examples/src/main/java/org/apache/ignite/examples/ml/tutorial/Step_8_CV_with_Param_Grid_and_pipeline.java[Titanic tutorial].