examples/src/main/python/ml/simple_params_example.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 from __future__ import print_function

 import pprint
 import sys

 from pyspark import SparkContext
 from pyspark.ml.classification import LogisticRegression
 from pyspark.mllib.linalg import DenseVector
 from pyspark.mllib.regression import LabeledPoint
 from pyspark.sql import SQLContext

 """
 A simple example demonstrating ways to specify parameters for Estimators and Transformers.
 Run with:
   bin/spark-submit examples/src/main/python/ml/simple_params_example.py
 """

 if __name__ == "__main__":
     # The example takes no command-line arguments; reject any that are supplied.
     if len(sys.argv) > 1:
         print("Usage: simple_params_example", file=sys.stderr)
         # Use sys.exit() rather than the bare exit() helper, which is only injected by the
         # site module and is intended for interactive sessions.
         sys.exit(1)

     sc = SparkContext(appName="PythonSimpleParamsExample")
     try:
         # NOTE(review): sqlContext is never referenced again in this script; presumably
         # constructing it is what makes RDD.toDF() usable below -- confirm before removing.
         sqlContext = SQLContext(sc)

         # Prepare training data.
         # We create an RDD of LabeledPoints and convert them into a DataFrame.
         # A LabeledPoint is an object with two fields named label and features,
         # and Spark SQL identifies these fields and creates the schema appropriately.
         training = sc.parallelize([
             LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
             LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
             LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
             LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF()

         # Create a LogisticRegression instance with maxIter = 10.
         # This instance is an Estimator.
         lr = LogisticRegression(maxIter=10)
         # Print out the parameters, documentation, and any default values.
         print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

         # We may also set parameters using setter methods.
         lr.setRegParam(0.01)

         # Learn a LogisticRegression model.  This uses the parameters stored in lr.
         model1 = lr.fit(training)

         # Since model1 is a Model (i.e., a Transformer produced by an Estimator),
         # we can view the parameters it used during fit().
         # This prints the parameter (name: value) pairs, where names are unique IDs
         # for this LogisticRegression instance.
         print("Model 1 was fit using parameters:\n")
         pprint.pprint(model1.extractParamMap())

         # We may alternatively specify parameters using a parameter map.
         # Values in paramMap take precedence over values set earlier on lr for the
         # same parameters (here: maxIter); it also sets thresholds and probabilityCol.
         paramMap = {
             lr.maxIter: 20,
             lr.thresholds: [0.45, 0.55],
             lr.probabilityCol: "myProbability",
         }

         # Now learn a new model using the new parameters.
         model2 = lr.fit(training, paramMap)
         print("Model 2 was fit using parameters:\n")
         pprint.pprint(model2.extractParamMap())

         # Prepare test data.
         test = sc.parallelize([
             LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])),
             LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])),
             LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF()

         # Make predictions on test data using the Transformer.transform() method.
         # LogisticRegressionModel.transform will only use the 'features' column.
         # Note that model2.transform() outputs a 'myProbability' column instead of the
         # usual 'probability' column since we renamed the lr.probabilityCol parameter
         # previously.
         result = model2.transform(test) \
             .select("features", "label", "myProbability", "prediction") \
             .collect()

         for row in result:
             print("features=%s,label=%s -> prob=%s, prediction=%s"
                   % (row.features, row.label, row.myProbability, row.prediction))
     finally:
         # Stop the SparkContext even if one of the steps above raised.
         sc.stop()
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	from __future__ import print_function

	import pprint
	import sys

	from pyspark import SparkContext
	from pyspark.ml.classification import LogisticRegression
	from pyspark.mllib.linalg import DenseVector
	from pyspark.mllib.regression import LabeledPoint
	from pyspark.sql import SQLContext

	"""
	A simple example demonstrating ways to specify parameters for Estimators and Transformers.
	Run with:
	bin/spark-submit examples/src/main/python/ml/simple_params_example.py
	"""

	if __name__ == "__main__":
	    # The example takes no command-line arguments; reject any that are supplied.
	    if len(sys.argv) > 1:
	        print("Usage: simple_params_example", file=sys.stderr)
	        # Use sys.exit() rather than the bare exit() helper, which is only injected by
	        # the site module and is intended for interactive sessions.
	        sys.exit(1)

	    sc = SparkContext(appName="PythonSimpleParamsExample")
	    try:
	        # NOTE(review): sqlContext is never referenced again in this script; presumably
	        # constructing it is what makes RDD.toDF() usable below -- confirm before removing.
	        sqlContext = SQLContext(sc)

	        # Prepare training data.
	        # We create an RDD of LabeledPoints and convert them into a DataFrame.
	        # A LabeledPoint is an object with two fields named label and features,
	        # and Spark SQL identifies these fields and creates the schema appropriately.
	        training = sc.parallelize([
	            LabeledPoint(1.0, DenseVector([0.0, 1.1, 0.1])),
	            LabeledPoint(0.0, DenseVector([2.0, 1.0, -1.0])),
	            LabeledPoint(0.0, DenseVector([2.0, 1.3, 1.0])),
	            LabeledPoint(1.0, DenseVector([0.0, 1.2, -0.5]))]).toDF()

	        # Create a LogisticRegression instance with maxIter = 10.
	        # This instance is an Estimator.
	        lr = LogisticRegression(maxIter=10)
	        # Print out the parameters, documentation, and any default values.
	        print("LogisticRegression parameters:\n" + lr.explainParams() + "\n")

	        # We may also set parameters using setter methods.
	        lr.setRegParam(0.01)

	        # Learn a LogisticRegression model. This uses the parameters stored in lr.
	        model1 = lr.fit(training)

	        # Since model1 is a Model (i.e., a Transformer produced by an Estimator),
	        # we can view the parameters it used during fit().
	        # This prints the parameter (name: value) pairs, where names are unique IDs
	        # for this LogisticRegression instance.
	        print("Model 1 was fit using parameters:\n")
	        pprint.pprint(model1.extractParamMap())

	        # We may alternatively specify parameters using a parameter map.
	        # Values in paramMap take precedence over values set earlier on lr for the
	        # same parameters (here: maxIter); it also sets thresholds and probabilityCol.
	        paramMap = {
	            lr.maxIter: 20,
	            lr.thresholds: [0.45, 0.55],
	            lr.probabilityCol: "myProbability",
	        }

	        # Now learn a new model using the new parameters.
	        model2 = lr.fit(training, paramMap)
	        print("Model 2 was fit using parameters:\n")
	        pprint.pprint(model2.extractParamMap())

	        # Prepare test data.
	        test = sc.parallelize([
	            LabeledPoint(1.0, DenseVector([-1.0, 1.5, 1.3])),
	            LabeledPoint(0.0, DenseVector([3.0, 2.0, -0.1])),
	            LabeledPoint(0.0, DenseVector([0.0, 2.2, -1.5]))]).toDF()

	        # Make predictions on test data using the Transformer.transform() method.
	        # LogisticRegressionModel.transform will only use the 'features' column.
	        # Note that model2.transform() outputs a 'myProbability' column instead of the
	        # usual 'probability' column since we renamed the lr.probabilityCol parameter
	        # previously.
	        result = model2.transform(test) \
	            .select("features", "label", "myProbability", "prediction") \
	            .collect()

	        for row in result:
	            print("features=%s,label=%s -> prob=%s, prediction=%s"
	                  % (row.features, row.label, row.myProbability, row.prediction))
	    finally:
	        # Stop the SparkContext even if one of the steps above raised.
	        sc.stop()