python/pyspark/ml/param/_shared_params_code_gen.py - spark - Git at Google

 #
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #

 from typing import Optional

 # License banner text; the __main__ section prints it before anything else, so it
 # becomes the first lines of the generated output.
 header = """#
 # Licensed to the Apache Software Foundation (ASF) under one or more
 # contributor license agreements.  See the NOTICE file distributed with
 # this work for additional information regarding copyright ownership.
 # The ASF licenses this file to You under the Apache License, Version 2.0
 # (the "License"); you may not use this file except in compliance with
 # the License.  You may obtain a copy of the License at
 #
 #    http://www.apache.org/licenses/LICENSE-2.0
 #
 # Unless required by applicable law or agreed to in writing, software
 # distributed under the License is distributed on an "AS IS" BASIS,
 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 # See the License for the specific language governing permissions and
 # limitations under the License.
 #"""

 # Code generator for shared params (shared.py). Run under this folder with:
 # python _shared_params_code_gen.py > shared.py

 # Maps the type-converter expression named in a shared-param spec to the type
 # annotation text used for that param in the generated code.
 _type_for_type_converter = {
     f"TypeConverters.{converter}": annotation
     for converter, annotation in (
         ("toBoolean", "bool"),
         ("toFloat", "float"),
         ("toInt", "int"),
         ("toListFloat", "List[float]"),
         ("toListInt", "List[int]"),
         ("toListString", "List[str]"),
         ("toString", "str"),
     )
 }


 def _gen_param_header(
     name: str, doc: str, defaultValueStr: Optional[str], typeConverter: str, paramType: str
 ) -> str:
     """
     Generates the class statement, docstring, ``Param`` attribute and ``__init__`` of
     one shared-param mixin.

     :param name: param name; the class is named ``Has`` followed by the name with its
                  first character upper-cased
     :param doc: param doc, placed in the class docstring and in the ``Param`` declaration
     :param defaultValueStr: source text of the default value, or None for no default;
                             when given, a ``_setDefault`` call is appended to ``__init__``
     :param typeConverter: source text of the type converter passed to ``Param``
     :param paramType: type annotation text placed inside ``Param[...]``
     :return: code string
     """
     mixin_name = "Has" + name[0].upper() + name[1:]

     class_source = f'''class {mixin_name}(Params):
     """
     Mixin for param {name}: {doc}
     """

     {name}: "Param[{paramType}]" = Param(
         Params._dummy(),
         "{name}",
         "{doc}",
         typeConverter={typeConverter},
     )

     def __init__(self) -> None:
         super({mixin_name}, self).__init__()'''

     # Without a default there is nothing to add to the generated __init__.
     if defaultValueStr is None:
         return class_source

     return class_source + f"""
         self._setDefault({name}={defaultValueStr})"""


 def _gen_param_code(name: str, paramType: str) -> str:
     """
     Generates the getter method of a shared param class.

     The method is named ``get`` followed by the param name with its first character
     upper-cased, and returns ``self.getOrDefault`` of the param.

     :param name: param name
     :param paramType: return annotation text of the generated getter
     :return: code string, starting with a newline
     """
     # TODO: How to correctly inherit instance attributes?
     getter_name = "get" + name[0].upper() + name[1:]
     return f'''
     def {getter_name}(self) -> {paramType}:
         """
         Gets the value of {name} or its default value.
         """
         return self.getOrDefault(self.{name})'''


 if __name__ == "__main__":
     # Preamble of the generated module: license banner, do-not-edit notice, imports.
     for preamble in (
         header,
         "\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n",
         "from typing import List\n",
         "from pyspark.ml.param import Param, Params, TypeConverters\n\n",
     ):
         print(preamble)

     # One spec per shared param:
     # (name, doc, default value as source text or None, type converter as source text).
     shared = [
         (
             "maxIter",
             "max number of iterations (>= 0).",
             None,
             "TypeConverters.toInt",
         ),
         (
             "regParam",
             "regularization parameter (>= 0).",
             None,
             "TypeConverters.toFloat",
         ),
         (
             "featuresCol",
             "features column name.",
             '"features"',
             "TypeConverters.toString",
         ),
         (
             "labelCol",
             "label column name.",
             '"label"',
             "TypeConverters.toString",
         ),
         (
             "predictionCol",
             "prediction column name.",
             '"prediction"',
             "TypeConverters.toString",
         ),
         (
             "probabilityCol",
             "Column name for predicted class conditional probabilities. "
             + "Note: Not all models output well-calibrated probability estimates! "
             + "These probabilities should be treated as confidences, not precise probabilities.",
             '"probability"',
             "TypeConverters.toString",
         ),
         (
             "rawPredictionCol",
             "raw prediction (a.k.a. confidence) column name.",
             '"rawPrediction"',
             "TypeConverters.toString",
         ),
         (
             "inputCol",
             "input column name.",
             None,
             "TypeConverters.toString",
         ),
         (
             "inputCols",
             "input column names.",
             None,
             "TypeConverters.toListString",
         ),
         (
             "outputCol",
             "output column name.",
             'self.uid + "__output"',
             "TypeConverters.toString",
         ),
         (
             "outputCols",
             "output column names.",
             None,
             "TypeConverters.toListString",
         ),
         (
             "numFeatures",
             "Number of features. Should be greater than 0.",
             "262144",
             "TypeConverters.toInt",
         ),
         (
             "checkpointInterval",
             "set checkpoint interval (>= 1) or disable checkpoint (-1). "
             + "E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: "
             + "this setting will be ignored if the checkpoint directory is not set in "
             + "the SparkContext.",
             None,
             "TypeConverters.toInt",
         ),
         (
             "seed",
             "random seed.",
             "hash(type(self).__name__)",
             "TypeConverters.toInt",
         ),
         (
             "tol",
             "the convergence tolerance for iterative algorithms (>= 0).",
             None,
             "TypeConverters.toFloat",
         ),
         (
             "relativeError",
             "the relative target precision for the approximate quantile "
             + "algorithm. Must be in the range [0, 1]",
             "0.001",
             "TypeConverters.toFloat",
         ),
         (
             "stepSize",
             "Step size to be used for each iteration of optimization (>= 0).",
             None,
             "TypeConverters.toFloat",
         ),
         (
             "handleInvalid",
             "how to handle invalid entries. Options are skip (which will filter "
             + "out rows with bad values), or error (which will throw an error). "
             + "More options may be added later.",
             None,
             "TypeConverters.toString",
         ),
         (
             "elasticNetParam",
             "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, "
             + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
             "0.0",
             "TypeConverters.toFloat",
         ),
         (
             "fitIntercept",
             "whether to fit an intercept term.",
             "True",
             "TypeConverters.toBoolean",
         ),
         (
             "standardization",
             "whether to standardize the training features before fitting the " + "model.",
             "True",
             "TypeConverters.toBoolean",
         ),
         (
             "thresholds",
             "Thresholds in multi-class classification to adjust the probability of "
             + "predicting each class. Array must have length equal to the number of classes, with "
             + "values > 0, excepting that at most one value may be 0. "
             + "The class with largest value p/t is predicted, where p is the original "
             + "probability of that class and t is the class's threshold.",
             None,
             "TypeConverters.toListFloat",
         ),
         (
             "threshold",
             "threshold in binary classification prediction, in range [0, 1]",
             "0.5",
             "TypeConverters.toFloat",
         ),
         (
             "weightCol",
             "weight column name. If this is not set or empty, we treat "
             + "all instance weights as 1.0.",
             None,
             "TypeConverters.toString",
         ),
         (
             "solver",
             "the solver algorithm for optimization. If this is not set or empty, "
             + "default value is 'auto'.",
             '"auto"',
             "TypeConverters.toString",
         ),
         (
             "varianceCol",
             "column name for the biased sample variance of prediction.",
             None,
             "TypeConverters.toString",
         ),
         (
             "aggregationDepth",
             "suggested depth for treeAggregate (>= 2).",
             "2",
             "TypeConverters.toInt",
         ),
         (
             "parallelism",
             "the number of threads to use when running parallel algorithms (>= 1).",
             "1",
             "TypeConverters.toInt",
         ),
         (
             "collectSubModels",
             "Param for whether to collect a list of sub-models trained during "
             + "tuning. If set to false, then only the single best sub-model will be available "
             + "after fitting. If set to true, then all sub-models will be available. Warning: "
             + "For large models, collecting all sub-models can cause OOMs on the Spark driver.",
             "False",
             "TypeConverters.toBoolean",
         ),
         (
             "loss",
             "the loss function to be optimized.",
             None,
             "TypeConverters.toString",
         ),
         (
             "distanceMeasure",
             "the distance measure. Supported options: 'euclidean' and 'cosine'.",
             '"euclidean"',
             "TypeConverters.toString",
         ),
         (
             "validationIndicatorCol",
             "name of the column that indicates whether each row is for "
             + "training or for validation. False indicates training; true indicates validation.",
             None,
             "TypeConverters.toString",
         ),
         (
             "blockSize",
             "block size for stacking input data in matrices. Data is stacked within "
             "partitions. If block size is more than remaining data in a partition then it is "
             "adjusted to the size of this data.",
             None,
             "TypeConverters.toInt",
         ),
         (
             "maxBlockSizeInMB",
             "maximum memory in MB for stacking input data into blocks. Data is "
             + "stacked within partitions. If more than remaining data size in a partition then it "
             + "is adjusted to the data size. Default 0.0 represents choosing optimal value, "
             + "depends on specific algorithm. Must be >= 0.",
             "0.0",
             "TypeConverters.toFloat",
         ),
         (
             "numTrainWorkers",
             "number of training workers",
             "1",
             "TypeConverters.toInt",
         ),
         (
             "batchSize",
             "number of training batch size",
             None,
             "TypeConverters.toInt",
         ),
         (
             "learningRate",
             "learning rate for training",
             None,
             "TypeConverters.toFloat",
         ),
         (
             "momentum",
             "momentum for training optimizer",
             None,
             "TypeConverters.toFloat",
         ),
         (
             "featureSizes",
             "input feature size list for input columns of vector assembler",
             None,
             "TypeConverters.toListInt",
         ),
     ]

     # Render every spec as its mixin header followed by its getter, then print the
     # classes separated by blank lines.
     rendered = []
     for param_name, param_doc, default_str, converter in shared:
         # A converter missing from the lookup table is annotated as "None".
         annotation = _type_for_type_converter.get(converter, "None")
         mixin = _gen_param_header(param_name, param_doc, default_str, converter, annotation)
         getter = _gen_param_code(param_name, annotation)
         rendered.append("\n".join((mixin, getter)))

     print("\n\n\n".join(rendered))
	#
	# Licensed to the Apache Software Foundation (ASF) under one or more
	# contributor license agreements. See the NOTICE file distributed with
	# this work for additional information regarding copyright ownership.
	# The ASF licenses this file to You under the Apache License, Version 2.0
	# (the "License"); you may not use this file except in compliance with
	# the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	#

	from typing import Optional

	# License banner text; the __main__ section prints it before anything else, so it
	# becomes the first lines of the generated output. It is assembled from explicit
	# lines so that the indentation of this file can never leak into the emitted text,
	# and it uses the standard ASF header spacing.
	header = "\n".join(
	    (
	        "#",
	        "# Licensed to the Apache Software Foundation (ASF) under one or more",
	        "# contributor license agreements.  See the NOTICE file distributed with",
	        "# this work for additional information regarding copyright ownership.",
	        "# The ASF licenses this file to You under the Apache License, Version 2.0",
	        '# (the "License"); you may not use this file except in compliance with',
	        "# the License.  You may obtain a copy of the License at",
	        "#",
	        "#    http://www.apache.org/licenses/LICENSE-2.0",
	        "#",
	        "# Unless required by applicable law or agreed to in writing, software",
	        '# distributed under the License is distributed on an "AS IS" BASIS,',
	        "# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.",
	        "# See the License for the specific language governing permissions and",
	        "# limitations under the License.",
	        "#",
	    )
	)

	# Code generator for shared params (shared.py). Run under this folder with:
	# python _shared_params_code_gen.py > shared.py

	# Maps the type-converter expression named in a shared-param spec to the type
	# annotation text used for that param in the generated code.
	_type_for_type_converter = {
	    f"TypeConverters.{converter}": annotation
	    for converter, annotation in (
	        ("toBoolean", "bool"),
	        ("toFloat", "float"),
	        ("toInt", "int"),
	        ("toListFloat", "List[float]"),
	        ("toListInt", "List[int]"),
	        ("toListString", "List[str]"),
	        ("toString", "str"),
	    )
	}


	def _gen_param_header(
	    name: str, doc: str, defaultValueStr: Optional[str], typeConverter: str, paramType: str
	) -> str:
	    """
	    Generates the class statement, docstring, ``Param`` attribute and ``__init__`` of
	    one shared-param mixin.

	    :param name: param name; the class is named ``Has`` followed by the name with its
	                 first character upper-cased
	    :param doc: param doc, placed in the class docstring and in the ``Param`` declaration
	    :param defaultValueStr: source text of the default value, or None for no default;
	                            when given, a ``_setDefault`` call is appended to ``__init__``
	    :param typeConverter: source text of the type converter passed to ``Param``
	    :param paramType: type annotation text placed inside ``Param[...]``
	    :return: code string
	    """
	    Name = f"Has{name[0].upper()}{name[1:]}"

	    # Every generated line carries its own explicit indentation (4 spaces per level),
	    # so the emitted class is valid Python regardless of how this file is indented.
	    lines = [
	        f"class {Name}(Params):",
	        '    """',
	        f"    Mixin for param {name}: {doc}",
	        '    """',
	        "",
	        f'    {name}: "Param[{paramType}]" = Param(',
	        "        Params._dummy(),",
	        f'        "{name}",',
	        f'        "{doc}",',
	        f"        typeConverter={typeConverter},",
	        "    )",
	        "",
	        "    def __init__(self) -> None:",
	        f"        super({Name}, self).__init__()",
	    ]

	    if defaultValueStr is not None:
	        # The default is registered inside the generated __init__.
	        lines.append(f"        self._setDefault({name}={defaultValueStr})")

	    return "\n".join(lines)


	def _gen_param_code(name: str, paramType: str) -> str:
	    """
	    Generates the getter method of a shared param class.

	    The method is named ``get`` followed by the param name with its first character
	    upper-cased, and returns ``self.getOrDefault`` of the param.

	    :param name: param name
	    :param paramType: return annotation text of the generated getter
	    :return: code string, starting with a newline and indented as a class member
	    """
	    # TODO: How to correctly inherit instance attributes?
	    # Every generated line carries its own explicit indentation so the method nests
	    # correctly inside the generated class.
	    lines = [
	        "",
	        f"    def get{name[0].upper()}{name[1:]}(self) -> {paramType}:",
	        '        """',
	        f"        Gets the value of {name} or its default value.",
	        '        """',
	        f"        return self.getOrDefault(self.{name})",
	    ]
	    return "\n".join(lines)


	if __name__ == "__main__":
	    # Preamble of the generated module: license banner, do-not-edit notice, imports.
	    print(header)
	    print("\n# DO NOT MODIFY THIS FILE! It was generated by _shared_params_code_gen.py.\n")
	    print("from typing import List\n")
	    print("from pyspark.ml.param import Param, Params, TypeConverters\n\n")

	    # One spec per shared param:
	    # (name, doc, default value as source text or None, type converter as source text).
	    shared = [
	        (
	            "maxIter",
	            "max number of iterations (>= 0).",
	            None,
	            "TypeConverters.toInt",
	        ),
	        (
	            "regParam",
	            "regularization parameter (>= 0).",
	            None,
	            "TypeConverters.toFloat",
	        ),
	        (
	            "featuresCol",
	            "features column name.",
	            '"features"',
	            "TypeConverters.toString",
	        ),
	        (
	            "labelCol",
	            "label column name.",
	            '"label"',
	            "TypeConverters.toString",
	        ),
	        (
	            "predictionCol",
	            "prediction column name.",
	            '"prediction"',
	            "TypeConverters.toString",
	        ),
	        (
	            "probabilityCol",
	            "Column name for predicted class conditional probabilities. "
	            + "Note: Not all models output well-calibrated probability estimates! "
	            + "These probabilities should be treated as confidences, not precise probabilities.",
	            '"probability"',
	            "TypeConverters.toString",
	        ),
	        (
	            "rawPredictionCol",
	            "raw prediction (a.k.a. confidence) column name.",
	            '"rawPrediction"',
	            "TypeConverters.toString",
	        ),
	        (
	            "inputCol",
	            "input column name.",
	            None,
	            "TypeConverters.toString",
	        ),
	        (
	            "inputCols",
	            "input column names.",
	            None,
	            "TypeConverters.toListString",
	        ),
	        (
	            "outputCol",
	            "output column name.",
	            'self.uid + "__output"',
	            "TypeConverters.toString",
	        ),
	        (
	            "outputCols",
	            "output column names.",
	            None,
	            "TypeConverters.toListString",
	        ),
	        (
	            "numFeatures",
	            "Number of features. Should be greater than 0.",
	            "262144",
	            "TypeConverters.toInt",
	        ),
	        (
	            "checkpointInterval",
	            "set checkpoint interval (>= 1) or disable checkpoint (-1). "
	            + "E.g. 10 means that the cache will get checkpointed every 10 iterations. Note: "
	            + "this setting will be ignored if the checkpoint directory is not set in "
	            + "the SparkContext.",
	            None,
	            "TypeConverters.toInt",
	        ),
	        (
	            "seed",
	            "random seed.",
	            "hash(type(self).__name__)",
	            "TypeConverters.toInt",
	        ),
	        (
	            "tol",
	            "the convergence tolerance for iterative algorithms (>= 0).",
	            None,
	            "TypeConverters.toFloat",
	        ),
	        (
	            "relativeError",
	            "the relative target precision for the approximate quantile "
	            + "algorithm. Must be in the range [0, 1]",
	            "0.001",
	            "TypeConverters.toFloat",
	        ),
	        (
	            "stepSize",
	            "Step size to be used for each iteration of optimization (>= 0).",
	            None,
	            "TypeConverters.toFloat",
	        ),
	        (
	            "handleInvalid",
	            "how to handle invalid entries. Options are skip (which will filter "
	            + "out rows with bad values), or error (which will throw an error). "
	            + "More options may be added later.",
	            None,
	            "TypeConverters.toString",
	        ),
	        (
	            "elasticNetParam",
	            "the ElasticNet mixing parameter, in range [0, 1]. For alpha = 0, "
	            + "the penalty is an L2 penalty. For alpha = 1, it is an L1 penalty.",
	            "0.0",
	            "TypeConverters.toFloat",
	        ),
	        (
	            "fitIntercept",
	            "whether to fit an intercept term.",
	            "True",
	            "TypeConverters.toBoolean",
	        ),
	        (
	            "standardization",
	            "whether to standardize the training features before fitting the " + "model.",
	            "True",
	            "TypeConverters.toBoolean",
	        ),
	        (
	            "thresholds",
	            "Thresholds in multi-class classification to adjust the probability of "
	            + "predicting each class. Array must have length equal to the number of classes, with "
	            + "values > 0, excepting that at most one value may be 0. "
	            + "The class with largest value p/t is predicted, where p is the original "
	            + "probability of that class and t is the class's threshold.",
	            None,
	            "TypeConverters.toListFloat",
	        ),
	        (
	            "threshold",
	            "threshold in binary classification prediction, in range [0, 1]",
	            "0.5",
	            "TypeConverters.toFloat",
	        ),
	        (
	            "weightCol",
	            "weight column name. If this is not set or empty, we treat "
	            + "all instance weights as 1.0.",
	            None,
	            "TypeConverters.toString",
	        ),
	        (
	            "solver",
	            "the solver algorithm for optimization. If this is not set or empty, "
	            + "default value is 'auto'.",
	            '"auto"',
	            "TypeConverters.toString",
	        ),
	        (
	            "varianceCol",
	            "column name for the biased sample variance of prediction.",
	            None,
	            "TypeConverters.toString",
	        ),
	        (
	            "aggregationDepth",
	            "suggested depth for treeAggregate (>= 2).",
	            "2",
	            "TypeConverters.toInt",
	        ),
	        (
	            "parallelism",
	            "the number of threads to use when running parallel algorithms (>= 1).",
	            "1",
	            "TypeConverters.toInt",
	        ),
	        (
	            "collectSubModels",
	            "Param for whether to collect a list of sub-models trained during "
	            + "tuning. If set to false, then only the single best sub-model will be available "
	            + "after fitting. If set to true, then all sub-models will be available. Warning: "
	            + "For large models, collecting all sub-models can cause OOMs on the Spark driver.",
	            "False",
	            "TypeConverters.toBoolean",
	        ),
	        (
	            "loss",
	            "the loss function to be optimized.",
	            None,
	            "TypeConverters.toString",
	        ),
	        (
	            "distanceMeasure",
	            "the distance measure. Supported options: 'euclidean' and 'cosine'.",
	            '"euclidean"',
	            "TypeConverters.toString",
	        ),
	        (
	            "validationIndicatorCol",
	            "name of the column that indicates whether each row is for "
	            + "training or for validation. False indicates training; true indicates validation.",
	            None,
	            "TypeConverters.toString",
	        ),
	        (
	            "blockSize",
	            "block size for stacking input data in matrices. Data is stacked within "
	            "partitions. If block size is more than remaining data in a partition then it is "
	            "adjusted to the size of this data.",
	            None,
	            "TypeConverters.toInt",
	        ),
	        (
	            "maxBlockSizeInMB",
	            "maximum memory in MB for stacking input data into blocks. Data is "
	            + "stacked within partitions. If more than remaining data size in a partition then it "
	            + "is adjusted to the data size. Default 0.0 represents choosing optimal value, "
	            + "depends on specific algorithm. Must be >= 0.",
	            "0.0",
	            "TypeConverters.toFloat",
	        ),
	        (
	            "numTrainWorkers",
	            "number of training workers",
	            "1",
	            "TypeConverters.toInt",
	        ),
	        (
	            "batchSize",
	            "number of training batch size",
	            None,
	            "TypeConverters.toInt",
	        ),
	        (
	            "learningRate",
	            "learning rate for training",
	            None,
	            "TypeConverters.toFloat",
	        ),
	        (
	            "momentum",
	            "momentum for training optimizer",
	            None,
	            "TypeConverters.toFloat",
	        ),
	        (
	            "featureSizes",
	            "input feature size list for input columns of vector assembler",
	            None,
	            "TypeConverters.toListInt",
	        ),
	    ]

	    # Render every spec as its mixin header followed by its getter, then print the
	    # classes separated by blank lines.
	    code = []
	    for name, doc, defaultValueStr, typeConverter in shared:
	        # A converter missing from the lookup table is annotated as "None".
	        paramType = _type_for_type_converter.get(typeConverter, "None")

	        param_code = _gen_param_header(name, doc, defaultValueStr, typeConverter, paramType)
	        code.append(param_code + "\n" + _gen_param_code(name, paramType))

	    print("\n\n\n".join(code))