flink-ml-python/pyflink/ml/feature/univariatefeatureselector.py - flink-ml - Git at Google

 #  Licensed to the Apache Software Foundation (ASF) under one
 #  or more contributor license agreements.  See the NOTICE file
 #  distributed with this work for additional information
 #  regarding copyright ownership.  The ASF licenses this file
 #  to you under the Apache License, Version 2.0 (the
 #  "License"); you may not use this file except in compliance
 #  with the License.  You may obtain a copy of the License at
 #
 #      http://www.apache.org/licenses/LICENSE-2.0
 #
 #  Unless required by applicable law or agreed to in writing, software
 #  distributed under the License is distributed on an "AS IS" BASIS,
 #  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #  See the License for the specific language governing permissions and
 #  limitations under the License.
 import typing
 from pyflink.ml.wrapper import JavaWithParams
 from pyflink.ml.param import StringParam, FloatParam
 from pyflink.ml.common.param import HasFeaturesCol, HasLabelCol, HasOutputCol
 from pyflink.ml.feature.common import JavaFeatureModel, JavaFeatureEstimator


 class _UnivariateFeatureSelectorModelParams(JavaWithParams, HasFeaturesCol, HasOutputCol):
     """
     Params for :class:`UnivariateFeatureSelectorModel`.

     Mixes :class:`HasFeaturesCol` and :class:`HasOutputCol` into :class:`JavaWithParams`;
     no additional params are declared here.
     """

     def __init__(self, java_params):
         """
         Hand ``java_params`` over to the parent initializers.

         :param java_params: passed through unchanged to ``super().__init__``.
         """
         super().__init__(java_params)

 class _UnivariateFeatureSelectorParams(HasLabelCol, _UnivariateFeatureSelectorModelParams):
     """
     Params for :class:`UnivariateFeatureSelector`.

     On top of the inherited model params and :class:`HasLabelCol`, this class declares the
     feature type, label type, selection mode and selection threshold params, each with a
     setter, a getter and a read-only property.
     """

     # Supported options of the feature type:
     #   - categorical: the features are categorical data.
     #   - continuous: the features are continuous data.
     FEATURE_TYPE: StringParam = StringParam("feature_type", "The feature type.", None)

     # Supported options of the label type:
     #   - categorical: the label is categorical data.
     #   - continuous: the label is continuous data.
     LABEL_TYPE: StringParam = StringParam("label_type", "The label type.", None)

     # Supported options of the feature selection mode:
     #   - numTopFeatures: chooses a fixed number of top features according to a hypothesis.
     #   - percentile: similar to numTopFeatures but chooses a fraction of all features
     #     instead of a fixed number.
     #   - fpr: chooses all features whose p-value are below a threshold, thus controlling
     #     the false positive rate of selection.
     #   - fdr: uses the Benjamini-Hochberg procedure to choose all features whose false
     #     discovery rate is below a threshold. See
     #     https://en.wikipedia.org/wiki/False_discovery_rate#Benjamini.E2.80.93Hochberg_procedure
     #   - fwe: chooses all features whose p-values are below a threshold. The threshold is
     #     scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.
     SELECTION_MODE: StringParam = StringParam(
         "selection_mode",
         "The feature selection mode.",
         "numTopFeatures",
     )

     # Declared with a default of None; the description text spells out which value is
     # substituted at runtime for each selection mode.
     SELECTION_THRESHOLD: FloatParam = FloatParam(
         "selection_threshold",
         "The upper bound of the features that selector will select. If not set, it will be "
         "replaced with a meaningful value according to different selection modes at runtime. "
         "When the mode is numTopFeatures, it will be replaced with 50; when the mode is "
         "percentile, it will be replaced with 0.1; otherwise, it will be replaced with 0.05.",
         None,
     )

     def __init__(self, java_params):
         """
         Hand ``java_params`` over to the parent initializers.

         :param java_params: passed through unchanged to ``super().__init__``.
         """
         super().__init__(java_params)

     def set_feature_type(self, value: str):
         """
         Store ``value`` under :attr:`FEATURE_TYPE`.

         :return: the result of ``self.set``, typed as this params class.
         """
         updated = self.set(self.FEATURE_TYPE, value)
         return typing.cast(_UnivariateFeatureSelectorParams, updated)

     def get_feature_type(self) -> str:
         """Return the value currently held by :attr:`FEATURE_TYPE`."""
         return self.get(self.FEATURE_TYPE)

     def set_label_type(self, value: str):
         """
         Store ``value`` under :attr:`LABEL_TYPE`.

         :return: the result of ``self.set``, typed as this params class.
         """
         updated = self.set(self.LABEL_TYPE, value)
         return typing.cast(_UnivariateFeatureSelectorParams, updated)

     def get_label_type(self) -> str:
         """Return the value currently held by :attr:`LABEL_TYPE`."""
         return self.get(self.LABEL_TYPE)

     def set_selection_mode(self, value: str):
         """
         Store ``value`` under :attr:`SELECTION_MODE`.

         :return: the result of ``self.set``, typed as this params class.
         """
         updated = self.set(self.SELECTION_MODE, value)
         return typing.cast(_UnivariateFeatureSelectorParams, updated)

     def get_selection_mode(self) -> str:
         """Return the value currently held by :attr:`SELECTION_MODE`."""
         return self.get(self.SELECTION_MODE)

     def set_selection_threshold(self, value: float):
         """
         Store ``value`` under :attr:`SELECTION_THRESHOLD`.

         The argument is converted with ``float()`` first, so an ``int`` such as ``50`` is
         stored as ``50.0``.

         :return: the result of ``self.set``, typed as this params class.
         """
         threshold = float(value)
         updated = self.set(self.SELECTION_THRESHOLD, threshold)
         return typing.cast(_UnivariateFeatureSelectorParams, updated)

     def get_selection_threshold(self) -> float:
         """Return the value currently held by :attr:`SELECTION_THRESHOLD`."""
         return self.get(self.SELECTION_THRESHOLD)

     @property
     def feature_type(self):
         """Read-only shortcut for :meth:`get_feature_type`."""
         return self.get_feature_type()

     @property
     def label_type(self):
         """Read-only shortcut for :meth:`get_label_type`."""
         return self.get_label_type()

     @property
     def selection_mode(self):
         """Read-only shortcut for :meth:`get_selection_mode`."""
         return self.get_selection_mode()

     @property
     def selection_threshold(self):
         """Read-only shortcut for :meth:`get_selection_threshold`."""
         return self.get_selection_threshold()


 class UnivariateFeatureSelectorModel(JavaFeatureModel, _UnivariateFeatureSelectorModelParams):
     """
     A Model which transforms data using the model data computed
     by :class:`UnivariateFeatureSelector`.
     """

     def __init__(self, java_model=None):
         """
         Build the model wrapper.

         :param java_model: forwarded unchanged to the parent initializer; defaults to
             ``None``.
         """
         super().__init__(java_model)

     @classmethod
     def _java_model_package_name(cls) -> str:
         """Return the package-name fragment of the Java model class."""
         return "univariatefeatureselector"

     @classmethod
     def _java_model_class_name(cls) -> str:
         """Return the simple class name of the Java model."""
         return "UnivariateFeatureSelectorModel"


 class UnivariateFeatureSelector(JavaFeatureEstimator, _UnivariateFeatureSelectorParams):
     """
     An Estimator which selects features based on univariate statistical tests against labels.

     Currently, Flink supports three Univariate Feature Selectors: chi-squared, ANOVA F-test and
     F-value. The selector is chosen by setting `featureType` and `labelType`; Flink picks the
     score function based on that pair.

     Supported combinations of `featureType` and `labelType`:

     <ul>
         <li>`featureType` `categorical` and `labelType` `categorical`: Flink uses chi-squared,
             i.e. chi2 in sklearn.
         <li>`featureType` `continuous` and `labelType` `categorical`: Flink uses ANOVA F-test,
             i.e. f_classif in sklearn.
         <li>`featureType` `continuous` and `labelType` `continuous`: Flink uses F-value,
             i.e. f_regression in sklearn.
     </ul>

     Supported selection modes:

     <ul>
         <li>numTopFeatures: chooses a fixed number of top features according to a hypothesis.
         <li>percentile: similar to numTopFeatures but chooses a fraction of all features
             instead of a fixed number.
         <li>fpr: chooses all features whose p-value are below a threshold, thus controlling
             the false positive rate of selection.
         <li>fdr: uses the <a href="https://en.wikipedia.org/wiki/False_discovery_rate#
             Benjamini.E2.80.93Hochberg_procedure">Benjamini-Hochberg procedure</a> to choose
             all features whose false discovery rate is below a threshold.
         <li>fwe: chooses all features whose p-values are below a threshold. The threshold is
             scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.
     </ul>

     By default, the selection mode is `numTopFeatures`.
     """

     def __init__(self):
         """Build the estimator wrapper; takes no arguments."""
         super().__init__()

     @classmethod
     def _create_model(cls, java_model) -> UnivariateFeatureSelectorModel:
         """Wrap ``java_model`` in a :class:`UnivariateFeatureSelectorModel`."""
         return UnivariateFeatureSelectorModel(java_model)

     @classmethod
     def _java_estimator_package_name(cls) -> str:
         """Return the package-name fragment of the Java estimator class."""
         return "univariatefeatureselector"

     @classmethod
     def _java_estimator_class_name(cls) -> str:
         """Return the simple class name of the Java estimator."""
         return "UnivariateFeatureSelector"
	# Licensed to the Apache Software Foundation (ASF) under one
	# or more contributor license agreements. See the NOTICE file
	# distributed with this work for additional information
	# regarding copyright ownership. The ASF licenses this file
	# to you under the Apache License, Version 2.0 (the
	# "License"); you may not use this file except in compliance
	# with the License. You may obtain a copy of the License at
	#
	# http://www.apache.org/licenses/LICENSE-2.0
	#
	# Unless required by applicable law or agreed to in writing, software
	# distributed under the License is distributed on an "AS IS" BASIS,
	# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
	# See the License for the specific language governing permissions and
	# limitations under the License.
	import typing
	from pyflink.ml.wrapper import JavaWithParams
	from pyflink.ml.param import StringParam, FloatParam
	from pyflink.ml.common.param import HasFeaturesCol, HasLabelCol, HasOutputCol
	from pyflink.ml.feature.common import JavaFeatureModel, JavaFeatureEstimator


	class _UnivariateFeatureSelectorModelParams(
	    JavaWithParams,
	    HasFeaturesCol,
	    HasOutputCol
	):
	    """
	    Params for :class:`UnivariateFeatureSelectorModel`.

	    Combines :class:`JavaWithParams` with the :class:`HasFeaturesCol` and
	    :class:`HasOutputCol` mixins; no further params are declared here.
	    """

	    def __init__(self, java_params):
	        """
	        Forward ``java_params`` to the parent initializers.

	        :param java_params: passed through unchanged to the parent ``__init__``.
	        """
	        super(_UnivariateFeatureSelectorModelParams, self).__init__(java_params)


	class _UnivariateFeatureSelectorParams(HasLabelCol, _UnivariateFeatureSelectorModelParams):
	    """
	    Params for :class:`UnivariateFeatureSelector`.

	    Declares the feature type, label type, selection mode and selection threshold params
	    and exposes a setter, a getter and a read-only property for each.
	    """

	    """
	    Supported options of the feature type.

	    <ul>
	        <li>categorical: the features are categorical data.
	        <li>continuous: the features are continuous data.
	    </ul>
	    """
	    FEATURE_TYPE: StringParam = StringParam(
	        "feature_type",
	        "The feature type.",
	        None)

	    """
	    Supported options of the label type.

	    <ul>
	        <li>categorical: the label is categorical data.
	        <li>continuous: the label is continuous data.
	    </ul>
	    """
	    LABEL_TYPE: StringParam = StringParam(
	        "label_type",
	        "The label type.",
	        None)

	    """
	    Supported options of the feature selection mode.

	    <ul>
	        <li>numTopFeatures: chooses a fixed number of top features according to a hypothesis.
	        <li>percentile: similar to numTopFeatures but chooses a fraction of all features
	            instead of a fixed number.
	        <li>fpr: chooses all features whose p-value are below a threshold, thus controlling the
	            false positive rate of selection.
	        <li>fdr: uses the <a href="https://en.wikipedia.org/wiki/False_discovery_rate#
	            Benjamini.E2.80.93Hochberg_procedure">Benjamini-Hochberg procedure</a> to choose
	            all features whose false discovery rate is below a threshold.
	        <li>fwe: chooses all features whose p-values are below a threshold. The threshold is
	            scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.
	    </ul>
	    """
	    SELECTION_MODE: StringParam = StringParam(
	        "selection_mode",
	        "The feature selection mode.",
	        "numTopFeatures")

	    # Declared with a default of None; the description states the value used at runtime
	    # for each selection mode when nothing is set.
	    SELECTION_THRESHOLD: FloatParam = FloatParam(
	        "selection_threshold",
	        "The upper bound of the features that selector will select. If not set, it will be "
	        "replaced with a meaningful value according to different selection modes at runtime. "
	        "When the mode is numTopFeatures, it will be replaced with 50; when the mode is "
	        "percentile, it will be replaced with 0.1; otherwise, it will be replaced with 0.05.",
	        None)

	    def __init__(self, java_params):
	        """
	        Forward ``java_params`` to the parent initializers.

	        :param java_params: passed through unchanged to the parent ``__init__``.
	        """
	        super(_UnivariateFeatureSelectorParams, self).__init__(java_params)

	    def set_feature_type(self, value: str):
	        """Set :attr:`FEATURE_TYPE` to ``value`` and return the result of ``self.set``."""
	        return typing.cast(_UnivariateFeatureSelectorParams, self.set(self.FEATURE_TYPE, value))

	    def get_feature_type(self) -> str:
	        """Return the current value of :attr:`FEATURE_TYPE`."""
	        return self.get(self.FEATURE_TYPE)

	    def set_label_type(self, value: str):
	        """Set :attr:`LABEL_TYPE` to ``value`` and return the result of ``self.set``."""
	        return typing.cast(_UnivariateFeatureSelectorParams, self.set(self.LABEL_TYPE, value))

	    def get_label_type(self) -> str:
	        """Return the current value of :attr:`LABEL_TYPE`."""
	        return self.get(self.LABEL_TYPE)

	    def set_selection_mode(self, value: str):
	        """Set :attr:`SELECTION_MODE` to ``value`` and return the result of ``self.set``."""
	        return typing.cast(_UnivariateFeatureSelectorParams, self.set(self.SELECTION_MODE, value))

	    def get_selection_mode(self) -> str:
	        """Return the current value of :attr:`SELECTION_MODE`."""
	        return self.get(self.SELECTION_MODE)

	    def set_selection_threshold(self, value: float):
	        """
	        Set :attr:`SELECTION_THRESHOLD` and return the result of ``self.set``.

	        ``value`` is converted with ``float()`` before it is stored.
	        """
	        return typing.cast(_UnivariateFeatureSelectorParams,
	                           self.set(self.SELECTION_THRESHOLD, float(value)))

	    def get_selection_threshold(self) -> float:
	        """Return the current value of :attr:`SELECTION_THRESHOLD`."""
	        return self.get(self.SELECTION_THRESHOLD)

	    @property
	    def feature_type(self):
	        """Property form of :meth:`get_feature_type`."""
	        return self.get_feature_type()

	    @property
	    def label_type(self):
	        """Property form of :meth:`get_label_type`."""
	        return self.get_label_type()

	    @property
	    def selection_mode(self):
	        """Property form of :meth:`get_selection_mode`."""
	        return self.get_selection_mode()

	    @property
	    def selection_threshold(self):
	        """Property form of :meth:`get_selection_threshold`."""
	        return self.get_selection_threshold()


	class UnivariateFeatureSelectorModel(JavaFeatureModel, _UnivariateFeatureSelectorModelParams):
	    """
	    A Model which transforms data using the model data computed
	    by :class:`UnivariateFeatureSelector`.
	    """

	    def __init__(self, java_model=None):
	        """
	        Create the model wrapper.

	        :param java_model: forwarded unchanged to the parent initializer; defaults to
	            ``None``.
	        """
	        super(UnivariateFeatureSelectorModel, self).__init__(java_model)

	    @classmethod
	    def _java_model_package_name(cls) -> str:
	        """Return the package-name fragment of the Java model class."""
	        return "univariatefeatureselector"

	    @classmethod
	    def _java_model_class_name(cls) -> str:
	        """Return the simple class name of the Java model."""
	        return "UnivariateFeatureSelectorModel"


	class UnivariateFeatureSelector(JavaFeatureEstimator, _UnivariateFeatureSelectorParams):
	    """
	    An Estimator which selects features based on univariate statistical tests against labels.

	    Currently, Flink supports three Univariate Feature Selectors: chi-squared, ANOVA F-test and
	    F-value. User can choose Univariate Feature Selector by setting `featureType` and `labelType`,
	    and Flink will pick the score function based on the specified `featureType` and `labelType`.

	    The following combination of `featureType` and `labelType` are supported:

	    <ul>
	        <li>`featureType` `categorical` and `labelType` `categorical`: Flink uses chi-squared,
	            i.e. chi2 in sklearn.
	        <li>`featureType` `continuous` and `labelType` `categorical`: Flink uses ANOVA F-test,
	            i.e. f_classif in sklearn.
	        <li>`featureType` `continuous` and `labelType` `continuous`: Flink uses F-value,
	            i.e. f_regression in sklearn.
	    </ul>

	    The `UnivariateFeatureSelector` supports different selection modes:

	    <ul>
	        <li>numTopFeatures: chooses a fixed number of top features according to a hypothesis.
	        <li>percentile: similar to numTopFeatures but chooses a fraction of all features
	            instead of a fixed number.
	        <li>fpr: chooses all features whose p-value are below a threshold, thus controlling
	            the false positive rate of selection.
	        <li>fdr: uses the <a href="https://en.wikipedia.org/wiki/False_discovery_rate#
	            Benjamini.E2.80.93Hochberg_procedure">Benjamini-Hochberg procedure</a> to choose
	            all features whose false discovery rate is below a threshold.
	        <li>fwe: chooses all features whose p-values are below a threshold. The threshold is
	            scaled by 1/numFeatures, thus controlling the family-wise error rate of selection.
	    </ul>

	    By default, the selection mode is `numTopFeatures`.
	    """

	    def __init__(self):
	        """Create the estimator wrapper; takes no arguments."""
	        super(UnivariateFeatureSelector, self).__init__()

	    @classmethod
	    def _create_model(cls, java_model) -> UnivariateFeatureSelectorModel:
	        """Wrap ``java_model`` in a :class:`UnivariateFeatureSelectorModel`."""
	        return UnivariateFeatureSelectorModel(java_model)

	    @classmethod
	    def _java_estimator_package_name(cls) -> str:
	        """Return the package-name fragment of the Java estimator class."""
	        return "univariatefeatureselector"

	    @classmethod
	    def _java_estimator_class_name(cls) -> str:
	        """Return the simple class name of the Java estimator."""
	        return "UnivariateFeatureSelector"