blob: 5f7e66a2c3630afff255f03b6373016682956ca8 [file] [log] [blame]
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import typing
from pyflink.ml.core.param import IntParam, BooleanParam, StringParam, ParamValidators
from pyflink.ml.core.wrapper import JavaWithParams
from pyflink.ml.lib.feature.common import JavaFeatureTransformer
from pyflink.ml.lib.param import HasInputCol, HasOutputCol
class _RegexTokenizerParams(
    JavaWithParams,
    HasInputCol,
    HasOutputCol
):
    """
    Params for :class:`RegexTokenizer`.

    Exposes typed accessors for the four tokenizer parameters:
    ``min_token_length``, ``gaps``, ``pattern`` and ``to_lowercase``.
    Setters return ``self`` (cast to this class) so calls can be chained.
    """

    # Tokens shorter than this are filtered out; must be >= 0.
    MIN_TOKEN_LENGTH: IntParam = IntParam(
        "min_token_length",
        "Minimum token length",
        1,
        ParamValidators.gt_eq(0)
    )

    # True: the pattern matches the separators (gaps) between tokens.
    # False: the pattern matches the tokens themselves.
    GAPS: BooleanParam = BooleanParam(
        "gaps",
        "Set regex to match gaps or tokens",
        True
    )

    # Regex applied to the input string (default: runs of whitespace).
    PATTERN: StringParam = StringParam(
        "pattern",
        "Regex pattern used for tokenizing",
        "\\s+"
    )

    # Whether the input is lower-cased before tokenization.
    TO_LOWERCASE: BooleanParam = BooleanParam(
        "to_lowercase",
        "Whether to convert all characters to lowercase before tokenizing",
        True
    )

    def __init__(self, java_params):
        super(_RegexTokenizerParams, self).__init__(java_params)

    def set_min_token_length(self, value: int):
        """Sets the minimum token length. Returns self for chaining."""
        return typing.cast(_RegexTokenizerParams, self.set(self.MIN_TOKEN_LENGTH, value))

    def get_min_token_length(self) -> int:
        """Gets the minimum token length."""
        return self.get(self.MIN_TOKEN_LENGTH)

    def set_gaps(self, value: bool):
        """Sets whether the regex matches gaps (True) or tokens (False)."""
        return typing.cast(_RegexTokenizerParams, self.set(self.GAPS, value))

    def get_gaps(self) -> bool:
        """Gets the gaps flag."""
        return self.get(self.GAPS)

    def set_pattern(self, value: str):
        """Sets the tokenizing regex pattern. Returns self for chaining."""
        return typing.cast(_RegexTokenizerParams, self.set(self.PATTERN, value))

    def get_pattern(self) -> str:
        """Gets the tokenizing regex pattern."""
        return self.get(self.PATTERN)

    def set_to_lowercase(self, value: bool):
        """Sets whether the input is lower-cased first. Returns self for chaining."""
        return typing.cast(_RegexTokenizerParams, self.set(self.TO_LOWERCASE, value))

    def get_to_lowercase(self) -> bool:
        """Gets the to_lowercase flag."""
        return self.get(self.TO_LOWERCASE)

    # Backward-compatible alias: the getter was historically misspelled
    # as ``get_to_lowertcase``; keep the old name working for callers.
    get_to_lowertcase = get_to_lowercase

    @property
    def min_token_length(self) -> int:
        return self.get_min_token_length()

    @property
    def gaps(self) -> bool:
        return self.get_gaps()

    @property
    def pattern(self) -> str:
        return self.get_pattern()

    @property
    def to_lowercase(self) -> bool:
        return self.get_to_lowercase()
class RegexTokenizer(JavaFeatureTransformer, _RegexTokenizerParams):
    """
    A Transformer which converts the input string to lowercase and then splits it by white
    spaces based on regex. Two token-extraction modes are supported:

    <ul>
    <li>if "gaps" is true: uses the provided pattern to split the input string.
    <li>else: repeatedly matches the regex (the provided pattern) with the input string.
    </ul>

    Tokens shorter than the configured minimal length are filtered out, and the input can
    optionally be lower-cased first. The output for each input string is a (possibly empty)
    array of strings.
    """

    def __init__(self, java_model=None):
        super().__init__(java_model)

    # Names below locate the backing Java implementation via the wrapper machinery.

    @classmethod
    def _java_transformer_package_name(cls) -> str:
        return "regextokenizer"

    @classmethod
    def _java_transformer_class_name(cls) -> str:
        return "RegexTokenizer"