blob: 5f7e66a2c3630afff255f03b6373016682956ca8 [file] [log] [blame]
################################################################################
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
################################################################################
import typing
from pyflink.ml.core.param import IntParam, BooleanParam, StringParam, ParamValidators
from pyflink.ml.core.wrapper import JavaWithParams
from pyflink.ml.lib.feature.common import JavaFeatureTransformer
from pyflink.ml.lib.param import HasInputCol, HasOutputCol
class _RegexTokenizerParams(
    JavaWithParams,
    HasInputCol,
    HasOutputCol
):
    """
    Params for :class:`RegexTokenizer`.

    Exposes typed accessors for the four tokenizer parameters:
    ``min_token_length``, ``gaps``, ``pattern`` and ``to_lowercase``.
    Setters return ``self`` (cast to this class) so calls can be chained.
    """

    # Tokens shorter than this are filtered out; must be >= 0.
    MIN_TOKEN_LENGTH: IntParam = IntParam(
        "min_token_length",
        "Minimum token length",
        1,
        ParamValidators.gt_eq(0)
    )

    # True: the pattern matches the separators (gaps) between tokens.
    # False: the pattern matches the tokens themselves.
    GAPS: BooleanParam = BooleanParam(
        "gaps",
        "Set regex to match gaps or tokens",
        True
    )

    # Regex applied to the input string (default: runs of whitespace).
    PATTERN: StringParam = StringParam(
        "pattern",
        "Regex pattern used for tokenizing",
        "\\s+"
    )

    # Whether the input is lower-cased before tokenization.
    TO_LOWERCASE: BooleanParam = BooleanParam(
        "to_lowercase",
        "Whether to convert all characters to lowercase before tokenizing",
        True
    )

    def __init__(self, java_params):
        super(_RegexTokenizerParams, self).__init__(java_params)

    def set_min_token_length(self, value: int):
        """Sets the minimum token length. Returns self for chaining."""
        return typing.cast(_RegexTokenizerParams, self.set(self.MIN_TOKEN_LENGTH, value))

    def get_min_token_length(self) -> int:
        """Gets the minimum token length."""
        return self.get(self.MIN_TOKEN_LENGTH)

    def set_gaps(self, value: bool):
        """Sets whether the regex matches gaps (True) or tokens (False)."""
        return typing.cast(_RegexTokenizerParams, self.set(self.GAPS, value))

    def get_gaps(self) -> bool:
        """Gets the gaps flag."""
        return self.get(self.GAPS)

    def set_pattern(self, value: str):
        """Sets the tokenizing regex pattern. Returns self for chaining."""
        return typing.cast(_RegexTokenizerParams, self.set(self.PATTERN, value))

    def get_pattern(self) -> str:
        """Gets the tokenizing regex pattern."""
        return self.get(self.PATTERN)

    def set_to_lowercase(self, value: bool):
        """Sets whether the input is lower-cased first. Returns self for chaining."""
        return typing.cast(_RegexTokenizerParams, self.set(self.TO_LOWERCASE, value))

    def get_to_lowercase(self) -> bool:
        """Gets the to_lowercase flag."""
        return self.get(self.TO_LOWERCASE)

    # Backward-compatible alias: the getter was historically misspelled
    # as ``get_to_lowertcase``; keep the old name working for callers.
    get_to_lowertcase = get_to_lowercase

    @property
    def min_token_length(self) -> int:
        return self.get_min_token_length()

    @property
    def gaps(self) -> bool:
        return self.get_gaps()

    @property
    def pattern(self) -> str:
        return self.get_pattern()

    @property
    def to_lowercase(self) -> bool:
        return self.get_to_lowercase()
class RegexTokenizer(JavaFeatureTransformer, _RegexTokenizerParams):
    """
    A Transformer which converts the input string to lowercase and then splits it by white
    spaces based on regex. Two token-extraction modes are supported:

    <ul>
    <li>if "gaps" is true: uses the provided pattern to split the input string.
    <li>else: repeatedly matches the regex (the provided pattern) with the input string.
    </ul>

    Tokens shorter than the configured minimal length are filtered out, and the input can
    optionally be lower-cased first. The output for each input string is a (possibly empty)
    array of strings.
    """

    def __init__(self, java_model=None):
        super().__init__(java_model)

    # Names below locate the backing Java implementation via the wrapper machinery.

    @classmethod
    def _java_transformer_package_name(cls) -> str:
        return "regextokenizer"

    @classmethod
    def _java_transformer_class_name(cls) -> str:
        return "RegexTokenizer"