# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing,
# software distributed under the License is distributed on an
# "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
# KIND, either express or implied. See the License for the
# specific language governing permissions and limitations
# under the License.
from logging import getLogger
from random import choice, randint, random
from db_types import (
Boolean,
Char,
Decimal,
Float,
Int,
TYPES,
Timestamp)
from funcs import (
AnalyticAvg,
AnalyticCount,
AnalyticFirstValue,
AnalyticLag,
AnalyticLastValue,
AnalyticLead,
AnalyticMax,
AnalyticMin,
AnalyticSum,
And,
Coalesce,
Equals,
GreaterThan,
GreaterThanOrEquals,
If,
In,
IsDistinctFrom,
IsNotDistinctFrom,
IsNotDistinctFromOp,
LessThan,
LessThanOrEquals,
NotEquals,
NotIn,
Or,
WindowBoundary)
from random_val_generator import RandomValGenerator
UNBOUNDED_PRECEDING = WindowBoundary.UNBOUNDED_PRECEDING
PRECEDING = WindowBoundary.PRECEDING
CURRENT_ROW = WindowBoundary.CURRENT_ROW
FOLLOWING = WindowBoundary.FOLLOWING
UNBOUNDED_FOLLOWING = WindowBoundary.UNBOUNDED_FOLLOWING
LOG = getLogger()
class DefaultProfile(object):
def __init__(self):
# Bounds are (min, max) values; the actual value used will be selected from within
# the bounds, and each value in the range has an equal probability of being selected.
self._bounds = {
'MAX_NESTED_QUERY_COUNT': (0, 2),
'MAX_NESTED_EXPR_COUNT': (0, 2),
'SELECT_ITEM_COUNT': (1, 5),
'WITH_TABLE_COUNT': (1, 3),
'TABLE_COUNT': (1, 2),
'ANALYTIC_LEAD_LAG_OFFSET': (1, 100),
'ANALYTIC_WINDOW_OFFSET': (1, 100)}
# Below are interdependent weights used to determine probabilities. The probability
# of any item being selected should be (item weight) / sum(weights). A weight of
# zero means the item will never be selected.
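# For example, with the SELECT_ITEM_CATEGORY weights below (AGG: 3, ANALYTIC: 1,
# BASIC: 10), the probability of choosing AGG is 3 / (3 + 1 + 10).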
self._weights = {
'SELECT_ITEM_CATEGORY': {
'AGG': 3,
'ANALYTIC': 1,
'BASIC': 10},
'TYPES': {
Boolean: 1,
Char: 1,
Decimal: 1,
Float: 1,
Int: 10,
Timestamp: 1},
'RELATIONAL_FUNCS': {
# The weights below are "best effort" suggestions. Because QueryGenerator
# prefers to set column types first, and some functions are "supported" only by
# some types, functions can be pruned from this dictionary, and that will shift
# the probabilities. A quick example: if a Char column is chosen, LessThan may not
# have a pre-defined signature for Char comparison, so LessThan shouldn't be
# chosen with Char columns. The tendency to prune will shift as the "funcs"
# module is adjusted to add/remove signatures.
And: 2,
Coalesce: 2,
Equals: 40,
GreaterThan: 2,
GreaterThanOrEquals: 2,
In: 2,
If: 2,
IsDistinctFrom: 2,
IsNotDistinctFrom: 1,
IsNotDistinctFromOp: 1,
LessThan: 2,
LessThanOrEquals: 2,
NotEquals: 2,
NotIn: 2,
Or: 2,
},
'CONJUNCT_DISJUNCTS': {
# And and Or appear both under RELATIONAL_FUNCS and CONJUNCT_DISJUNCTS for the
# following reasons:
# 1. And and Or are considered "relational" by virtue of taking two arguments
# and returning a Boolean. The crude signature selection means they could be
# selected, so we describe weights there.
# 2. They are set here explicitly as well so that
# QueryGenerator._create_bool_func_tree() can create a "more realistic"
# expression that has a Boolean operator at the top of the tree by explicitly
# asking for an And or Or.
# IMPALA-3896 tracks a better way to do this.
And: 5,
Or: 1},
'ANALYTIC_WINDOW': {
('ROWS', UNBOUNDED_PRECEDING, None): 1,
('ROWS', UNBOUNDED_PRECEDING, PRECEDING): 2,
('ROWS', UNBOUNDED_PRECEDING, CURRENT_ROW): 1,
('ROWS', UNBOUNDED_PRECEDING, FOLLOWING): 2,
('ROWS', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 2,
('ROWS', PRECEDING, None): 1,
('ROWS', PRECEDING, PRECEDING): 2,
('ROWS', PRECEDING, CURRENT_ROW): 1,
('ROWS', PRECEDING, FOLLOWING): 2,
('ROWS', PRECEDING, UNBOUNDED_FOLLOWING): 2,
('ROWS', CURRENT_ROW, None): 1,
('ROWS', CURRENT_ROW, CURRENT_ROW): 1,
('ROWS', CURRENT_ROW, FOLLOWING): 2,
('ROWS', CURRENT_ROW, UNBOUNDED_FOLLOWING): 2,
('ROWS', FOLLOWING, FOLLOWING): 2,
('ROWS', FOLLOWING, UNBOUNDED_FOLLOWING): 2,
# Ranges not yet supported
('RANGE', UNBOUNDED_PRECEDING, None): 0,
('RANGE', UNBOUNDED_PRECEDING, PRECEDING): 0,
('RANGE', UNBOUNDED_PRECEDING, CURRENT_ROW): 0,
('RANGE', UNBOUNDED_PRECEDING, FOLLOWING): 0,
('RANGE', UNBOUNDED_PRECEDING, UNBOUNDED_FOLLOWING): 0,
('RANGE', PRECEDING, None): 0,
('RANGE', PRECEDING, PRECEDING): 0,
('RANGE', PRECEDING, CURRENT_ROW): 0,
('RANGE', PRECEDING, FOLLOWING): 0,
('RANGE', PRECEDING, UNBOUNDED_FOLLOWING): 0,
('RANGE', CURRENT_ROW, None): 0,
('RANGE', CURRENT_ROW, CURRENT_ROW): 0,
('RANGE', CURRENT_ROW, FOLLOWING): 0,
('RANGE', CURRENT_ROW, UNBOUNDED_FOLLOWING): 0,
('RANGE', FOLLOWING, FOLLOWING): 0,
('RANGE', FOLLOWING, UNBOUNDED_FOLLOWING): 0},
'JOIN': {
'INNER': 90,
'LEFT': 30,
'RIGHT': 10,
'FULL_OUTER': 3,
'CROSS': 1},
'SUBQUERY_PREDICATE': {
('Exists', 'AGG', 'CORRELATED'): 0, # Not supported
('Exists', 'AGG', 'UNCORRELATED'): 1,
('Exists', 'NON_AGG', 'CORRELATED'): 1,
('Exists', 'NON_AGG', 'UNCORRELATED'): 1,
('NotExists', 'AGG', 'CORRELATED'): 0, # Not supported
('NotExists', 'AGG', 'UNCORRELATED'): 0, # Not supported
('NotExists', 'NON_AGG', 'CORRELATED'): 1,
('NotExists', 'NON_AGG', 'UNCORRELATED'): 0, # Not supported
('In', 'AGG', 'CORRELATED'): 0, # Not supported
('In', 'AGG', 'UNCORRELATED'): 0, # Not supported
('In', 'NON_AGG', 'CORRELATED'): 1,
('In', 'NON_AGG', 'UNCORRELATED'): 1,
('NotIn', 'AGG', 'CORRELATED'): 0, # Not supported
('NotIn', 'AGG', 'UNCORRELATED'): 1,
('NotIn', 'NON_AGG', 'CORRELATED'): 1,
('NotIn', 'NON_AGG', 'UNCORRELATED'): 1,
('Scalar', 'AGG', 'CORRELATED'): 0, # Not supported
('Scalar', 'AGG', 'UNCORRELATED'): 1,
('Scalar', 'NON_AGG', 'CORRELATED'): 0, # Not supported
('Scalar', 'NON_AGG', 'UNCORRELATED'): 1},
'QUERY_EXECUTION': { # Used by the discrepancy searcher
'CREATE_TABLE_AS': 1,
'RAW': 10,
'VIEW': 1}}
# On/off switches
self._flags = {
'ANALYTIC_DESIGNS': {
'TOP_LEVEL_QUERY_WITHOUT_LIMIT': True,
'DETERMINISTIC_ORDER_BY': True,
'NO_ORDER_BY': True,
'ONLY_SELECT_ITEM': True,
'UNBOUNDED_WINDOW': True,
'RANK_FUNC': True}}
# Independent probabilities where 1 means 100%. These values may be ignored depending
# on the context. For example, GROUP_BY is almost always ignored and instead
# determined by the SELECT item weights above, since mixing aggregate and
# non-aggregate items requires the use of a GROUP BY. The GROUP_BY option below is
# only applied if all of the SELECT items are non-aggregate.
self._probabilities = {
'OPTIONAL_QUERY_CLAUSES': {
'WITH': 0.1, # MAX_NESTED_QUERY_COUNT bounds take precedence
'FROM': 1,
'WHERE': 0.5,
'GROUP_BY': 0.1, # special case, doesn't really do much, see comment above
'HAVING': 0.25,
'UNION': 0.1,
'ORDER_BY': 0.1},
'OPTIONAL_ANALYTIC_CLAUSES': {
'PARTITION_BY': 0.5,
'ORDER_BY': 0.5,
'WINDOW': 0.5}, # will only be used if ORDER BY is chosen
'MISC': {
'INLINE_VIEW': 0.1, # MAX_NESTED_QUERY_COUNT bounds take precedence
'SELECT_DISTINCT': 0.1,
'SCALAR_SUBQUERY': 0.1,
'ONLY_USE_EQUALITY_JOIN_PREDICATES': 0.8,
'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE': 0.7,
'UNION_ALL': 0.5}} # Determines whether "ALL" is appended to a UNION; does not decide UNION itself
self.__type_weights = {}
self.constant_generator = RandomValGenerator()
def _get_config_value(self, start_config, *keys):
value = start_config
for key in keys:
value = value[key]
return value
def weights(self, *keys):
'''Convenience method for getting the values of named weights'''
return self._get_config_value(self._weights, *keys)
def bounds(self, *keys):
'''Convenience method for getting the values of named bounds'''
return self._get_config_value(self._bounds, *keys)
def probability(self, *keys):
'''Convenience method for getting the value of named probabilities'''
return self._get_config_value(self._probabilities, *keys)
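# Example usage with the defaults above: self.weights('TYPES', Int) returns 10,
# self.bounds('TABLE_COUNT') returns (1, 2), and
# self.probability('MISC', 'INLINE_VIEW') returns 0.1.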
def _choose_from_bounds(self, *bounds):
'''Returns a value that is within the given bounds. Each value has an equal chance
of being chosen.
'''
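# For example, _choose_from_bounds('SELECT_ITEM_COUNT') returns an integer between
# 1 and 5 inclusive, each with equal probability.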
if isinstance(bounds[0], str):
lower, upper = self.bounds(*bounds)
else:
lower, upper = bounds
return randint(lower, upper)
def _choose_from_weights(self, *weight_args):
'''Returns a value that is selected from the keys of weights with the probability
determined by the values of weights.
'''
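# For example, given weights {And: 5, Or: 1}, And is returned with probability 5/6
# and Or with probability 1/6.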
if isinstance(weight_args[0], str):
weights = self.weights(*weight_args)
else:
weights = weight_args[0]
total_weight = sum(weights.itervalues())
numeric_choice = randint(1, total_weight)
for choice_, weight in weights.iteritems():
if weight <= 0:
continue
if numeric_choice <= weight:
return choice_
numeric_choice -= weight
def _choose_from_filtered_weights(self, filter, *weights):
'''Convenience method; applies the given filter before choosing a value.'''
if isinstance(weights[0], str):
weights = self.weights(*weights)
else:
weights = weights[0]
return self._choose_from_weights(dict(
(choice_, weight) for choice_, weight in weights.iteritems() if filter(choice_)))
def _decide_from_probability(self, *keys):
return random() < self.probability(*keys)
def get_max_nested_query_count(self):
'''Return the maximum number of nested queries the top level query may contain.'''
return self._choose_from_bounds('MAX_NESTED_QUERY_COUNT')
def use_with_clause(self):
return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WITH')
def only_use_equality_join_predicates(self):
return self._decide_from_probability('MISC', 'ONLY_USE_EQUALITY_JOIN_PREDICATES')
def only_use_aggregates_in_having_clause(self):
return self._decide_from_probability('MISC', 'ONLY_USE_AGGREGATES_IN_HAVING_CLAUSE')
def get_with_clause_table_ref_count(self):
'''Return the number of table ref entries a WITH clause should contain.'''
return self._choose_from_bounds('WITH_TABLE_COUNT')
def get_select_item_count(self):
return self._choose_from_bounds('SELECT_ITEM_COUNT')
def choose_nested_expr_count(self):
return self._choose_from_bounds('MAX_NESTED_EXPR_COUNT')
def allowed_analytic_designs(self):
return [design for design, is_enabled in self._flags['ANALYTIC_DESIGNS'].iteritems()
if is_enabled]
def use_partition_by_clause_in_analytic(self):
return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'PARTITION_BY')
def use_order_by_clause_in_analytic(self):
return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'ORDER_BY')
def use_window_in_analytic(self):
return self._decide_from_probability('OPTIONAL_ANALYTIC_CLAUSES', 'WINDOW')
def choose_window_type(self):
return self._choose_from_weights('ANALYTIC_WINDOW')
def get_window_offset(self):
return self._choose_from_bounds('ANALYTIC_WINDOW_OFFSET')
def get_offset_for_analytic_lead_or_lag(self):
return self._choose_from_bounds('ANALYTIC_LEAD_LAG_OFFSET')
def get_table_count(self):
return self._choose_from_bounds('TABLE_COUNT')
def use_inline_view(self):
return self._decide_from_probability('MISC', 'INLINE_VIEW')
def choose_table(self, table_exprs):
return choice(table_exprs)
def choose_join_type(self, join_types):
return self._choose_from_filtered_weights(
lambda join_type: join_type in join_types, 'JOIN')
def choose_join_condition_count(self):
return max(1, self._choose_from_bounds('MAX_NESTED_EXPR_COUNT'))
def use_where_clause(self):
return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'WHERE')
def use_scalar_subquery(self):
return self._decide_from_probability('MISC', 'SCALAR_SUBQUERY')
def choose_subquery_predicate_category(self, func_name, allow_correlated):
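# Chooses a (predicate, agg, correlation) category for a subquery predicate. Unknown
# function names fall back to 'Scalar'. Categories with a zero weight, or that
# require aggregates when AGG select items are disabled, are filtered out; None is
# returned if no category remains.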
weights = self.weights('SUBQUERY_PREDICATE')
func_names = set(name for name, _, _ in weights.iterkeys())
if func_name not in func_names:
func_name = 'Scalar'
allow_agg = self.weights('SELECT_ITEM_CATEGORY').get('AGG', 0)
if allow_correlated and self.bounds('TABLE_COUNT')[1] == 0:
allow_correlated = False
weights = dict(((name, use_agg, use_correlated), weight)
for (name, use_agg, use_correlated), weight in weights.iteritems()
if name == func_name \
and (allow_agg or use_agg == 'NON_AGG') \
and weight)
if weights:
return self._choose_from_weights(weights)
def use_distinct(self):
return self._decide_from_probability('MISC', 'SELECT_DISTINCT')
def use_distinct_in_func(self):
return self._decide_from_probability('MISC', 'SELECT_DISTINCT')
def use_group_by_clause(self):
return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'GROUP_BY')
def use_having_clause(self):
return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'HAVING')
def use_union_clause(self):
return self._decide_from_probability('OPTIONAL_QUERY_CLAUSES', 'UNION')
def use_union_all(self):
return self._decide_from_probability('MISC', 'UNION_ALL')
def get_query_execution(self):
return self._choose_from_weights('QUERY_EXECUTION')
def use_having_without_groupby(self):
return True
def use_nested_with(self):
return True
def use_lateral_join(self):
return False
def use_boolean_expr_for_lateral_join(self):
return False
def get_num_boolean_exprs_for_lateral_join(self):
return False
# Workaround for Hive null ordering differences and the lack of 'NULLS FIRST' and
# 'NULLS LAST' specifications. The ref db will order nulls as specified for ASC
# sorting to make it identical to Hive. Valid return values are 'BEFORE', 'AFTER',
# or 'DEFAULT', where the latter means no specification is needed.
def nulls_order_asc(self):
return 'DEFAULT'
def choose_val_expr(self, val_exprs, types=TYPES):
if not val_exprs:
raise Exception('At least one value is required')
if not types:
raise Exception('At least one type is required')
available_types = set(types) & set(val_exprs.by_type)
if not available_types:
raise Exception('None of the provided values return any of the required types')
val_type = self.choose_type(available_types)
return choice(val_exprs.by_type[val_type])
def choose_constant(self, return_type=None, allow_null=True):
if not return_type:
return_type = self.choose_type()
while True:
val = self.constant_generator.generate_val(return_type)
if val is None and not allow_null:
continue
return return_type(val)
def choose_type(self, types=TYPES):
type_weights = self.weights('TYPES')
weights = dict((type_, type_weights[type_]) for type_ in types)
if not weights:
raise Exception('None of the requested types are enabled')
return self._choose_from_weights(weights)
def choose_conjunct_disjunct_fill_ratio(self):
'''Return the ratio of ANDs and ORs to use in a boolean function tree. For example,
when creating a WHERE condition that consists of 10 nested functions, a ratio of
0.1 means 1 out of the 10 functions in the WHERE clause will be an AND or OR.
'''
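# The product of two independent uniform(0, 1) values skews toward zero (mean 0.25),
# so most generated trees contain relatively few ANDs/ORs.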
return random() * random()
def choose_relational_func_fill_ratio(self):
'''Return the ratio of relational functions to use in a boolean function tree. This
ratio is applied after 'choose_conjunct_disjunct_fill_ratio()'.
'''
return random() * random()
def choose_conjunct_disjunct(self):
return self._choose_from_weights('CONJUNCT_DISJUNCTS')
def choose_relational_func_signature(self, signatures):
'''Return a relational signature chosen from "signatures". A signature is considered
to be relational if it returns a Boolean and accepts more than one argument, none
of which is a subquery.
'''
if not signatures:
raise Exception('At least one signature is required')
filtered_signatures = filter(
lambda s: s.return_type == Boolean \
and len(s.args) > 1 \
and not any(a.is_subquery for a in s.args),
signatures)
if not filtered_signatures:
raise Exception(
'None of the provided signatures corresponded to a relational function')
func_weights = self.weights('RELATIONAL_FUNCS')
missing_funcs = set(s.func for s in filtered_signatures) - set(func_weights)
if missing_funcs:
raise Exception("Weights are missing for functions: {0}".format(missing_funcs))
return self.choose_func_signature(filtered_signatures, func_weights)
def choose_func_signature(self, signatures, _func_weights=None):
'''Return a signature chosen from "signatures".'''
if not signatures:
raise Exception('At least one signature is required')
type_weights = self.weights('TYPES')
func_weights = _func_weights
if func_weights:
distinct_funcs_in_signatures = set([s.func for s in signatures])
pruned_func_weights = {f: func_weights[f] for f in distinct_funcs_in_signatures}
func_weights = pruned_func_weights
else:
# First a function will be chosen then a signature. This is done so that the number
# of signatures a function has doesn't influence its likelihood of being chosen.
# Functions will be weighted based on the weight of the types in their arguments.
# The weights will be normalized by the number of arguments in the signature. The
# weight of a function will be the maximum weight out of all of its signatures.
# If any signature has a type with a weight of zero, the signature will not be used.
#
# Example: type_weights = {Int: 10, Float: 1},
# funcs = [foo(Int), foo(Float), bar(Int, Float)]
#
# max signature length = 2 # from bar(Int, Float)
# weight of foo(Int) = (10 * 2)
# weight of foo(Float) = (1 * 2)
# weight of bar(Int, Float) = ((10 + 1) * 1)
# func_weights = {foo: 20, bar: 11}
#
# Note that this only selects a function; the signature will be selected
# later. This is done to prevent functions with a greater number of signatures from
# being selected more frequently.
func_weights = dict()
# The length of the signature that produced the weight stored in func_weights
signature_length_by_func = dict()
for signature in signatures:
signature_weight = type_weights[signature.return_type]
signature_length = 1
for arg in signature.args:
if arg.is_subquery:
for subtype in arg.type:
signature_weight *= type_weights[subtype]
signature_length += 1
else:
signature_weight *= type_weights[arg.type]
signature_length += 1
if not signature_weight:
continue
if signature.func not in func_weights \
or signature_weight > func_weights[signature.func]:
func_weights[signature.func] = signature_weight
signature_length_by_func[signature.func] = signature_length
if not func_weights:
raise Exception('All functions disallowed based on signature types')
distinct_signature_lengths = set(signature_length_by_func.values())
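# Multiplying each weight by the product of the *other* distinct signature lengths
# keeps the weights integral while making them proportional to
# (weight / own signature length), i.e. it normalizes by signature length without
# using division.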
for func, weight in func_weights.iteritems():
signature_length = signature_length_by_func[func]
func_weights[func] = reduce(
lambda x, y: x * y,
distinct_signature_lengths - set([signature_length]),
func_weights[func])
func = self._choose_from_weights(func_weights)
# Same idea as above but for the signatures of the selected function.
signature_weights = dict()
signature_lengths = dict()
for idx, signature in enumerate(func.signatures()):
if signature not in signatures:
continue
signature_weight = type_weights[signature.return_type]
signature_length = 1
for arg in signature.args:
if arg.is_subquery:
for subtype in arg.type:
signature_weight *= type_weights[subtype]
signature_length += 1
else:
signature_weight *= type_weights[arg.type]
signature_length += 1
if signature_weight:
signature_weights[idx] = signature_weight
signature_lengths[idx] = signature_length
distinct_signature_lengths = set(signature_lengths.values())
for idx, weight in signature_weights.iteritems():
signature_length = signature_lengths[idx]
signature_weights[idx] = reduce(
lambda x, y: x * y,
distinct_signature_lengths - set([signature_length]),
signature_weights[idx])
idx = self._choose_from_weights(signature_weights)
return func.signatures()[idx]
def allow_func_signature(self, signature):
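# A signature is allowed only if its return type and every argument type (including
# the types inside subquery arguments) have a non-zero weight under 'TYPES'.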
weights = self.weights('TYPES')
if not weights[signature.return_type]:
return False
for arg in signature.args:
if arg.is_subquery:
if not all(weights[subtype] for subtype in arg.type):
return False
elif not weights[arg.type]:
return False
return True
def get_allowed_join_signatures(self, signatures):
"""
Returns all the function signatures that are allowed inside a JOIN clause. This
method is mutually exclusive with only_use_equality_join_predicates: the results of
this method are ignored if only_use_equality_join_predicates returns True.
"""
return signatures
def is_non_equality_join_predicate(self, func):
"""
Returns True if the given func is considered a non-equality join condition.
"""
return func in (GreaterThan, GreaterThanOrEquals, In,
IsNotDistinctFrom, IsNotDistinctFromOp, LessThan,
LessThanOrEquals, NotEquals, NotIn)
def get_analytic_funcs_that_cannot_contain_aggs(self):
"""
Returns a collection of analytic functions that should not contain aggregate
functions, or None if there is no such restriction.
"""
return None
class ImpalaNestedTypesProfile(DefaultProfile):
def __init__(self):
super(ImpalaNestedTypesProfile, self).__init__()
self._probabilities['OPTIONAL_QUERY_CLAUSES']['WITH'] = 0.3
self._probabilities['MISC']['INLINE_VIEW'] = 0.3
def use_lateral_join(self):
return random() < 0.5
def use_boolean_expr_for_lateral_join(self):
return random() < 0.2
def get_num_boolean_exprs_for_lateral_join(self):
if random() < 0.8:
return 0
result = 1
while random() < 0.6:
result += 1
return result
def get_table_count(self):
num = 1
while random() < (0.85 ** num):
num += 1
return num
# This profile was added for ad-hoc testing.
class TestFunctionProfile(DefaultProfile):
def choose_func_signature(self, signatures, _func_weights=None):
if not signatures:
raise Exception('At least one signature is required')
preferred_signatures = filter(lambda s: "DistinctFrom" in s.func._NAME, signatures)
if preferred_signatures:
signatures = preferred_signatures
return super(TestFunctionProfile, self).choose_func_signature(signatures, _func_weights)
class HiveProfile(DefaultProfile):
def __init__(self):
super(HiveProfile, self).__init__()
self._probabilities['MISC']['ONLY_USE_EQUALITY_JOIN_PREDICATES'] = 0
def use_having_without_groupby(self):
return False
def use_nested_with(self):
return False
def nulls_order_asc(self):
return 'BEFORE'
def allow_func_signature(self, signature):
if signature.func._NAME.startswith('DateAdd'):
return False
if signature.func._NAME in ('Greatest', 'Least'):
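# Greatest/Least: reject the signature unless every argument type matches the
# return type.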
type = signature.return_type
argtypes = [arg.type for arg in signature.args]
for argtype in argtypes:
if type is None:
type = argtype
continue
else:
if type != argtype:
return False
return DefaultProfile.allow_func_signature(self, signature)
def get_allowed_join_signatures(self, signatures):
"""
Restricts the function signatures inside a JOIN clause to either be an Equals
operator, an And operator, or any operator that only takes in one argument. The reason
is that Hive only supports equi-joins, does not allow OR operators inside a JOIN, and
does not allow any other operator that operates over multiple columns.
The reason ONLY_USE_EQUALITY_JOIN_PREDICATES is not sufficient to guarantee this is
that Hive also needs to restrict the functions used based on the number of
arguments they take.
"""
return [signature for signature in signatures if
signature.func in (Equals, And) or len(signature.args) == 1]
def get_analytic_funcs_that_cannot_contain_aggs(self):
"""
Hive does not support aggregate functions inside the analytic functions AVG, COUNT,
FIRST_VALUE, LAG, LAST_VALUE, LEAD, MAX, MIN, or SUM.
"""
return (AnalyticAvg, AnalyticCount, AnalyticFirstValue, AnalyticLag,
AnalyticLastValue, AnalyticLead, AnalyticMax, AnalyticMin, AnalyticSum)
PROFILES = [var for var in locals().values()
if isinstance(var, type) and var.__name__.endswith('Profile')]