blob: 556b1d425cc3a20c87edca925650daed271cd51b [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
* Parser file for Pig Parser
*
* NOTE: THIS FILE IS THE BASE FOR A FEW TREE PARSER FILES, such as AstValidator.g,
* SO IF YOU CHANGE THIS FILE, YOU WILL PROBABLY NEED TO MAKE CORRESPONDING CHANGES TO
* THOSE FILES AS WELL.
*/
parser grammar QueryParser;
options {
tokenVocab=QueryLexer;
output=AST;
backtrack=false; // greatly slows down parsing!
}
tokens {
QUERY;
STATEMENT;
FUNC;
FUNC_REF;
FUNC_EVAL;
INVOKE;
INVOKER_FUNC_EVAL;
IN_LHS;
IN_RHS;
CASE_COND;
CASE_EXPR;
CASE_EXPR_LHS;
CASE_EXPR_RHS;
CAST_EXPR;
COL_RANGE;
BIN_EXPR;
TUPLE_VAL;
MAP_VAL;
BAG_VAL;
KEY_VAL_PAIR;
FIELD_DEF;
FIELD_DEF_WITHOUT_IDENTIFIER;
NESTED_CMD_ASSI;
NESTED_CMD;
NESTED_PROJ;
SPLIT_BRANCH;
FOREACH_PLAN_SIMPLE;
FOREACH_PLAN_COMPLEX;
MAP_TYPE;
TUPLE_TYPE;
BAG_TYPE;
NEG;
EXPR_IN_PAREN;
JOIN_ITEM;
TUPLE_TYPE_CAST;
BAG_TYPE_CAST;
PARAMS;
RETURN_VAL;
MACRO_DEF;
MACRO_BODY;
MACRO_INLINE;
NULL;
TRUE;
FALSE;
IDENTIFIER;
ANY;
TOBAG;
TOMAP;
TOTUPLE;
FAT_ARROW;
}
@header {
package org.apache.pig.parser;
import java.util.Set;
import java.util.HashSet;
import java.util.Collections;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.pig.parser.PigMacro;
import com.google.common.collect.ImmutableSet;
import com.google.common.collect.ImmutableMap;
import com.google.common.collect.Lists;
import com.google.common.base.Joiner;
}
@members {
// Commons-logging logger for this parser. Whether debug logging is enabled
// decides between the terse and the verbose message built in getErrorMessage().
// Declared final: it is assigned exactly once and must never be re-pointed.
private static final Log log = LogFactory.getLog( QueryParser.class );
// NOTE(review): no rule or action visible in this grammar reads or writes this set -
// confirm whether anything still depends on it before removing.
private Set<String> memory = new HashSet<String>();
// Recursively duplicates the given node together with its entire subtree.
// Every node is cloned via dupNode(); each cloned child is re-parented to
// the cloned parent before being attached to it.
private static Tree deepCopy(Tree tree) {
    final Tree clone = tree.dupNode();
    final int childTotal = tree.getChildCount();
    int idx = 0;
    while (idx < childTotal) {
        final Tree clonedChild = deepCopy(tree.getChild(idx));
        clonedChild.setParent(clone);
        clone.addChild(clonedChild);
        idx++;
    }
    return clone;
}
/**
 * Never attempts single-token recovery: a mismatched token is always
 * reported by throwing a MismatchedTokenException for the expected type.
 *
 * @param input  the stream positioned at the offending token
 * @param ttype  the token type that was expected
 * @param follow unused
 * @throws RecognitionException always (a MismatchedTokenException)
 */
@Override
protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)
        throws RecognitionException {
    final MismatchedTokenException mismatch = new MismatchedTokenException( ttype, input );
    throw mismatch;
}
/**
 * Never attempts recovery from a mismatched set: the exception that was
 * handed in is thrown back to the caller unchanged.
 *
 * @param input  unused
 * @param e      the recognition error to propagate
 * @param follow unused
 * @throws RecognitionException always; the very same {@code e}
 */
@Override
public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)
        throws RecognitionException {
    final RecognitionException propagated = e;
    throw propagated;
}
/**
 * Builds the text reported for a recognition error.
 *
 * With debug logging enabled the message is prefixed with the rule
 * invocation stack and, for "no viable alternative" errors, carries the
 * decision number, state number and decision description. Otherwise a
 * short "Syntax error" text is produced for "no viable alternative"
 * errors and the superclass message is used for everything else.
 *
 * @param e          the recognition error being reported
 * @param tokenNames token display names, passed through to the superclass
 * @return the message text
 */
@Override
public String getErrorMessage(RecognitionException e, String[] tokenNames ) {
    final boolean noViableAlt = e instanceof NoViableAltException;

    if( log.isDebugEnabled() ) {
        // Verbose form: the invocation stack is collected first, then the detail text.
        List stack = getRuleInvocationStack( e, this.getClass().getName() );
        String detail;
        if( noViableAlt ) {
            NoViableAltException nvae = (NoViableAltException)e;
            detail = " no viable alt; token = " + e.token + " (decision=" + nvae.decisionNumber + " state " + nvae.stateNumber + ")" +
                " decision=<<" + nvae.grammarDecisionDescription + ">>";
        } else {
            detail = super.getErrorMessage( e, tokenNames );
        }
        return stack + " " + detail;
    }

    // Terse form used when debug logging is off.
    if( noViableAlt ) {
        return "Syntax error, unexpected symbol at or near " + getTokenErrorDisplay( e.token );
    }
    return super.getErrorMessage( e, tokenNames );
}
/**
 * Renders a token for use in error messages: the token text wrapped in
 * single quotes.
 *
 * @param t the token to display
 * @return the quoted token text
 */
@Override
public String getTokenErrorDisplay(Token t) {
    final StringBuilder quoted = new StringBuilder( "'" );
    quoted.append( t.getText() );
    quoted.append( "'" );
    return quoted.toString();
}
/**
 * Produces the header that precedes an error message by delegating to
 * QueryParserUtils.generateErrorHeader with this parser's source name.
 *
 * @param ex the recognition error being reported
 * @return the header text
 */
@Override
public String getErrorHeader(RecognitionException ex) {
    final String sourceName = this.getSourceName();
    return QueryParserUtils.generateErrorHeader( ex, sourceName );
}
// Maps the token type of a TOBAG / TOMAP / TOTUPLE function marker to the token type
// of the literal node that replaces the call when all of its arguments are literals
// (looked up in the @after action of cast_expr).
private static final Map<Integer, Integer> FUNC_TO_LITERAL = ImmutableMap.of(
TOBAG, BAG_VAL,
TOMAP, MAP_VAL,
TOTUPLE, TUPLE_VAL);
// Token types which, when found at the root of a parenthesised sub-tree, mark that
// sub-tree as a boolean condition rather than a plain expression (consulted by the
// @after actions of unary_cond and paren_expr). NULL is included because an
// "IS [NOT] NULL" test is rewritten to a NULL-rooted tree; a NULL node without
// children is the literal and is special-cased by the callers.
private static final Set<Integer> BOOLEAN_TOKENS = ImmutableSet.of(
STR_OP_EQ,
STR_OP_NE,
STR_OP_GT,
STR_OP_LT,
STR_OP_GTE,
STR_OP_LTE,
STR_OP_MATCHES,
AND,
OR,
NOT,
NULL,
NUM_OP_EQ,
NUM_OP_NE,
NUM_OP_GT,
NUM_OP_GTE,
NUM_OP_LT,
NUM_OP_LTE);
// Token types treated as literals by the @after action of cast_expr, both when
// deciding whether a TOBAG / TOMAP / TOTUPLE call can be collapsed into a literal
// node and when deciding whether a single parenthesised value is a one-element tuple.
// NOTE(review): the scalar rule also accepts BIGINTEGERNUMBER and BIGDECIMALNUMBER,
// which are not listed here - confirm whether that omission is intentional.
private static final Set<Integer> LITERAL_TOKENS = ImmutableSet.of(
INTEGER,
LONGINTEGER,
FLOATNUMBER,
DOUBLENUMBER,
QUOTEDSTRING,
NULL,
TRUE,
FALSE,
MAP_VAL,
BAG_VAL,
TUPLE_VAL,
PERIOD,
POUND);
} // End of @members
// Catch clause used for the rules of this grammar: a RecognitionException is
// rethrown as-is instead of being handled inside the rule, so the first syntax
// error propagates to the caller (consistent with the recoverFrom* overrides above).
@rulecatch {
catch(RecognitionException re) {
throw re;
}
}
// A whole script: zero or more statements followed by end of input, gathered
// under a single QUERY root node.
query : statement* EOF -> ^( QUERY statement* )
;
// A standalone schema: a comma-separated list of field definitions followed by
// end of input.
schema: field_def_list EOF
;
// STATEMENTS
// One top-level statement. Every alternative except foreach_statement is terminated
// by a semicolon, which is matched but left out of the AST ('!'); a lone semicolon
// is accepted as an empty statement.
statement : SEMI_COLON!
| general_statement SEMI_COLON!
| split_clause SEMI_COLON!
| inline_clause SEMI_COLON!
| import_clause SEMI_COLON!
| realias_clause SEMI_COLON!
| register_clause SEMI_COLON!
| assert_clause SEMI_COLON!
// A semicolon after a foreach with a nested block (foreach_statement using
// foreach_plan_complex) is optional for backwards compatibility. To keep the grammar
// unambiguous, such a semicolon is not consumed by foreach_statement; if present it is
// parsed as a single, standalone semicolon (which matches the first statement alternative).
| foreach_statement
;
// A relational operator wrapped in parentheses, usable wherever a relation is
// expected (see rel). The first alternative simply drops the parentheses from the
// AST. The second handles FOREACH (which is not part of op_clause): it rebuilds a
// FOREACH subtree from whichever plan was matched and emits the optional parallel
// clause as a following sibling.
nested_op_clause : LEFT_PAREN! op_clause parallel_clause? RIGHT_PAREN!
| LEFT_PAREN FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? ) ) RIGHT_PAREN
-> ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause?
;
// An operator statement with an optional alias, producing a STATEMENT node.
// Alternative 1: a FAT_ARROW prefix; the alias child is a synthesized IDENTIFIER
// with the fixed text "____RESERVED____".
// Alternative 2: an optional "alias =" prefix; the alias child is present only if
// the prefix was written.
// In both cases the operator is either a bare op_clause (with optional parallel
// clause) or a parenthesised nested_op_clause.
general_statement : FAT_ARROW ( ( op_clause parallel_clause? ) | nested_op_clause ) -> ^( STATEMENT IDENTIFIER["____RESERVED____"] op_clause? parallel_clause? nested_op_clause? )
| ( identifier_plus EQUAL )? ( ( op_clause parallel_clause? ) | nested_op_clause ) -> ^( STATEMENT identifier_plus? op_clause? parallel_clause? nested_op_clause? )
;
// Statement represented by a foreach operator with a nested block. Simple foreach statement
// is covered by general_statement.
// We need to handle foreach specifically because of the ending ';', which is not required
// if there is a nested block. This is ugly, but it gets the job done.
//
// Both alternatives build ^( STATEMENT alias ^( FOREACH rel plan ) parallel_clause? ).
// As in general_statement, the FAT_ARROW form uses the fixed alias "____RESERVED____"
// while the second form takes the alias from an optional "alias =" prefix. The
// terminating SEMI_COLON is matched here only for the simple (GENERATE ...) plan and
// does not appear in the rewritten tree.
foreach_statement : FAT_ARROW FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? SEMI_COLON ) )
-> ^( STATEMENT IDENTIFIER["____RESERVED____"] ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause? )
| ( identifier_plus EQUAL )? FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? SEMI_COLON ) )
-> ^( STATEMENT identifier_plus? ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause? )
;
// The nested-block form of a foreach plan: "{ nested_blk }". The braces are dropped
// and the block is placed under a FOREACH_PLAN_COMPLEX node.
foreach_plan_complex : LEFT_CURLY nested_blk RIGHT_CURLY -> ^( FOREACH_PLAN_COMPLEX nested_blk )
;
// The inline form of a foreach plan: "GENERATE item, item, ...". The commas are
// dropped and the items become children of GENERATE under a FOREACH_PLAN_SIMPLE node.
foreach_plan_simple : GENERATE flatten_generated_item ( COMMA flatten_generated_item )* -> ^( FOREACH_PLAN_SIMPLE ^( GENERATE flatten_generated_item+ ) )
;
// MACRO grammar
// The raw body of a macro: a brace-delimited block that may contain any tokens,
// with nested brace pairs kept balanced through recursion.
macro_content : LEFT_CURLY ( macro_content | ~(LEFT_CURLY | RIGHT_CURLY) )* RIGHT_CURLY
;
// Formal parameter list of a macro definition: "( name, name, ... )", possibly
// empty. Produces a PARAMS node with one child per name.
macro_param_clause : LEFT_PAREN ( identifier_plus (COMMA identifier_plus)* )? RIGHT_PAREN
-> ^(PARAMS identifier_plus*)
;
// Return declaration of a macro: RETURNS followed by a list of aliases or VOID.
// Produces a RETURN_VAL node holding the aliases; for VOID the node has no children.
macro_return_clause
: RETURNS ((identifier_plus (COMMA identifier_plus)*) | VOID)
-> ^(RETURN_VAL identifier_plus*)
;
// Wraps the macro body into a MACRO_BODY node whose single child is a PigParserNode
// built from the full matched text of macro_content, this parser's source name and
// the first token of the body.
macro_body_clause : macro_content -> ^(MACRO_BODY { new PigParserNode(new CommonToken(1, $macro_content.text), this.getSourceName(), $macro_content.start) } )
;
// The part of a macro definition that follows "DEFINE name" (see define_clause):
// parameters, return declaration and body, gathered under MACRO_DEF.
macro_clause : macro_param_clause macro_return_clause macro_body_clause
-> ^(MACRO_DEF macro_param_clause macro_return_clause macro_body_clause)
;
// Left-hand side of a macro invocation: "alias =", "alias, alias, ... =", or
// nothing at all. Always yields a RETURN_VAL node (childless in the empty case).
inline_return_clause
: identifier_plus EQUAL -> ^(RETURN_VAL identifier_plus)
| identifier_plus (COMMA identifier_plus)+ EQUAL -> ^(RETURN_VAL identifier_plus+)
| -> ^(RETURN_VAL)
;
// A single actual argument of a macro invocation.
parameter
: IDENTIFIER
| INTEGER
| DOUBLENUMBER
| BIGDECIMALNUMBER
| BIGINTEGERNUMBER
| QUOTEDSTRING
| DOLLARVAR
;
// Actual argument list of a macro invocation: "( arg, arg, ... )", possibly empty.
// Produces a PARAMS node with one child per argument.
inline_param_clause : LEFT_PAREN ( parameter (COMMA parameter)* )? RIGHT_PAREN
-> ^(PARAMS parameter*)
;
// A macro invocation. Note that the rewrite puts the macro name first, before the
// return aliases and the arguments.
inline_clause : inline_return_clause identifier_plus inline_param_clause
-> ^(MACRO_INLINE identifier_plus inline_return_clause inline_param_clause)
;
// TYPES
// "Explicit" types start with a type keyword (TUPLE, BAG, MAP or a simple type name);
// "implicit" types are recognised from their delimiters alone: ( ) for tuples,
// { } for bags and [ ] for maps. The *_cast variants are the restricted forms
// accepted inside a cast (see explicit_type_cast).

// The scalar type keywords.
simple_type : BOOLEAN | INT | LONG | FLOAT | DOUBLE | DATETIME | BIGINTEGER | BIGDECIMAL | CHARARRAY | BYTEARRAY
;
// "( field_def, ... )" with an optional field list -> TUPLE_TYPE node.
implicit_tuple_type : LEFT_PAREN field_def_list? RIGHT_PAREN -> ^( TUPLE_TYPE field_def_list? )
;
// "TUPLE ( ... )": the keyword is dropped, giving the same tree as the implicit form.
explicit_tuple_type : TUPLE! implicit_tuple_type
;
// Tuple type as written in a cast: "TUPLE ( cast_type, ... )" -> TUPLE_TYPE_CAST.
explicit_tuple_type_cast : TUPLE LEFT_PAREN ( explicit_type_cast ( COMMA explicit_type_cast )* )? RIGHT_PAREN
-> ^( TUPLE_TYPE_CAST explicit_type_cast* )
;
tuple_type : implicit_tuple_type | explicit_tuple_type
;
// "{ ... }" bag type -> BAG_TYPE. The inner tuple may be prefixed by "name :" or by
// "NULL :"; the NULL prefix is not kept in the tree, a name is.
implicit_bag_type : LEFT_CURLY NULL COLON tuple_type? RIGHT_CURLY -> ^( BAG_TYPE tuple_type? )
| LEFT_CURLY ( ( identifier_plus COLON )? tuple_type )? RIGHT_CURLY -> ^( BAG_TYPE identifier_plus? tuple_type? )
;
// "BAG { ... }": the keyword is dropped, giving the same tree as the implicit form.
explicit_bag_type : BAG! implicit_bag_type
;
// Bag type as written in a cast: "BAG { tuple_cast? }" -> BAG_TYPE_CAST.
explicit_bag_type_cast : BAG LEFT_CURLY explicit_tuple_type_cast? RIGHT_CURLY -> ^( BAG_TYPE_CAST explicit_tuple_type_cast? )
;
// "[ ... ]" map type with an optional, optionally named, value type -> MAP_TYPE.
implicit_map_type : LEFT_BRACKET ( ( identifier_plus COLON )? type )? RIGHT_BRACKET -> ^( MAP_TYPE identifier_plus? type? )
;
// "MAP [ ... ]": the keyword is dropped, giving the same tree as the implicit form.
explicit_map_type : MAP! implicit_map_type
;
map_type : implicit_map_type | explicit_map_type
;
explicit_type : simple_type | explicit_tuple_type | explicit_bag_type | explicit_map_type
;
implicit_type : implicit_tuple_type | implicit_bag_type | implicit_map_type
;
type : explicit_type | implicit_type
;
// The types that may appear between the parentheses of a cast (see after_left_paren).
explicit_type_cast : simple_type | explicit_map_type | explicit_tuple_type_cast | explicit_bag_type_cast
;
// CLAUSES
// In the rules below '^' makes the marked token the root of the rule's subtree and
// '!' matches a token without adding it to the AST.

// IMPORT 'path'
import_clause : IMPORT^ QUOTEDSTRING
;
// REGISTER 'path' [USING name AS name]
register_clause : REGISTER^ QUOTEDSTRING (USING identifier_plus AS identifier_plus)?
;
// DEFINE name followed by a streaming command, a function specification or a macro
// definition.
define_clause : DEFINE^ IDENTIFIER ( cmd | func_clause | macro_clause)
;
// "newAlias = existingAlias" -> ^( REALIAS left right )
realias_clause : identifier_plus EQUAL identifier_plus -> ^(REALIAS identifier_plus identifier_plus)
;
// PARALLEL n
parallel_clause : PARALLEL^ INTEGER
;
// The relational operators usable in a general_statement or inside parentheses
// (nested_op_clause). FOREACH is deliberately absent: it is handled by
// foreach_statement and by the second alternative of nested_op_clause.
op_clause : define_clause
| load_clause
| group_clause
| cube_clause
| store_clause
| filter_clause
| distinct_clause
| limit_clause
| sample_clause
| order_clause
| rank_clause
| cross_clause
| join_clause
| union_clause
| stream_clause
| mr_clause
;
// Options of a streaming command (see cmd). SHIP accepts an empty path list,
// CACHE requires at least one path.
ship_clause : SHIP^ LEFT_PAREN! path_list? RIGHT_PAREN!
;
// Comma-separated quoted paths; the commas are dropped from the AST.
path_list : QUOTEDSTRING ( COMMA QUOTEDSTRING )* -> QUOTEDSTRING+
;
cache_clause : CACHE^ LEFT_PAREN! path_list RIGHT_PAREN!
;
input_clause : INPUT^ LEFT_PAREN! stream_cmd_list RIGHT_PAREN!
;
output_clause : OUTPUT^ LEFT_PAREN! stream_cmd_list RIGHT_PAREN!
;
// STDERROR ( ['path' [LIMIT n]] ); the LIMIT keyword itself is not kept in the tree.
error_clause : STDERROR^ LEFT_PAREN! ( QUOTEDSTRING ( LIMIT! INTEGER )? )? RIGHT_PAREN!
;
// LOAD 'location' [USING func] [AS schema]
load_clause : LOAD^ QUOTEDSTRING ( USING! func_clause )? as_clause?
;
// A function specification: a bare name yields FUNC_REF, a name followed by a
// parenthesised (possibly empty) list of string arguments yields FUNC.
func_clause : func_name
-> ^( FUNC_REF func_name )
| func_name LEFT_PAREN func_args? RIGHT_PAREN
-> ^( FUNC func_name func_args? )
;
// needed for disambiguation when parsing expressions...see below
func_name_without_columns : eid_without_columns ( ( PERIOD | DOLLAR ) eid )*
;
// A function name whose parts are separated by '.' or '$'; each part may be any
// extended identifier (eid), i.e. keywords are allowed.
func_name : eid ( ( PERIOD | DOLLAR ) eid )*
;
func_args_string : QUOTEDSTRING | MULTILINE_QUOTEDSTRING
;
// Comma-separated string arguments; the commas are dropped from the AST.
func_args : func_args_string ( COMMA func_args_string )*
-> func_args_string+
;
// GROUP / COGROUP item, item, ... [USING 'type'] [PARTITION BY func]
group_clause : ( GROUP | COGROUP )^ group_item_list ( USING! QUOTEDSTRING )? partition_clause?
;
group_item_list : group_item ( COMMA group_item )*
-> group_item+
;
// One grouped relation: the relation, then its key (BY ..., ALL or ANY), then an
// optional INNER / OUTER modifier.
group_item : rel ( join_group_by_clause | ALL | ANY ) ( INNER | OUTER )?
;
// "AS" CLAUSES
// An identifier, or one of the whitelisted reserved words. In the second case the
// keyword token is replaced in the AST by an IDENTIFIER token carrying the keyword's
// text (the rewrite applies to that alternative only).
identifier_plus : IDENTIFIER | reserved_identifier_whitelist -> IDENTIFIER[$reserved_identifier_whitelist.text]
;
// A field given either as "name[:type]" (FIELD_DEF) or as an explicit type without
// a name (FIELD_DEF_WITHOUT_IDENTIFIER).
explicit_field_def : identifier_plus ( COLON type )? -> ^( FIELD_DEF identifier_plus type? )
| explicit_type -> ^( FIELD_DEF_WITHOUT_IDENTIFIER explicit_type )
;
// Any field definition; additionally allows an unnamed implicit type such as
// "(...)", "{...}" or "[...]".
field_def : explicit_field_def
| implicit_type -> ^( FIELD_DEF_WITHOUT_IDENTIFIER implicit_type )
;
// Comma-separated field definitions; the commas are dropped from the AST.
field_def_list : field_def ( COMMA! field_def )*
;
// we have two tuple types as implicit_tuple_types can be confused with parentheses around
// a field_def - so to remove this ambiguity we'll decide brackets around a single field_def
// type is *not* a tuple
as_clause : AS^ ( explicit_field_def | ( LEFT_PAREN! field_def_list? RIGHT_PAREN! ) )
;
// OTHERS
// Comma-separated stream specifications; the commas are dropped from the AST.
stream_cmd_list : stream_cmd ( COMMA stream_cmd )* -> stream_cmd+
;
// One stream endpoint (STDIN, STDOUT or a quoted path) with an optional
// "USING func"; the endpoint token becomes the subtree root.
stream_cmd : ( STDIN | STDOUT | QUOTEDSTRING )^ ( USING! func_clause )?
;
// A streaming command followed by any number of SHIP / CACHE / INPUT / OUTPUT /
// STDERROR options in any order.
cmd : EXECCOMMAND^ ( ship_clause | cache_clause | input_clause | output_clause | error_clause )*
;
// Anything that denotes a relation: an alias, the previous-relation marker, or a
// parenthesised operator.
rel : identifier_plus | previous_rel | nested_op_clause
;
previous_rel : ARROBA
;
// STORE rel INTO 'location' [USING func]
store_clause : STORE^ rel INTO! QUOTEDSTRING ( USING! func_clause )?
;
// ASSERT rel BY cond [, 'message']
assert_clause : ASSERT^ rel BY! cond ( COMMA! QUOTEDSTRING )?
;
// FILTER rel BY cond
filter_clause : FILTER^ rel BY! cond
;
// STREAM rel THROUGH (command | alias) [AS schema]
stream_clause : STREAM^ rel THROUGH! ( EXECCOMMAND | identifier_plus ) as_clause?
;
// MAPREDUCE 'jar' [( paths )] store_clause load_clause [command]
mr_clause : MAPREDUCE^ QUOTEDSTRING ( LEFT_PAREN! path_list RIGHT_PAREN! )? store_clause load_clause EXECCOMMAND?
;
// SPLIT rel INTO branch, branch ... - at least two branches are required because
// split_branches always starts with a comma followed by another branch.
split_clause : SPLIT^ rel INTO! split_branch split_branches
;
// "alias IF cond" -> ^( SPLIT_BRANCH alias cond )
split_branch : identifier_plus IF cond -> ^( SPLIT_BRANCH identifier_plus cond )
;
// "alias OTHERWISE [ALL]" -> ^( OTHERWISE alias ALL? )
split_otherwise : identifier_plus OTHERWISE ALL? -> ^( OTHERWISE identifier_plus ALL? )
;
// The remaining branches of a SPLIT, written right-recursively; an OTHERWISE
// branch can only be the last one.
split_branches : COMMA! split_branch split_branches?
| COMMA! split_otherwise
;
// LIMIT rel expr
limit_clause : LIMIT^ rel expr
;
// SAMPLE rel expr
sample_clause : SAMPLE^ rel expr
;
// RANK rel [BY ... [DENSE]]
rank_clause : RANK^ rel ( rank_by_statement )?
;
rank_by_statement : BY^ rank_by_clause DENSE?
;
rank_by_clause : STAR ( ASC | DESC )?
| rank_list
;
// Comma-separated rank columns; the commas are dropped from the AST.
rank_list : rank_col ( COMMA rank_col )*
-> rank_col+
;
rank_col : col_range ( ASC | DESC )?
| col_ref ( ASC | DESC )?
;
// ORDER rel BY ... [USING func]
order_clause : ORDER^ rel BY! order_by_clause ( USING! func_clause )?
;
order_by_clause : STAR ( ASC | DESC )?
| order_col_list
;
// Comma-separated order columns; the commas are dropped from the AST.
order_col_list : order_col ( COMMA order_col )*
-> order_col+
;
// Like rank_col, but a single column (with its direction) may also be wrapped in
// parentheses, which are dropped.
order_col : col_range (ASC | DESC)?
| col_ref ( ASC | DESC )?
| LEFT_PAREN! col_ref ( ASC | DESC )? RIGHT_PAREN!
;
// DISTINCT rel [PARTITION BY func]
distinct_clause : DISTINCT^ rel partition_clause?
;
partition_clause : PARTITION^ BY! func_name
;
// Comma-separated relations; the commas are dropped from the AST.
rel_list : rel ( COMMA rel )* -> rel+
;
// CROSS rel, rel ... [PARTITION BY func]
cross_clause : CROSS^ rel_list partition_clause?
;
// JOIN items [USING 'type'] [PARTITION BY func]
join_clause : JOIN^ join_sub_clause ( USING! join_type )? partition_clause?
;
join_type : QUOTEDSTRING
;
// Either an outer join of exactly two items ("item LEFT|RIGHT|FULL [OUTER], item")
// or a join of two or more comma-separated items.
join_sub_clause : join_item ( ( ( LEFT | RIGHT | FULL ) OUTER? COMMA! join_item ) | ( ( COMMA! join_item )+ ) )
;
// "rel BY keys" -> ^( JOIN_ITEM rel by-subtree )
join_item : rel join_group_by_clause -> ^( JOIN_ITEM rel join_group_by_clause )
;
// The key of a JOIN / GROUP item: BY followed by one argument. A key written as
// (a, b) cannot be told apart from a tuple while parsing, so a single real_arg is
// parsed (possibly a tuple holding several real keys) and, when it turns out to be
// a tuple literal or a TOTUPLE call, the @after action replaces the BY subtree by a
// fresh BY node whose children are the tuple's elements.
join_group_by_clause
@after
{
    Tree byNode = (Tree) retval.getTree();
    Tree keyArg = byNode.getChild(0);
    int keyType = keyArg.getType();

    boolean isTupleLiteral = keyType == TUPLE_VAL;
    boolean isToTupleCall = keyType == FUNC_EVAL && keyArg.getChild(0).getType() == TOTUPLE;

    if (isTupleLiteral || isToTupleCall) {
        // Start over with an empty node of the same type and text as the parsed BY.
        retval.tree = adaptor.create(byNode.getType(), byNode.getText());
        Tree flattened = (Tree) retval.tree;
        int elementCount = keyArg.getChildCount();
        for (int pos = 0; pos < elementCount; pos++) {
            Tree element = keyArg.getChild(pos);
            // the TOTUPLE marker of a function call is not a key - leave it out
            if (element.getType() == TOTUPLE) {
                continue;
            }
            flattened.addChild(element);
        }
        adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
    }
}
    : BY^ real_arg
;
// UNION [ONSCHEMA] rel, rel ...
union_clause : UNION^ ONSCHEMA? rel_list
;
// CUBE rel BY list, list ... - the comma-separated CUBE(...) / ROLLUP(...) lists
// are gathered under a BY node below the CUBE root.
cube_clause : CUBE rel BY cube_rollup_list ( COMMA cube_rollup_list )* -> ^( CUBE rel ^( BY cube_rollup_list+ ) )
;
// CUBE ( arg, ... ) or ROLLUP ( arg, ... ); the keyword becomes the subtree root,
// parentheses and commas are dropped.
cube_rollup_list : ( CUBE | ROLLUP )^ LEFT_PAREN! real_arg ( COMMA! real_arg )* RIGHT_PAREN!
;
// FLATTEN ( expr )
flatten_clause : FLATTEN^ LEFT_PAREN! expr RIGHT_PAREN!
;
// unlike loading and streaming, we want the as_clause (if present) in a different format (i.e.
// we drop the AS token itself).
generate_as_clause : AS! ( ( LEFT_PAREN! field_def_list RIGHT_PAREN! ) | explicit_field_def )
;
// One item of a GENERATE list: a FLATTEN(...) or a plain argument, each with an
// optional AS schema.
flatten_generated_item : flatten_clause generate_as_clause?
| real_arg generate_as_clause?
;
// EXPRESSIONS
// conditional precedence is OR weakest, then AND, then NOT, then IS NOT NULL and the comparison operators equally
// by design the boolean operator hierarchy is entirely below the expression hierarchy

// An argument as used in function calls, GENERATE lists and BY keys: an
// expression, '*', or a column range.
real_arg : expr
| STAR
| col_range
;
// Disjunction; every OR becomes the root of a binary subtree.
cond : and_cond ( OR^ and_cond )*
;
// Conjunction; every AND becomes the root of a binary subtree.
and_cond : not_cond ( AND^ not_cond )*
;
// Optional negation; NOT, when present, becomes the root.
not_cond : NOT^? unary_cond
;
// The atomic condition: an expression followed by an IS [NOT] NULL test, an
// IN ( ... ) list or a relational operator with a second expression; with no
// suffix the expression is wrapped in a BOOL_COND node.
unary_cond
@after
{
    // Step 1 - parenthesised content.
    // Whatever sits between parentheses may be either a "cond" or an "expr",
    // and both are parsed the same way. A bare "expr" used as a condition is
    // wrapped in BOOL_COND by the last alternative of this rule, whereas a
    // "cond" adds no wrapper node. So if this rule produced
    //     BOOL_COND -> EXPR_IN_PAREN -> <node rooted at a boolean token>
    // the parentheses really enclosed a condition, and that inner node is
    // returned directly, without the two wrapper nodes.
    BaseTree built = (BaseTree) retval.getTree();
    int builtType = built.getType();
    if (builtType == BOOL_COND) {
        Tree parenNode = built.getChild(0);
        if (parenNode.getType() == EXPR_IN_PAREN) {
            Tree innerNode = parenNode.getChild(0);
            if (BOOLEAN_TOKENS.contains(innerNode.getType())) {
                retval.tree = innerNode;
                adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
            }
        }
    }

    // Step 2 - IN expressions.
    // The lhs (first child of the IN node) is cloned and inserted in front of
    // every rhs after the first, so that
    //
    //     lhs IN (rhs1, rhs2, rhs3)
    //
    // becomes
    //
    //     ^( IN lhs, rhs1, lhs, rhs2, lhs, rhs3 )
    //
    // with lhs at index 0, 2 and 4. LogicalPlanGenerator.g translates this
    // tree into nested OR expressions and needs a separate LogicalExpression
    // object for each rhs.
    if (builtType == IN) {
        Tree lhsNode = built.getChild(0);
        int insertAt = 2;
        while (insertAt < built.getChildCount()) {
            built.insertChild(insertAt, deepCopy(lhsNode));
            insertAt += 2;
        }
    }
}
: exp1 = expr
( ( IS NOT? NULL -> ^( NULL $exp1 NOT? ) )
| ( IN LEFT_PAREN ( rhs_operand ( COMMA rhs_operand )* ) RIGHT_PAREN -> ^( IN ^( IN_LHS expr ) ^( IN_RHS rhs_operand )+ ) )
| ( rel_op exp2 = expr -> ^( rel_op $exp1 $exp2 ) )
| ( -> ^(BOOL_COND expr) ) )
;
// An expression under its own rule name, used for the right-hand operands of IN
// and of the CASE ... WHEN form so that rewrites can refer to them separately
// from the leading expr.
rhs_operand : expr
;
// Additive level: '+' and '-' each become the root of a binary subtree.
expr : multi_expr ( ( PLUS | MINUS )^ multi_expr )*
;
// Multiplicative level: '*', '/' and the percent operator each become the root of
// a binary subtree; binds tighter than the additive level.
multi_expr : cast_expr ( ( STAR | DIV | PERCENT )^ cast_expr )*
;
// One or more ".part" / "$part" continuations of a function name that started
// with a plain identifier (see the FUNC_EVAL alternatives of cast_expr).
func_name_suffix : ( ( DOLLAR | PERIOD ) eid )+
;
// The operand level of the expression grammar: literals, unary minus, column
// references, function calls, CASE expressions and everything that starts with a
// parenthesis, curly brace or bracket. The @after action then applies several
// independent clean-up steps to the tree that was built.
cast_expr
@after
{
// Note: all checks below test "tree", i.e. the tree as originally built by the
// rule, even if an earlier step has already replaced retval.tree.
BaseTree tree = (BaseTree) retval.getTree();
// the parser does an initial optimisation step: it removes TOTUPLE / TOMAP / TOBAG
// function calls if it knows they'll just return the input (i.e. because the function's
// argument is a literal). We'll do this here by post-processing the result:
if(tree.getType() == FUNC_EVAL) {
Integer func = FUNC_TO_LITERAL.get(tree.getChild(0).getType());
if(func != null) {
boolean canBeOptimised = true;
// child 0 is the function marker, so the arguments start at index 1
for(int arg = 1; arg < tree.getChildCount() && canBeOptimised; ++arg) {
canBeOptimised &= LITERAL_TOKENS.contains(tree.getChild(arg).getType());
}
if(canBeOptimised) {
// new root of the literal type; its text is the numeric token type as a string
retval.tree = adaptor.create(func, func.toString());
((BaseTree)retval.tree).addChildren(tree.getChildren());
((BaseTree)retval.tree).deleteChild(0); // the (e.g.) TOBAG token
adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
}
}
}
// a minor correction to the token text for formatting -
// we want NEG's text to be the same as MINUSes
if(tree.getType() == NEG) {
((CommonTree)tree).token.setText("-");
}
// As noted below, brackets around a single literal mean a tuple
// of that literal, not a nested expression which evaluates to
// that literal. Remember that a NULL with children is a boolean
// expression, not a literal!
if(tree.getType() == EXPR_IN_PAREN
&& LITERAL_TOKENS.contains(tree.getChild(0).getType())
&& (tree.getChild(0).getType() != NULL || tree.getChild(0).getChildCount() == 0)) {
// the node is converted in place by changing the type of its token
((CommonTree)tree).token.setType(TUPLE_VAL);
}
// For CASE statement, we clone the case expression (1st child of the
// returned tree) and insert it before every when expression. For example,
//
// CASE e1
// WHEN e2 THEN e3
// WHEN e4 THEN e5
// ELSE e6
// END
// =>
// ^( CASE e1, e2, e3, e1, e4, e5, e6 )
//
// Note that e1 appears twice at index 0 and 3.
//
// This is needed because in LogicalPlanGenerator.g, we translate this
// tree to nested bincond expressions, and we need to construct a new
// LogicalExpression object per when branch.
if(tree.getType() == CASE_EXPR) {
Tree caseExpr = tree.getChild(0);
int childCount = tree.getChildCount();
// children are: the case expression, one WHEN/THEN pair per branch and an
// optional ELSE value - so an even count means an ELSE is present.
// NOTE(review): the backslash in front of the modulo operator looks like an
// escape for the ANTLR action translator - confirm before touching it.
boolean hasElse = childCount \% 2 == 0;
int whenBranchCount = ( childCount - (hasElse ? 2 : 1) ) / 2;
// the first branch already follows the original case expression; every later
// branch i gets its own copy at index 3*i
for(int i = 1; i < whenBranchCount; i++) {
tree.insertChild(3*i, deepCopy(caseExpr));
}
}
}
: scalar
| MINUS cast_expr -> ^( NEG cast_expr )
// single columns and functions (both of which can start with an identifier). Note that we have to be
// careful with periods straight after the identifier, as we want those to be projections, not function
// calls
| col_ref_without_identifier projection*
| invoker_func projection*
| identifier_plus projection*
| identifier_plus func_name_suffix? LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN projection* -> ^( FUNC_EVAL identifier_plus func_name_suffix? real_arg* ) projection*
| func_name_without_columns LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN projection* -> ^( FUNC_EVAL func_name_without_columns real_arg* ) projection*
// CASE comes in two forms: "CASE WHEN cond THEN expr ..." (CASE_COND, chosen by the
// syntactic predicate when WHEN follows directly) and "CASE expr WHEN value THEN expr ..."
// (CASE_EXPR). In both, an ELSE value ends up as one more expr / rhs_operand child.
| CASE ( (WHEN)=> WHEN cond THEN expr ( WHEN cond THEN expr )* ( ELSE expr )? END projection* -> ^( CASE_COND ^(WHEN cond+) ^(THEN expr+) ) projection*
| expr WHEN rhs_operand THEN rhs_operand ( WHEN rhs_operand THEN rhs_operand )* ( ELSE rhs_operand )? END projection*
-> ^( CASE_EXPR ^(CASE_EXPR_LHS expr) ^(CASE_EXPR_RHS rhs_operand)+ ) projection*
)
| paren_expr
| curly_expr
| bracket_expr
;
// A method invocation introduced by INVOKE. After INVOKE comes either AMPERSAND or a
// parenthesised argument, then a dotted name (zero or more "part." prefixes and a
// final method name), then a parenthesised argument list.
// The result is an INVOKER_FUNC_EVAL node with three synthesized IDENTIFIER children -
// the prefix parts joined with ".", the method name, and the text "true" or "false" -
// followed by every matched real_arg (including the one in the leading parentheses,
// if that form was used).
invoker_func
@init {
// "true" unless the parenthesised-argument form is matched, which sets it to "false"
String staticStr = "true";
// name parts seen before the method name, in source order
List<String> packageStr = Lists.newArrayList();
String methodStr = null;
}
: INVOKE ( AMPERSAND | LEFT_PAREN real_arg { staticStr = "false"; } RIGHT_PAREN ) ( packageName=identifier_plus PERIOD { packageStr.add($packageName.text); } )* methodName=identifier_plus { methodStr=$methodName.text; } LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN
-> ^( INVOKER_FUNC_EVAL IDENTIFIER[Joiner.on(".").join(packageStr)] IDENTIFIER[methodStr] IDENTIFIER[staticStr] real_arg* )
;
// now we have to deal with parentheses: in an expr, '(' can be the
// start of a cast, the start of a nested expression or the start of
// a tuple. We'll ensure parsing is unambiguous by assuming a single
// expression in parentheses is a nested expression, whereas two or
// more nested expressions are a tuple (unless that single expression
// is a literal, in which case we assume tuple with a single element
// - that literal).
//
// The rule itself only consumes the opening parenthesis (which is left out
// of the AST) and delegates the rest to try_implicit_map_cast; the @after
// action then removes BOOL_COND wrapper nodes that do not belong in an
// expression context.
paren_expr
@after
{
    BaseTree tree = (BaseTree)retval.getTree();
    // the other side of the @after block in unary_cond: if we've
    // matched an EXPR_IN_PAREN we expect the nested expression to
    // be an "expr", not a "cond", so we should strip off the
    // BOOL_COND token.
    if(tree.getType() == EXPR_IN_PAREN
            && tree.getChild(0).getType() == BOOL_COND) {
        int type = tree.getChild(0).getChild(0).getType();
        // NULL is a special case - if it has children it's a boolean
        // expression, and if not it's a literal NULL. Note that we
        // replace *all* children
        if(!BOOLEAN_TOKENS.contains(type)
                || (type == NULL && tree.getChild(0).getChild(0).getChildCount() == 0)) {
            Tree addChildrenOf = tree.getChild(0);
            // Delete from the back: removing by ascending index while the
            // child count shrinks would skip every other child and leave
            // some behind whenever there is more than one.
            for(int i = tree.getChildCount() - 1; i >= 0; --i)
                tree.deleteChild(i);
            // re-attach the children of the removed BOOL_COND directly
            for(int i = 0; i < addChildrenOf.getChildCount(); ++i)
                tree.addChild(addChildrenOf.getChild(i));
        }
    }
    // A function call to TOTUPLE is inserted into the AST for
    // some tuple literals - but as we assume the first expression
    // after an open bracket is a "cond" rule, and as "cond" rules
    // nest "expr" rules under a BOOL_COND token we get an invalid
    // AST. We'll remove this BOOL_COND here:
    if(tree.getType() == FUNC_EVAL
            && tree.getChild(0).getType() == TOTUPLE
            && tree.getChildCount() > 1
            && tree.getChild(1).getType() == BOOL_COND) {
        Tree insertChildrenOf = tree.getChild(1);
        tree.deleteChild(1);
        // inserting at the same index in reverse order keeps the
        // children in their original order
        for(int i = insertChildrenOf.getChildCount() - 1; i >= 0; --i)
            tree.insertChild(1, insertChildrenOf.getChild(i));
    }
}
    : LEFT_PAREN! try_implicit_map_cast
;
// What may follow an opening parenthesis in an expression (see paren_expr). The
// syntactic predicate first tries "[...] ) operand", i.e. an implicit map cast,
// and produces a CAST_EXPR for it; anything else is handled by after_left_paren.
try_implicit_map_cast
// we'll also allow implicit map casts (for backwards compatibility only -
// bag and tuple casts have to be explicit and it makes the grammar more
// simple). Unfortunately we'll have to turn on back-tracking for this rule,
// as LEFT_PAREN LEFT_BRACKET could be a literal map in a EXPR_IN_PAREN.
// It'd be much better if we could remove this from the Pig language (and
// just rely on explicit map casts) - then we'd have no backtracking at all!
: ( implicit_map_type RIGHT_PAREN cast_expr) => implicit_map_type RIGHT_PAREN cast_expr -> ^( CAST_EXPR implicit_map_type cast_expr )
| after_left_paren
;
// Everything that can follow '(' apart from an implicit map cast:
//   - an explicit cast "type ) operand"              -> CAST_EXPR
//   - an empty tuple ")"                             -> TUPLE_VAL
//   - a tuple starting with '*' or a column range    -> FUNC_EVAL TOTUPLE ...
//   - a condition / expression followed by
//       ", arg ... )"                                -> FUNC_EVAL TOTUPLE ... (tuple)
//       ")"                                          -> EXPR_IN_PAREN
//       "? expr : expr )"                            -> BIN_EXPR (conditional)
// Trailing projections, where allowed, are emitted as siblings after the node.
after_left_paren : explicit_type_cast RIGHT_PAREN cast_expr -> ^( CAST_EXPR explicit_type_cast cast_expr )
// tuples
| RIGHT_PAREN projection* -> ^( TUPLE_VAL ) projection*
| STAR ( COMMA real_arg )* RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE STAR real_arg* ) projection*
| col_range ( COMMA real_arg )* RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE col_range real_arg* ) projection*
// Tuples begin with '(' expr, but shorthand-booleans begin with '(' cond. As cond
// and expr are indistinguishable, we'll parse as a cond (i.e. the most lenient) and
// for exprs, strip off the BOOL_COND trees. You can have both nested conds and nested
// exprs, so we'll just assume cond.
| cond
( ( ( COMMA real_arg )+ RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE cond real_arg+ ) projection* )
| ( RIGHT_PAREN -> ^( EXPR_IN_PAREN cond ) )
| ( QMARK exp1 = expr COLON exp2 = expr RIGHT_PAREN -> ^( BIN_EXPR cond $exp1 $exp2 ) ) )
;
// "{ arg, ... }" builds a TOBAG function call; "{ }" is the empty bag literal.
// Trailing projections are emitted as siblings after the node.
curly_expr : LEFT_CURLY real_arg ( COMMA real_arg )* RIGHT_CURLY projection* -> ^( FUNC_EVAL TOBAG real_arg+ ) projection*
| LEFT_CURLY RIGHT_CURLY projection* -> ^( BAG_VAL ) projection*
;
// "[ arg, ... ]" builds a TOMAP function call, "[ 'key'#literal, ... ]" a map
// literal and "[ ]" the empty map literal. Trailing projections are emitted as
// siblings after the node.
bracket_expr : LEFT_BRACKET real_arg ( COMMA real_arg )* RIGHT_BRACKET projection* -> ^( FUNC_EVAL TOMAP real_arg+ ) projection*
| LEFT_BRACKET keyvalue ( COMMA keyvalue )* RIGHT_BRACKET projection* -> ^( MAP_VAL keyvalue+ ) projection*
| LEFT_BRACKET RIGHT_BRACKET projection* -> ^( MAP_VAL ) projection*
;
// A postfix applied to an expression: ".col" or ".(col, col, ...)" yields a PERIOD
// node holding the column references; "#'key'" or "#NULL" yields a POUND node
// holding the key.
projection : PERIOD ( col_ref | LEFT_PAREN col_ref ( COMMA col_ref )* RIGHT_PAREN ) -> ^( PERIOD col_ref+ )
| POUND^ ( QUOTEDSTRING | NULL )
;
// ATOMS
// for disambiguation with func_names
col_ref_without_identifier : GROUP | DOLLARVAR
;
// A column reference: the GROUP keyword, a DOLLARVAR token, or an identifier.
col_ref : col_ref_without_identifier | identifier_plus
;
// A column range: "a..b", "a.." (open end) or "..b" (open start) -> COL_RANGE.
// The DOUBLE_PERIOD token is kept in the tree.
col_range : c1 = col_ref DOUBLE_PERIOD c2 = col_ref? -> ^(COL_RANGE $c1 DOUBLE_PERIOD $c2?)
| DOUBLE_PERIOD col_ref -> ^(COL_RANGE DOUBLE_PERIOD col_ref)
;
// Scalar literals.
scalar : INTEGER
| LONGINTEGER
| FLOATNUMBER
| DOUBLENUMBER
| BIGINTEGERNUMBER
| BIGDECIMALNUMBER
| QUOTEDSTRING
| NULL
| TRUE
| FALSE
;
// One entry of a map literal: 'key' # literal -> ^( KEY_VAL_PAIR key literal )
keyvalue : QUOTEDSTRING POUND literal -> ^( KEY_VAL_PAIR QUOTEDSTRING literal )
;
// Pure literal forms of maps, bags and tuples. They nest only other literals
// and are reached through keyvalue; delimiters and commas are dropped.
literal_map : LEFT_BRACKET keyvalue ( COMMA keyvalue )* RIGHT_BRACKET -> ^( MAP_VAL keyvalue+ )
| LEFT_BRACKET RIGHT_BRACKET -> ^( MAP_VAL )
;
literal_bag : LEFT_CURLY literal_tuple ( COMMA literal_tuple )* RIGHT_CURLY -> ^( BAG_VAL literal_tuple+ )
| LEFT_CURLY RIGHT_CURLY -> ^( BAG_VAL )
;
literal_tuple : LEFT_PAREN literal ( COMMA literal )* RIGHT_PAREN -> ^( TUPLE_VAL literal+ )
| LEFT_PAREN RIGHT_PAREN -> ^( TUPLE_VAL )
;
literal : scalar | literal_map | literal_bag | literal_tuple
;
// NESTING
// The body of a foreach block: any number of "command ;" lines followed by a
// mandatory "GENERATE item, ... ;". Semicolons and commas are dropped.
nested_blk : ( nested_command SEMI_COLON )* GENERATE flatten_generated_item ( COMMA flatten_generated_item )* SEMI_COLON
-> nested_command* ^( GENERATE flatten_generated_item+ )
;
// One assignment inside a foreach block. The syntactic predicate selects the
// projection form only when "alias = col.cols" is immediately followed by a
// semicolon; otherwise the right-hand side is an expression (NESTED_CMD_ASSI)
// or a nested operator (NESTED_CMD).
nested_command : ( identifier_plus EQUAL col_ref PERIOD col_ref_list { input.LA( 1 ) == SEMI_COLON }? ) => ( identifier_plus EQUAL nested_proj )
-> ^( NESTED_CMD identifier_plus nested_proj )
| identifier_plus EQUAL expr
-> ^( NESTED_CMD_ASSI identifier_plus expr )
| identifier_plus EQUAL nested_op
-> ^( NESTED_CMD identifier_plus nested_op )
;
// The operators allowed inside a foreach block.
nested_op : nested_filter
| nested_sort
| nested_distinct
| nested_limit
| nested_cross
| nested_foreach
;
// "col.col" or "col.(col, col, ...)" -> ^( NESTED_PROJ col cols... )
nested_proj : col_ref PERIOD col_ref_list
-> ^( NESTED_PROJ col_ref col_ref_list )
;
// A single column or a parenthesised, comma-separated list of columns;
// parentheses and commas are dropped.
col_ref_list : ( col_ref | ( LEFT_PAREN col_ref ( COMMA col_ref )* RIGHT_PAREN ) )
-> col_ref+
;
nested_filter : FILTER^ nested_op_input BY! cond
;
nested_sort : ORDER^ nested_op_input BY! order_by_clause ( USING! func_clause )?
;
nested_distinct : DISTINCT^ nested_op_input
;
// The predicate picks the plain INTEGER alternative when the integer is directly
// followed by a semicolon; anything else is parsed as an expression.
nested_limit : LIMIT^ nested_op_input ( (INTEGER SEMI_COLON) => INTEGER | expr )
;
nested_cross : CROSS^ nested_op_input_list
;
nested_foreach: FOREACH nested_op_input GENERATE flatten_generated_item ( COMMA flatten_generated_item )*
-> ^( FOREACH nested_op_input ^( GENERATE flatten_generated_item+ ) )
;
// Input of a nested operator: a column or a projection of a column.
nested_op_input : col_ref | nested_proj
;
// Comma-separated nested inputs; the commas are dropped from the AST.
nested_op_input_list : nested_op_input ( COMMA nested_op_input )*
-> nested_op_input+
;
// IDENTIFIERS
// extended identifier, handling the keyword and identifier conflicts. Ugly but there is no other choice.
// eid_without_columns lists the keyword tokens accepted as the first part of a
// function name inside an expression (see func_name_without_columns). It leaves
// out IDENTIFIER and GROUP, which expressions treat as column references, as well
// as the other tokens that only eid adds below.
eid_without_columns : rel_str_op
| IMPORT
| REGISTER
| RETURNS
| DEFINE
| LOAD
| FILTER
| FOREACH
| ROLLUP
| ORDER
| DISTINCT
| COGROUP
| JOIN
| CROSS
| UNION
| SPLIT
| INTO
| IF
| ALL
| AS
| BY
| USING
| INNER
| OUTER
| PARALLEL
| PARTITION
| AND
| OR
| GENERATE
| ASC
| DESC
| BOOL
| BIGINTEGER
| BIGDECIMAL
| DATETIME
| CHARARRAY
| BYTEARRAY
| IS
| STREAM
| THROUGH
| STORE
| MAPREDUCE
| SHIP
| CACHE
| INPUT
| OUTPUT
| STDERROR
| STDIN
| STDOUT
| LIMIT
| SAMPLE
| LEFT
| RIGHT
| FULL
| REALIAS
| BOOL_COND
| ASSERT
;
// The full extended identifier: everything in eid_without_columns plus plain
// identifiers and the further keyword tokens listed here. Used for every part of
// func_name and for the continuation parts of a dotted function name.
eid : eid_without_columns
| IDENTIFIER
| GROUP
| CUBE
| TRUE
| FALSE
| INT
| LONG
| FLOAT
| DOUBLE
| NULL
| NOT
| FLATTEN
| BAG
| TUPLE
| MAP
;
// relational operator
// Any comparison operator usable between two expressions in unary_cond: the
// STR_OP_* tokens plus the NUM_OP_* tokens.
rel_op : rel_str_op
| NUM_OP_EQ
| NUM_OP_NE
| NUM_OP_GT
| NUM_OP_GTE
| NUM_OP_LT
| NUM_OP_LTE
;
// The STR_OP_* comparison tokens. They are also accepted as identifiers through
// eid_without_columns.
rel_str_op : STR_OP_EQ
| STR_OP_NE
| STR_OP_GT
| STR_OP_LT
| STR_OP_GTE
| STR_OP_LTE
| STR_OP_MATCHES
;
// Reserved words that identifier_plus accepts in place of an IDENTIFIER.
reserved_identifier_whitelist : RANK
| CUBE
| IN
| WHEN
| THEN
| ELSE
| END
;