 // src/org/apache/pig/parser/QueryParser.g - pig - Git at Google


 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *     http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the License is distributed on an "AS IS" BASIS,
  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */

 /**
  * Parser file for Pig Parser
  *
  * NOTE: THIS FILE IS THE BASE FOR A FEW TREE PARSER FILES, such as AstValidator.g,
  *       SO IF YOU CHANGE THIS FILE, YOU WILL PROBABLY NEED TO MAKE CORRESPONDING CHANGES TO
  *       THOSE FILES AS WELL.
  */

 parser grammar QueryParser;

 // Grammar-wide ANTLR options.
 options {
     // Token types are taken from the separately generated QueryLexer vocabulary.
     tokenVocab=QueryLexer;
     // Rules build an AST (via the ^ / ! operators and -> rewrites) instead of a parse tree.
     output=AST;
     // Global backtracking stays disabled because enabling it greatly slows down parsing;
     // individual rules use explicit syntactic predicates ( ... ) => where needed.
     backtrack=false;
 }

 // Token types declared by the parser. Most of them are used only as node types in the
 // rewrite rules below (e.g. QUERY, STATEMENT, FUNC_EVAL, TUPLE_VAL); a few (e.g. NULL, TRUE,
 // FALSE, IDENTIFIER, FAT_ARROW) are also matched as input tokens by the rules.
 // NOTE(review): presumably the latter are also defined in QueryLexer - confirm against the lexer.
 tokens {
     QUERY;
     STATEMENT;
     FUNC;
     FUNC_REF;
     FUNC_EVAL;
     INVOKE;
     INVOKER_FUNC_EVAL;
     IN_LHS;
     IN_RHS;
     CASE_COND;
     CASE_EXPR;
     CASE_EXPR_LHS;
     CASE_EXPR_RHS;
     CAST_EXPR;
     COL_RANGE;
     BIN_EXPR;
     TUPLE_VAL;
     MAP_VAL;
     BAG_VAL;
     KEY_VAL_PAIR;
     FIELD_DEF;
     FIELD_DEF_WITHOUT_IDENTIFIER;
     NESTED_CMD_ASSI;
     NESTED_CMD;
     NESTED_PROJ;
     SPLIT_BRANCH;
     FOREACH_PLAN_SIMPLE;
     FOREACH_PLAN_COMPLEX;
     MAP_TYPE;
     TUPLE_TYPE;
     BAG_TYPE;
     NEG;
     EXPR_IN_PAREN;
     JOIN_ITEM;
     TUPLE_TYPE_CAST;
     BAG_TYPE_CAST;
     PARAMS;
     RETURN_VAL;
     MACRO_DEF;
     MACRO_BODY;
     MACRO_INLINE;
     NULL;
     TRUE;
     FALSE;
     IDENTIFIER;
     ANY;
     TOBAG;
     TOMAP;
     TOTUPLE;
     FAT_ARROW;
 }

 @header {
 package org.apache.pig.parser;

 import java.util.Set;
 import java.util.HashSet;
 import java.util.Collections;
 import org.apache.commons.logging.Log;
 import org.apache.commons.logging.LogFactory;
 import org.apache.pig.parser.PigMacro;

 import com.google.common.collect.ImmutableSet;
 import com.google.common.collect.ImmutableMap;
 import com.google.common.collect.Lists;
 import com.google.common.base.Joiner;
 }

 @members {
 // Shared logger; its debug level switches getErrorMessage between terse and verbose output.
 private static final Log log = LogFactory.getLog( QueryParser.class );

 // NOTE(review): not referenced by any rule or action visible in this grammar - confirm
 // whether it is still needed before removing it.
 private Set<String> memory = new HashSet<String>();

 // Make a deep copy of the given node
 /**
  * Recursively duplicates a tree: the node itself is cloned with dupNode() and every
  * child subtree is cloned in order and attached to the clone.
  *
  * @param tree root of the subtree to duplicate
  * @return the root of the freshly built copy
  */
 private static Tree deepCopy(Tree tree) {
     final Tree clone = tree.dupNode();
     // The source tree is never modified here, so its child count can be read once.
     final int childCount = tree.getChildCount();
     for (int idx = 0; idx < childCount; idx++) {
         final Tree clonedChild = deepCopy(tree.getChild(idx));
         clonedChild.setParent(clone);
         clone.addChild(clonedChild);
     }
     return clone;
 }

 /**
  * Never attempts recovery from a mismatched token: a MismatchedTokenException built
  * from the expected token type and the input stream is thrown unconditionally.
  *
  * @param input  the stream being parsed
  * @param ttype  the token type that was expected
  * @param follow unused
  * @return never returns normally
  * @throws RecognitionException always
  */
 @Override
 protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow) throws RecognitionException {
     final MismatchedTokenException mismatch = new MismatchedTokenException( ttype, input );
     throw mismatch;
 }

 /**
  * Never attempts recovery from a mismatched set: the exception that was handed in
  * is rethrown as-is.
  *
  * @param input  unused
  * @param e      the recognition error to propagate
  * @param follow unused
  * @return never returns normally
  * @throws RecognitionException always (the given exception)
  */
 @Override
 public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow) throws RecognitionException {
     throw e;
 }

 /**
  * Builds the message for a recognition error.
  *
  * When debug logging is off, a NoViableAltException yields a short
  * "Syntax error, unexpected symbol at or near ..." message and every other error uses
  * the inherited message. When debug logging is on, the message is prefixed with the
  * rule invocation stack, and a NoViableAltException additionally reports its decision
  * number, state number and grammar decision description.
  *
  * @param e          the recognition error
  * @param tokenNames token display names, passed through to the inherited implementation
  * @return the formatted error message
  */
 @Override
 public String getErrorMessage(RecognitionException e, String[] tokenNames ) {
     final boolean noViableAlt = e instanceof NoViableAltException;

     // Terse, user-facing form.
     if( !log.isDebugEnabled() ) {
         return noViableAlt
             ? "Syntax error, unexpected symbol at or near " + getTokenErrorDisplay( e.token )
             : super.getErrorMessage( e, tokenNames );
     }

     // Verbose, debugging form.
     final List<?> invocationStack = getRuleInvocationStack( e, this.getClass().getName() );
     final String detail;
     if( noViableAlt ) {
         final NoViableAltException nvae = (NoViableAltException)e;
         detail = " no viable alt; token = " + e.token
             + " (decision=" + nvae.decisionNumber + " state " + nvae.stateNumber + ")"
             + " decision=<<" + nvae.grammarDecisionDescription + ">>";
     } else {
         detail = super.getErrorMessage( e, tokenNames );
     }
     return invocationStack + " " + detail;
 }

 /**
  * Renders a token for error messages as its text wrapped in single quotes.
  *
  * @param t the token to display
  * @return the quoted token text
  */
 @Override
 public String getTokenErrorDisplay(Token t) {
     final String text = t.getText();
     return "'" + text + "'";
 }

 /**
  * Delegates construction of the error header to QueryParserUtils, passing the error
  * and this parser's source name.
  *
  * @param ex the recognition error
  * @return the header produced by QueryParserUtils.generateErrorHeader
  */
 @Override
 public String getErrorHeader(RecognitionException ex) {
     final String sourceName = this.getSourceName();
     return QueryParserUtils.generateErrorHeader( ex, sourceName );
 }

 // Maps the token type of a TOBAG / TOMAP / TOTUPLE function marker to the token type of the
 // corresponding literal node. Used by cast_expr to replace such a call by a literal when
 // all of its arguments are literals.
 private static final Map<Integer, Integer> FUNC_TO_LITERAL = ImmutableMap.of(
     TOBAG, BAG_VAL,
     TOMAP, MAP_VAL,
     TOTUPLE, TUPLE_VAL);

 // Token types that, at the root of a subtree, mark it as a boolean (cond) expression.
 // Consulted by unary_cond and paren_expr to decide whether a parenthesised tree is a
 // condition or a plain expression. NULL is included because IS [NOT] NULL is rooted at
 // NULL; a NULL node without children is the literal and is special-cased by the callers.
 private static final Set<Integer> BOOLEAN_TOKENS = ImmutableSet.of(
     STR_OP_EQ,
     STR_OP_NE,
     STR_OP_GT,
     STR_OP_LT,
     STR_OP_GTE,
     STR_OP_LTE,
     STR_OP_MATCHES,
     AND,
     OR,
     NOT,
     NULL,
     NUM_OP_EQ,
     NUM_OP_NE,
     NUM_OP_GT,
     NUM_OP_GTE,
     NUM_OP_LT,
     NUM_OP_LTE);

 // Root token types that cast_expr treats as literals, both when deciding whether a
 // TOBAG / TOMAP / TOTUPLE call can be replaced by a literal node and when deciding whether
 // a parenthesised single value is a one-element tuple.
 // NOTE(review): scalar also accepts BIGINTEGERNUMBER and BIGDECIMALNUMBER, which are not
 // listed here - confirm whether that omission is intentional.
 private static final Set<Integer> LITERAL_TOKENS = ImmutableSet.of(
     INTEGER,
     LONGINTEGER,
     FLOATNUMBER,
     DOUBLENUMBER,
     QUOTEDSTRING,
     NULL,
     TRUE,
     FALSE,
     MAP_VAL,
     BAG_VAL,
     TUPLE_VAL,
     PERIOD,
     POUND);

 } // End of @members

 // Catch clause emitted into every rule: recognition errors are rethrown to the caller
 // instead of being handled inside the rule.
 @rulecatch {
 catch(RecognitionException re) {
     throw re;
 }
 }

 // A whole script: any number of statements up to EOF, collected under a single QUERY root.
 // Not referenced by any other rule in this file; presumably invoked directly as a start rule.
 query : statement* EOF -> ^( QUERY statement* )
 ;

 // A standalone schema: a field definition list that must extend to EOF.
 // Not referenced by any other rule in this file; presumably invoked directly as a start rule.
 schema: field_def_list EOF
 ;

 // STATEMENTS

 // One top-level statement. The terminating SEMI_COLON is matched but dropped from the AST ('!').
 // foreach_statement is the only alternative without a trailing semicolon here because it
 // consumes (or omits) its own terminator.
 statement : SEMI_COLON!
           | general_statement SEMI_COLON!
           | split_clause SEMI_COLON!
           | inline_clause SEMI_COLON!
           | import_clause SEMI_COLON!
           | realias_clause SEMI_COLON!
           | register_clause SEMI_COLON!
           | assert_clause SEMI_COLON!
           // semicolons after foreach_complex_statement are optional for backwards compatibility, but to keep
           // the grammar unambiguous if there is one then we'll parse it as a single, standalone semicolon
           // (which matches the first statement rule)
           | foreach_statement
 ;

 // A relational operator wrapped in parentheses.
 // Alt 1: the parentheses are dropped from the AST and op_clause / parallel_clause pass through.
 // Alt 2: a parenthesised FOREACH is rewritten to a FOREACH-rooted subtree holding the relation
 //        and whichever plan (complex or simple) was matched, followed by the optional
 //        parallel_clause as a sibling.
 nested_op_clause : LEFT_PAREN! op_clause parallel_clause? RIGHT_PAREN!
                  | LEFT_PAREN FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? ) ) RIGHT_PAREN
                     -> ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause?
 ;

 // Any operator statement other than the FOREACH forms handled by foreach_statement.
 // Alt 1: introduced by FAT_ARROW instead of an alias; the STATEMENT node receives a synthetic
 //        IDENTIFIER with the text "____RESERVED____" in place of a user-supplied alias.
 // Alt 2: optional "alias =" prefix; the alias (if any) becomes the first child of STATEMENT.
 general_statement : FAT_ARROW ( ( op_clause parallel_clause? ) | nested_op_clause ) -> ^( STATEMENT IDENTIFIER["____RESERVED____"] op_clause? parallel_clause? nested_op_clause? )
                   | ( identifier_plus EQUAL )? ( ( op_clause parallel_clause? ) | nested_op_clause ) -> ^( STATEMENT identifier_plus? op_clause? parallel_clause? nested_op_clause? )
 ;

 // Statement represented by a foreach operator with a nested block. Simple foreach statement
 // is covered by general_statement.
 // We need to handle foreach specifically because of the ending ';', which is not required
 // if there is a nested block. This is ugly, but it gets the job done.
 //
 // Both alternatives accept either a complex plan (no terminator consumed here) or a simple
 // plan with an optional parallel_clause followed by a mandatory SEMI_COLON. The SEMI_COLON is
 // matched but not included in the rewritten tree. As in general_statement, the FAT_ARROW form
 // uses a synthetic IDENTIFIER with the text "____RESERVED____" as the alias.
 foreach_statement : FAT_ARROW FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? SEMI_COLON ) )
     -> ^( STATEMENT IDENTIFIER["____RESERVED____"] ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause? )
                   | ( identifier_plus EQUAL )? FOREACH rel ( foreach_plan_complex | ( foreach_plan_simple parallel_clause? SEMI_COLON ) )
     -> ^( STATEMENT identifier_plus? ^( FOREACH rel foreach_plan_complex? foreach_plan_simple? ) parallel_clause? )
 ;

 // FOREACH body in curly braces; the braces are dropped and the nested block is placed
 // under a FOREACH_PLAN_COMPLEX root.
 foreach_plan_complex : LEFT_CURLY nested_blk RIGHT_CURLY -> ^( FOREACH_PLAN_COMPLEX nested_blk )
 ;

 // FOREACH body consisting of a single GENERATE list; commas are dropped and the items are
 // placed under GENERATE, which in turn is placed under FOREACH_PLAN_SIMPLE.
 foreach_plan_simple : GENERATE flatten_generated_item ( COMMA flatten_generated_item )* -> ^( FOREACH_PLAN_SIMPLE ^( GENERATE flatten_generated_item+ ) )
 ;

 // MACRO grammar

 // Balanced curly-brace block: any tokens other than curly braces, or nested balanced blocks.
 macro_content : LEFT_CURLY ( macro_content | ~(LEFT_CURLY | RIGHT_CURLY) )* RIGHT_CURLY
 ;

 // Parenthesised, possibly empty, comma-separated parameter names -> ^(PARAMS name*).
 macro_param_clause : LEFT_PAREN ( identifier_plus (COMMA identifier_plus)* )? RIGHT_PAREN
     -> ^(PARAMS identifier_plus*)
 ;

 // RETURNS followed by one or more aliases or VOID. VOID is not kept in the tree, so it
 // yields a RETURN_VAL node without children.
 macro_return_clause
     : RETURNS ((identifier_plus (COMMA identifier_plus)*) | VOID)
         -> ^(RETURN_VAL identifier_plus*)
 ;

 // The macro body is matched only as a balanced curly block; its text is wrapped in a
 // PigParserNode built from a CommonToken of type 1, together with this parser's source name
 // and the first token of the body, and placed under MACRO_BODY.
 macro_body_clause : macro_content -> ^(MACRO_BODY { new PigParserNode(new CommonToken(1, $macro_content.text), this.getSourceName(), $macro_content.start) } )
 ;

 // Complete macro definition (the part after DEFINE name): parameters, return clause, body.
 macro_clause : macro_param_clause macro_return_clause macro_body_clause
     -> ^(MACRO_DEF macro_param_clause macro_return_clause macro_body_clause)
 ;

 // Left-hand side of a macro invocation: a single alias, several comma-separated aliases, or
 // nothing at all. EQUAL and COMMA are dropped; the aliases become children of RETURN_VAL.
 inline_return_clause
     : identifier_plus EQUAL -> ^(RETURN_VAL identifier_plus)
 	| identifier_plus (COMMA identifier_plus)+ EQUAL -> ^(RETURN_VAL identifier_plus+)
 	| -> ^(RETURN_VAL)
 ;

 // A single argument accepted in a macro invocation.
 parameter
     : IDENTIFIER
     | INTEGER
     | DOUBLENUMBER
     | BIGDECIMALNUMBER
     | BIGINTEGERNUMBER
     | QUOTEDSTRING
     | DOLLARVAR
 ;

 // Parenthesised, possibly empty, comma-separated argument list -> ^(PARAMS parameter*).
 inline_param_clause : LEFT_PAREN ( parameter (COMMA parameter)* )? RIGHT_PAREN
     -> ^(PARAMS parameter*)
 ;

 // Macro invocation. Note the rewrite puts the macro name first, then the return clause,
 // then the arguments - a different order from the input.
 inline_clause : inline_return_clause identifier_plus inline_param_clause
     -> ^(MACRO_INLINE identifier_plus inline_return_clause inline_param_clause)
 ;

 // TYPES

 // Scalar type keywords.
 simple_type : BOOLEAN | INT | LONG | FLOAT | DOUBLE | DATETIME | BIGINTEGER | BIGDECIMAL | CHARARRAY | BYTEARRAY
 ;

 // Tuple type written without the TUPLE keyword: "( fields? )" -> ^( TUPLE_TYPE fields? ).
 implicit_tuple_type : LEFT_PAREN field_def_list? RIGHT_PAREN -> ^( TUPLE_TYPE field_def_list? )
 ;

 // Tuple type with the TUPLE keyword; the keyword is dropped so the tree matches the implicit form.
 explicit_tuple_type : TUPLE! implicit_tuple_type
 ;

 // Tuple type as used inside a cast: element types only (no field names).
 explicit_tuple_type_cast : TUPLE LEFT_PAREN ( explicit_type_cast ( COMMA explicit_type_cast )* )? RIGHT_PAREN
     -> ^( TUPLE_TYPE_CAST explicit_type_cast* )
 ;

 tuple_type : implicit_tuple_type | explicit_tuple_type
 ;

 // Bag type written without the BAG keyword. Alt 1 accepts the NULL token in the position of
 // the inner tuple's name and drops it; alt 2 accepts an optional "name :" prefix and keeps it.
 implicit_bag_type : LEFT_CURLY NULL COLON tuple_type? RIGHT_CURLY -> ^( BAG_TYPE tuple_type? )
                   | LEFT_CURLY ( ( identifier_plus COLON )? tuple_type )? RIGHT_CURLY -> ^( BAG_TYPE identifier_plus? tuple_type? )
 ;

 // Bag type with the BAG keyword; the keyword is dropped.
 explicit_bag_type : BAG! implicit_bag_type
 ;

 // Bag type as used inside a cast: optionally one tuple cast type.
 explicit_bag_type_cast : BAG LEFT_CURLY explicit_tuple_type_cast? RIGHT_CURLY -> ^( BAG_TYPE_CAST explicit_tuple_type_cast? )
 ;

 // Map type written without the MAP keyword: "[ (name :)? type ]" with everything optional.
 implicit_map_type : LEFT_BRACKET ( ( identifier_plus COLON )? type )? RIGHT_BRACKET -> ^( MAP_TYPE identifier_plus? type? )
 ;

 // Map type with the MAP keyword; the keyword is dropped.
 explicit_map_type : MAP! implicit_map_type
 ;

 map_type : implicit_map_type | explicit_map_type
 ;

 // Types that start with a type keyword.
 explicit_type : simple_type | explicit_tuple_type | explicit_bag_type | explicit_map_type
 ;

 // Types that start with a bracket character instead of a keyword.
 implicit_type : implicit_tuple_type | implicit_bag_type | implicit_map_type
 ;

 type : explicit_type | implicit_type
 ;

 // Types permitted in a parenthesised cast; tuple and bag casts must use their keyword forms.
 explicit_type_cast : simple_type | explicit_map_type | explicit_tuple_type_cast | explicit_bag_type_cast
 ;

 // CLAUSES

 // In the rules below '^' makes the marked token the root of the rule's subtree.

 // IMPORT 'path'
 import_clause : IMPORT^ QUOTEDSTRING
 ;

 // REGISTER 'path' with an optional "USING x AS y" suffix; USING and AS stay in the tree as children.
 register_clause : REGISTER^ QUOTEDSTRING (USING identifier_plus AS identifier_plus)?
 ;

 // DEFINE name followed by a streaming command, a function reference or a macro definition.
 define_clause : DEFINE^ IDENTIFIER ( cmd | func_clause | macro_clause)
 ;

 // "alias = alias", rewritten under a REALIAS root with EQUAL dropped.
 realias_clause : identifier_plus EQUAL identifier_plus -> ^(REALIAS identifier_plus identifier_plus)
 ;

 // PARALLEL n
 parallel_clause : PARALLEL^ INTEGER
 ;

 // The relational operators usable in a general statement. FOREACH, SPLIT and ASSERT are not
 // listed here; they are reached through foreach_statement / nested_op_clause, split_clause
 // and assert_clause respectively.
 op_clause : define_clause
           | load_clause
           | group_clause
           | cube_clause
           | store_clause
           | filter_clause
           | distinct_clause
           | limit_clause
           | sample_clause
           | order_clause
           | rank_clause
           | cross_clause
           | join_clause
           | union_clause
           | stream_clause
           | mr_clause
 ;

 // Sub-clauses of a streaming command (see cmd). In each, the keyword becomes the root and
 // the parentheses are dropped.

 // SHIP( paths? ) - the path list may be empty.
 ship_clause : SHIP^ LEFT_PAREN! path_list? RIGHT_PAREN!
 ;

 // Comma-separated quoted paths; the commas are dropped.
 path_list : QUOTEDSTRING ( COMMA QUOTEDSTRING )* -> QUOTEDSTRING+
 ;

 // CACHE( paths ) - unlike SHIP, the path list is mandatory.
 cache_clause : CACHE^ LEFT_PAREN! path_list RIGHT_PAREN!
 ;

 input_clause : INPUT^ LEFT_PAREN! stream_cmd_list RIGHT_PAREN!
 ;

 output_clause : OUTPUT^ LEFT_PAREN! stream_cmd_list RIGHT_PAREN!
 ;

 // STDERROR( ('path' (LIMIT n)?)? ) - the LIMIT keyword is dropped, the integer is kept.
 error_clause : STDERROR^ LEFT_PAREN! ( QUOTEDSTRING ( LIMIT! INTEGER )? )? RIGHT_PAREN!
 ;

 // LOAD 'path' (USING func)? (AS schema)? - USING is dropped, the function clause is kept.
 load_clause : LOAD^ QUOTEDSTRING ( USING! func_clause )? as_clause?
 ;

 // Reference to a function: a bare name yields FUNC_REF, a name with a (possibly empty)
 // parenthesised list of string arguments yields FUNC.
 func_clause : func_name
            -> ^( FUNC_REF func_name )
             | func_name LEFT_PAREN func_args? RIGHT_PAREN
            -> ^( FUNC func_name func_args? )
 ;

 // needed for disambiguation when parsing expressions...see below
 func_name_without_columns : eid_without_columns ( ( PERIOD | DOLLAR ) eid )*
 ;

 // Possibly qualified function name: extended identifiers joined by PERIOD or DOLLAR.
 func_name : eid ( ( PERIOD | DOLLAR ) eid )*
 ;

 func_args_string : QUOTEDSTRING | MULTILINE_QUOTEDSTRING
 ;

 // Comma-separated string arguments; the commas are dropped.
 func_args : func_args_string ( COMMA func_args_string )*
          -> func_args_string+
 ;

 // GROUP / COGROUP items (USING 'type')? (PARTITION BY ...)? - the keyword matched becomes the root.
 group_clause : ( GROUP | COGROUP )^ group_item_list ( USING! QUOTEDSTRING )? partition_clause?
 ;

 // Comma-separated group items; the commas are dropped.
 group_item_list : group_item ( COMMA group_item )*
                -> group_item+
 ;

 // A relation with its grouping key (a BY clause, ALL or ANY) and an optional INNER / OUTER marker.
 group_item : rel ( join_group_by_clause | ALL | ANY ) ( INNER | OUTER )?
 ;

 // "AS" CLAUSES

 // An identifier, or one of the whitelisted reserved words; the latter is re-created as an
 // IDENTIFIER node carrying the keyword's text so later stages see a plain identifier.
 identifier_plus : IDENTIFIER | reserved_identifier_whitelist -> IDENTIFIER[$reserved_identifier_whitelist.text]
 ;

 // A field definition that starts with a name (optionally typed) or with a type keyword.
 explicit_field_def : identifier_plus ( COLON type )? -> ^( FIELD_DEF identifier_plus type? )
                    | explicit_type -> ^( FIELD_DEF_WITHOUT_IDENTIFIER explicit_type )
 ;

 // Any field definition, including an unnamed one given only by a bracketed (implicit) type.
 field_def : explicit_field_def
           | implicit_type -> ^( FIELD_DEF_WITHOUT_IDENTIFIER implicit_type )
 ;

 // Comma-separated field definitions; the commas are dropped.
 field_def_list : field_def ( COMMA! field_def )*
 ;

 // we have two tuple types as implicit_tuple_types can be confused with parentheses around
 // a field_def - so to remove this ambiguity we'll decide brackets around a single field_def
 // type is *not* a tuple
 as_clause : AS^ ( explicit_field_def | ( LEFT_PAREN! field_def_list? RIGHT_PAREN! ) )
 ;

 // OTHERS

 // Comma-separated stream commands; the commas are dropped.
 stream_cmd_list : stream_cmd ( COMMA stream_cmd )* -> stream_cmd+
 ;

 // STDIN, STDOUT or a quoted path as root, with an optional function clause (USING is dropped).
 stream_cmd : ( STDIN | STDOUT | QUOTEDSTRING )^ ( USING! func_clause )?
 ;

 // A streaming command: the EXECCOMMAND token as root, followed by any number of sub-clauses in any order.
 cmd : EXECCOMMAND^ ( ship_clause | cache_clause | input_clause | output_clause | error_clause )*
 ;

 // A relation operand: an alias, the previous-relation marker, or a parenthesised operator.
 rel : identifier_plus | previous_rel | nested_op_clause
 ;

 previous_rel : ARROBA
 ;

 // STORE rel INTO 'path' (USING func)?
 store_clause : STORE^ rel INTO! QUOTEDSTRING ( USING! func_clause )?
 ;

 // ASSERT rel BY cond (, 'message')?
 assert_clause : ASSERT^ rel BY! cond ( COMMA! QUOTEDSTRING )?
 ;

 // FILTER rel BY cond
 filter_clause : FILTER^ rel BY! cond
 ;

 // STREAM rel THROUGH (command | alias) (AS schema)?
 stream_clause : STREAM^ rel THROUGH! ( EXECCOMMAND | identifier_plus ) as_clause?
 ;

 // MAPREDUCE 'jar' ( (paths) )? store_clause load_clause command?
 mr_clause : MAPREDUCE^ QUOTEDSTRING ( LEFT_PAREN! path_list RIGHT_PAREN! )? store_clause load_clause EXECCOMMAND?
 ;

 // SPLIT rel INTO branch, more-branches. Because split_branches is mandatory and always starts
 // with a COMMA, at least two targets are required.
 split_clause : SPLIT^ rel INTO! split_branch split_branches
 ;

 // "alias IF cond" -> ^( SPLIT_BRANCH alias cond )
 split_branch : identifier_plus IF cond -> ^( SPLIT_BRANCH identifier_plus cond )
 ;

 // "alias OTHERWISE ALL?" -> ^( OTHERWISE alias ALL? )
 split_otherwise : identifier_plus OTHERWISE ALL? -> ^( OTHERWISE identifier_plus ALL? )
 ;

 // Right-recursive list of further branches. An OTHERWISE branch has no continuation, so it
 // can only appear last.
 split_branches : COMMA! split_branch split_branches?
                | COMMA! split_otherwise
 ;

 // LIMIT rel expr
 limit_clause : LIMIT^ rel expr
 ;

 // SAMPLE rel expr
 sample_clause : SAMPLE^ rel expr
 ;

 // RANK rel (BY ...)?
 rank_clause : RANK^ rel ( rank_by_statement )?
 ;

 // BY columns DENSE? - BY becomes the root of this subtree.
 rank_by_statement : BY^ rank_by_clause DENSE?
 ;

 // Either "*" with an optional direction, or an explicit column list.
 rank_by_clause : STAR ( ASC | DESC )?
                | rank_list
 ;

 // Comma-separated rank columns; the commas are dropped.
 rank_list : rank_col ( COMMA rank_col )*
          -> rank_col+
 ;

 // A column range or a single column, each with an optional direction.
 rank_col : col_range ( ASC | DESC )?
          | col_ref ( ASC | DESC )?
 ;

 // ORDER rel BY columns (USING func)?
 order_clause : ORDER^ rel BY! order_by_clause ( USING! func_clause )?
 ;

 // Either "*" with an optional direction, or an explicit column list.
 order_by_clause : STAR ( ASC | DESC )?
                 | order_col_list
 ;

 // Comma-separated order columns; the commas are dropped.
 order_col_list : order_col ( COMMA order_col )*
               -> order_col+
 ;

 // A column range or a single column with an optional direction; a single column (with its
 // direction) may also be wrapped in parentheses, which are dropped.
 order_col : col_range (ASC | DESC)?
           | col_ref ( ASC | DESC )?
           | LEFT_PAREN! col_ref ( ASC | DESC )? RIGHT_PAREN!
 ;

 // DISTINCT rel (PARTITION BY ...)?
 distinct_clause : DISTINCT^ rel partition_clause?
 ;

 // PARTITION BY func_name - BY is dropped.
 partition_clause : PARTITION^ BY! func_name
 ;

 // Comma-separated relations; the commas are dropped.
 rel_list : rel ( COMMA rel )* -> rel+
 ;

 // CROSS rel, rel, ... (PARTITION BY ...)?
 cross_clause : CROSS^ rel_list partition_clause?
 ;


 // JOIN items (USING 'type')? (PARTITION BY ...)?
 join_clause : JOIN^ join_sub_clause ( USING! join_type )? partition_clause?
 ;

 join_type : QUOTEDSTRING
 ;

 // Either exactly two items with a LEFT / RIGHT / FULL (OUTER)? marker between them, or two
 // or more comma-separated items. The outer-join keywords stay in the tree; the commas do not.
 join_sub_clause : join_item ( ( ( LEFT | RIGHT | FULL ) OUTER? COMMA! join_item ) | ( ( COMMA! join_item )+ ) )
 ;

 // A relation with its join key -> ^( JOIN_ITEM rel by-clause )
 join_item : rel join_group_by_clause -> ^( JOIN_ITEM  rel join_group_by_clause )
 ;

 // this can either be a single arg or something like (a,b) - which is
 // indistinguishable from a tuple. We'll therefore parse a single argument
 // (which can be a tuple of several real_args) and expand it:
 join_group_by_clause
     @after
     {
         // The rule yields BY with a single child. If that child is a tuple literal or a
         // TOTUPLE function call, rebuild the BY node with the tuple's elements as direct
         // children (skipping the TOTUPLE marker itself).
         Tree by = (Tree) retval.getTree();
         Tree realArg = by.getChild(0);
         if(realArg.getType() == TUPLE_VAL
         || (realArg.getType() == FUNC_EVAL && realArg.getChild(0).getType() == TOTUPLE)) {
             retval.tree = adaptor.create(by.getType(), by.getText());
             for(int i = 0; i < realArg.getChildCount(); ++i) {
                 if(realArg.getChild(i).getType()!=TOTUPLE)
                 ((Tree)retval.tree).addChild(realArg.getChild(i));
             }
             // the replacement node is new, so its token range has to be set again
             adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
         }
     }
                      : BY^ real_arg
 ;

 // UNION ONSCHEMA? rel, rel, ...
 union_clause : UNION^ ONSCHEMA? rel_list
 ;

 // CUBE rel BY list, list, ... -> ^( CUBE rel ^( BY list+ ) )
 cube_clause : CUBE rel BY cube_rollup_list ( COMMA cube_rollup_list )* -> ^( CUBE rel ^( BY cube_rollup_list+ ) )
 ;

 // CUBE( args ) or ROLLUP( args ) - the keyword becomes the root; parentheses and commas are dropped.
 cube_rollup_list : ( CUBE | ROLLUP )^ LEFT_PAREN! real_arg ( COMMA! real_arg )* RIGHT_PAREN!
 ;

 // FLATTEN( expr )
 flatten_clause : FLATTEN^ LEFT_PAREN! expr RIGHT_PAREN!
 ;

 // unlike loading and streaming, we want the as_clause (if present) in a different format (i.e.
 // we drop the AS token itself).
 generate_as_clause :  AS! ( ( LEFT_PAREN! field_def_list RIGHT_PAREN! ) | explicit_field_def )
 ;

 // One item of a GENERATE list: a FLATTEN call or any real_arg, each with an optional AS clause.
 flatten_generated_item : flatten_clause generate_as_clause?
                        | real_arg generate_as_clause?
 ;

 // EXPRESSIONS

 // conditional precedence is OR weakest, then AND, then NOT, then IS NOT NULL and the comparison operators equally
 // by design the boolean operator hierarchy is entirely below the expression hierarchy

 // An argument: an expression, "*", or a column range.
 real_arg : expr
          | STAR
          | col_range
 ;

 // Disjunction; OR becomes the root of each pair, so chains nest to the left.
 cond : and_cond  ( OR^ and_cond )*
 ;

 // Conjunction; AND becomes the root of each pair.
 and_cond : not_cond ( AND^ not_cond )*
 ;

 // Optional negation; when present NOT becomes the root.
 not_cond : NOT^? unary_cond
 ;

 // A single condition: IS [NOT] NULL test, IN list, binary comparison, or a bare expression
 // used as a boolean (wrapped in BOOL_COND). The @after action post-processes two of these.
 unary_cond
     @after
     {
         // Expressions in parentheses are a little tricky to match as
         // they could contain either "cond" rules or "expr" rules. If
         // they are "expr" rules then they're put under a BOOL_COND node
         // in the tree, but "cond" rules put no extra tokens in the tree.
         // As we're matching non-recursively we'll parse whatever's in the
         // brackets, and if the AST has a boolean expression at its root
         // then we'll assume we've just got a "cond" expression in
         // brackets, and otherwise we'll assume its an "expr" (and so
         // we'll have to strip off the BOOL_COND token the "cast_expr"
         // rule added)
         //
         // Concretely: when this rule produced BOOL_COND over EXPR_IN_PAREN over a tree whose
         // root is one of BOOLEAN_TOKENS, the result is replaced by that inner tree.
         BaseTree tree = (BaseTree) retval.getTree();
         if(tree.getType() == BOOL_COND
         && tree.getChild(0).getType() == EXPR_IN_PAREN
         && BOOLEAN_TOKENS.contains(tree.getChild(0).getChild(0).getType())) {
             retval.tree = tree.getChild(0).getChild(0);
             adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
         }

         // For IN expression, we clone the lhs expression (1st child of the
         // returned tree) and insert it before every rhs expression. For example,
         //
         //   lhs IN (rhs1, rhs2, rhs3)
         // =>
         //   ^( IN lhs, rhs1, lhs, rhs2, lhs, rhs3 )
         //
         // Note that lhs appears three times at index 0, 2 and 4.
         //
         // This is needed because in LogicalPlanGenerator.g, we translate this
         // tree to nested or expressions, and we need to construct a new
         // LogicalExpression object per rhs expression.
         if(tree.getType() == IN) {
             Tree lhs = tree.getChild(0);
             // the child count grows with every insertion, so the bound is re-read each pass
             for(int i = 2; i < tree.getChildCount(); i = i + 2) {
                 tree.insertChild(i, deepCopy(lhs));
             }
         }
     }
     : exp1 = expr
         ( ( IS NOT? NULL -> ^( NULL $exp1 NOT? ) )
         | ( IN LEFT_PAREN ( rhs_operand ( COMMA rhs_operand )* ) RIGHT_PAREN -> ^( IN ^( IN_LHS expr ) ^( IN_RHS rhs_operand )+ ) )
         | ( rel_op exp2 = expr -> ^( rel_op $exp1 $exp2 ) )
         | ( -> ^(BOOL_COND expr) ) )
 ;

 // An expression under a separate rule name, so rewrites that also reference expr (IN, CASE)
 // can tell the two operand groups apart.
 rhs_operand : expr
 ;

 // Additive level; the operator becomes the root of each pair.
 expr : multi_expr ( ( PLUS | MINUS )^ multi_expr )*
 ;

 // Multiplicative level; the operator becomes the root of each pair.
 multi_expr : cast_expr ( ( STAR | DIV | PERCENT )^ cast_expr )*
 ;

 // One or more DOLLAR- or PERIOD-separated name parts following the first part of a function name.
 func_name_suffix : ( ( DOLLAR | PERIOD ) eid )+
 ;

 // Operand level of the expression grammar: scalars, unary minus, column references, function
 // calls, CASE expressions and the three bracketed forms. The @after action applies four
 // independent fix-ups to the tree produced by whichever alternative matched. Note that the
 // local variable "tree" keeps referring to the originally produced tree even after the first
 // fix-up has replaced retval.tree.
 cast_expr
     @after
     {
         BaseTree tree = (BaseTree) retval.getTree();

         // the parser does an initial optimisation step: it removes TOTUPLE / TOMAP / TOBAG
         // function calls if it knows they'll just return the input (i.e. because the function's
         // argument is a literal). We'll do this here by post-processing the result:
         if(tree.getType() == FUNC_EVAL) {
             Integer func = FUNC_TO_LITERAL.get(tree.getChild(0).getType());
             if(func != null) {
                 boolean canBeOptimised = true;
                 // child 0 is the function marker, so the arguments start at index 1
                 for(int arg = 1; arg < tree.getChildCount() && canBeOptimised; ++arg) {
                     canBeOptimised &= LITERAL_TOKENS.contains(tree.getChild(arg).getType());
                 }
                 if(canBeOptimised) {
                     retval.tree = adaptor.create(func, func.toString());
                     ((BaseTree)retval.tree).addChildren(tree.getChildren());
                     ((BaseTree)retval.tree).deleteChild(0); // the (e.g.) TOBAG token
                     adaptor.setTokenBoundaries(retval.tree, retval.start, retval.stop);
                 }
             }
         }

         // a minor correction to the token text for formatting -
         // we want NEG's text to be the same as MINUSes
         if(tree.getType() == NEG) {
             ((CommonTree)tree).token.setText("-");
         }

         // As noted below, brackets around a single literal mean a tuple
         // of that literal, not a nested expression which evaluates to
         // that literal. Remember that a NULL with children is a boolean
         // expression, not a literal!
         if(tree.getType() == EXPR_IN_PAREN
         && LITERAL_TOKENS.contains(tree.getChild(0).getType())
         && (tree.getChild(0).getType() != NULL || tree.getChild(0).getChildCount() == 0)) {
             ((CommonTree)tree).token.setType(TUPLE_VAL);
         }

         // For CASE statement, we clone the case expression (1st child of the
         // returned tree) and insert it before every when expression. For example,
         //
         //   CASE e1
         //     WHEN e2 THEN e3
         //     WHEN e4 THEN e5
         //     ELSE e6
         //   END
         // =>
         //   ^( CASE e1, e2, e3, e1, e4, e5, e6 )
         //
         // Note that e1 appears twice at index 0 and 3.
         //
         // This is needed because in LogicalPlanGenerator.g, we translate this
         // tree to nested bincond expressions, and we need to construct a new
         // LogicalExpression object per when branch.
         if(tree.getType() == CASE_EXPR) {
             Tree caseExpr = tree.getChild(0);
             int childCount = tree.getChildCount();
             // children are: case expression, (when, then) pairs, optional else - so an even
             // count means an else branch is present
             boolean hasElse = childCount \% 2 == 0;
             int whenBranchCount = ( childCount - (hasElse ? 2 : 1) ) / 2;
             // the first branch already follows the original case expression; every later
             // branch gets its own copy, and after the insertions branch i starts at 3*i
             for(int i = 1; i < whenBranchCount; i++) {
                 tree.insertChild(3*i, deepCopy(caseExpr));
             }
         }
     }
           : scalar
           | MINUS cast_expr -> ^( NEG cast_expr )
           // single columns and functions (both of which can start with an identifier). Note that we have to be
           // careful with periods straight after the identifier, as we want those to be projections, not function
           // calls
           | col_ref_without_identifier projection*
           | invoker_func projection*
           | identifier_plus projection*
           | identifier_plus func_name_suffix? LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN projection* -> ^( FUNC_EVAL identifier_plus func_name_suffix? real_arg* ) projection*
           | func_name_without_columns LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN projection* -> ^( FUNC_EVAL func_name_without_columns real_arg* ) projection*
           | CASE ( (WHEN)=> WHEN cond THEN expr ( WHEN cond THEN expr )* ( ELSE expr )? END projection* -> ^( CASE_COND ^(WHEN cond+) ^(THEN expr+) ) projection*
                  | expr WHEN rhs_operand THEN rhs_operand ( WHEN rhs_operand THEN rhs_operand )* ( ELSE rhs_operand )? END projection*
                  -> ^( CASE_EXPR ^(CASE_EXPR_LHS expr) ^(CASE_EXPR_RHS rhs_operand)+ ) projection*
                  )
           | paren_expr
           | curly_expr
           | bracket_expr
 ;

 // INVOKE call. After INVOKE comes either AMPERSAND or a parenthesised real_arg; the latter
 // switches the "static" flag to "false". Then follow zero or more "name ." package parts, the
 // method name and a parenthesised argument list. The rewrite encodes the dot-joined package
 // parts, the method name and the flag as three IDENTIFIER nodes ahead of the arguments.
 invoker_func
 @init {
     String staticStr = "true";
     List<String> packageStr = Lists.newArrayList();
     String methodStr = null;
 }
 : INVOKE ( AMPERSAND | LEFT_PAREN real_arg { staticStr = "false"; } RIGHT_PAREN ) ( packageName=identifier_plus PERIOD { packageStr.add($packageName.text); } )* methodName=identifier_plus { methodStr=$methodName.text; } LEFT_PAREN ( real_arg ( COMMA real_arg )* )? RIGHT_PAREN
               -> ^( INVOKER_FUNC_EVAL IDENTIFIER[Joiner.on(".").join(packageStr)] IDENTIFIER[methodStr] IDENTIFIER[staticStr] real_arg* )
 ;

 // now we have to deal with parentheses: in an expr, '(' can be the
 // start of a cast, the start of a nested expression or the start of
 // a tuple. We'll ensure parsing is unambiguous by assuming a single
 // expression in parentheses is a nested expression, whereas two or
 // more nested expressions are a tuple (unless that single expression
 // is a literal, in which case we assume tuple with a single element
 // - that literal).
 // Everything that can follow a '(' inside an expression. The opening parenthesis is dropped
 // from the AST; the remainder is parsed by try_implicit_map_cast. The @after action removes
 // BOOL_COND wrappers that the shared cond-based parsing leaves around plain expressions.
 paren_expr
     @after
     {
         BaseTree tree = (BaseTree)retval.getTree();

         // the other side of the @after block in unary_cond: if we've
         // matched an EXPR_IN_PAREN we expect the nested expression to
         // be an "expr", not a "cond", so we should strip off the
         // BOOL_COND token.
         if(tree.getType() == EXPR_IN_PAREN
         && tree.getChild(0).getType() == BOOL_COND) {
             int type = tree.getChild(0).getChild(0).getType();
             // NULL is a special case - if it has children it's a boolean
             // expression, and if not it's a literal NULL. Note that we
             // replace *all* children
             if(!BOOLEAN_TOKENS.contains(type)
             || (type == NULL && tree.getChild(0).getChild(0).getChildCount() == 0)) {
                 Tree addChildrenOf = tree.getChild(0);
                 // Always delete the first child until none are left. Deleting at an
                 // advancing index while the child count shrinks would skip every other
                 // child and so would not remove all of them.
                 while(tree.getChildCount() > 0)
                     tree.deleteChild(0);
                 for(int i = 0; i < addChildrenOf.getChildCount(); ++i)
                     tree.addChild(addChildrenOf.getChild(i));
             }
         }

         // A function call to TOTUPLE is inserted into the AST for
         // some tuple literals - but as we assume the first expression
         // after an open bracket is a "cond" rule, and as "cond" rules
         // nest "expr" rules under a BOOL_COND token we get an invalid
         // AST. We'll remove this BOOL_COND here:
         if(tree.getType() == FUNC_EVAL
         && tree.getChild(0).getType() == TOTUPLE
         && tree.getChildCount() > 1
         && tree.getChild(1).getType() == BOOL_COND) {
             Tree insertChildrenOf = tree.getChild(1);
             tree.deleteChild(1);
             // insert back to front at the same index so the original order is preserved
             for(int i = insertChildrenOf.getChildCount() - 1; i >= 0; --i)
                 tree.insertChild(1, insertChildrenOf.getChild(i));
         }
     }
     : LEFT_PAREN! try_implicit_map_cast
 ;

 // Content after '(': first tries, via a syntactic predicate, to read an implicit map type
 // followed by ')' and an operand as a cast; otherwise falls through to after_left_paren.
 try_implicit_map_cast
            // we'll also allow implicit map casts (for backwards compatibility only -
            // bag and tuple casts have to be explicit and it makes the grammar more
            // simple). Unfortunately we'll have to turn on back-tracking for this rule,
            // as LEFT_PAREN LEFT_BRACKET could be a literal map in a EXPR_IN_PAREN.
            // It'd be much better if we could remove this from the Pig language (and
            // just rely on explicit map casts) - then we'd have no backtracking at all!
            : ( implicit_map_type RIGHT_PAREN cast_expr) => implicit_map_type RIGHT_PAREN cast_expr -> ^( CAST_EXPR implicit_map_type cast_expr )
            | after_left_paren
 ;

 // Remaining forms after '(': an explicit cast, an empty tuple, a tuple starting with "*" or a
 // column range, or a cond followed by one of three continuations (tuple of several elements,
 // plain parenthesised expression, or "cond ? a : b" conditional).
 after_left_paren : explicit_type_cast RIGHT_PAREN cast_expr -> ^( CAST_EXPR explicit_type_cast cast_expr )
                  // tuples
                  | RIGHT_PAREN projection* -> ^( TUPLE_VAL ) projection*
                  | STAR ( COMMA real_arg )* RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE STAR real_arg* ) projection*
                  | col_range ( COMMA real_arg )* RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE col_range real_arg* ) projection*
                  // Tuples begin with '(' expr, but shorthand-booleans begin with '(' cond. As cond
                  // and expr are indistinguishable, we'll parse as a cond (i.e. the most lenient) and
                  // for exprs, strip off the BOOL_COND trees. You can have both nested conds and nested
                  // exprs, so we'll just assume cond.
                  | cond
                    ( ( ( COMMA real_arg )+ RIGHT_PAREN projection* -> ^( FUNC_EVAL TOTUPLE cond real_arg+ ) projection* )
                    | ( RIGHT_PAREN -> ^( EXPR_IN_PAREN cond ) )
                    | ( QMARK exp1 = expr COLON exp2 = expr RIGHT_PAREN -> ^( BIN_EXPR cond $exp1 $exp2 ) ) )
 ;

 // Bag constructor: a non-empty list becomes a TOBAG function call, "{}" becomes an empty BAG_VAL.
 curly_expr : LEFT_CURLY real_arg ( COMMA real_arg )* RIGHT_CURLY projection* -> ^( FUNC_EVAL TOBAG real_arg+ ) projection*
            | LEFT_CURLY RIGHT_CURLY projection* -> ^( BAG_VAL ) projection*
 ;

 // Map constructor: a list of arguments becomes a TOMAP function call, a list of key#value
 // pairs becomes a MAP_VAL literal, and "[]" becomes an empty MAP_VAL.
 bracket_expr : LEFT_BRACKET real_arg ( COMMA real_arg )* RIGHT_BRACKET projection* -> ^( FUNC_EVAL TOMAP real_arg+ ) projection*
              | LEFT_BRACKET keyvalue ( COMMA keyvalue )* RIGHT_BRACKET projection* -> ^( MAP_VAL keyvalue+ ) projection*
              | LEFT_BRACKET RIGHT_BRACKET projection* -> ^( MAP_VAL ) projection*
 ;

 // Postfix access: ".col" or ".(col, col, ...)" rooted at PERIOD, or "#key" rooted at POUND
 // where the key is a quoted string or NULL.
 projection : PERIOD ( col_ref | LEFT_PAREN col_ref ( COMMA col_ref )* RIGHT_PAREN ) -> ^( PERIOD col_ref+ )
            | POUND^ ( QUOTEDSTRING | NULL )
 ;

 // ATOMS

 // for disambiguation with func_names
 // Column references that cannot be mistaken for the start of a function name.
 col_ref_without_identifier : GROUP | DOLLARVAR
 ;

 // Any column reference.
 col_ref : col_ref_without_identifier | identifier_plus
 ;

 // Column range: "a .. b", "a .." (open end) or ".. b" (open start). DOUBLE_PERIOD is kept in
 // the tree so the open side can be determined from its position.
 col_range : c1 = col_ref DOUBLE_PERIOD c2 = col_ref? -> ^(COL_RANGE $c1 DOUBLE_PERIOD $c2?)
           |  DOUBLE_PERIOD col_ref -> ^(COL_RANGE DOUBLE_PERIOD col_ref)
 ;

 // Scalar literal tokens.
 scalar : INTEGER
        | LONGINTEGER
        | FLOATNUMBER
        | DOUBLENUMBER
        | BIGINTEGERNUMBER
        | BIGDECIMALNUMBER
        | QUOTEDSTRING
        | NULL
        | TRUE
        | FALSE
 ;

 // 'key' # literal -> ^( KEY_VAL_PAIR 'key' literal )
 keyvalue : QUOTEDSTRING POUND literal -> ^( KEY_VAL_PAIR QUOTEDSTRING literal )
 ;

 // Map literal made only of key#value pairs (or empty).
 literal_map : LEFT_BRACKET keyvalue ( COMMA keyvalue )* RIGHT_BRACKET -> ^( MAP_VAL keyvalue+ )
             | LEFT_BRACKET RIGHT_BRACKET -> ^( MAP_VAL )
 ;


 // Bag literal made only of tuple literals (or empty).
 literal_bag : LEFT_CURLY literal_tuple ( COMMA literal_tuple )* RIGHT_CURLY -> ^( BAG_VAL literal_tuple+ )
             | LEFT_CURLY RIGHT_CURLY -> ^( BAG_VAL )
 ;

 // Tuple literal made only of literals (or empty).
 literal_tuple : LEFT_PAREN literal ( COMMA literal )* RIGHT_PAREN -> ^( TUPLE_VAL literal+ )
               | LEFT_PAREN RIGHT_PAREN -> ^( TUPLE_VAL )
 ;

 // Any constant value, possibly nested.
 literal : scalar | literal_map | literal_bag | literal_tuple
 ;

 // NESTING

 // Body of a complex FOREACH: zero or more semicolon-terminated nested commands followed by a
 // mandatory, semicolon-terminated GENERATE list. Semicolons and commas are dropped.
 nested_blk : ( nested_command SEMI_COLON )* GENERATE flatten_generated_item ( COMMA flatten_generated_item )* SEMI_COLON
     -> nested_command* ^( GENERATE flatten_generated_item+ )
 ;

 // "alias = ..." inside a nested block. Alt 1 is chosen only when the syntactic predicate sees
 // a projection that is immediately followed by SEMI_COLON (checked with the embedded semantic
 // predicate); otherwise the right-hand side is parsed as an expression or as a nested operator.
 nested_command : ( identifier_plus EQUAL col_ref PERIOD col_ref_list { input.LA( 1 ) == SEMI_COLON }? ) => ( identifier_plus EQUAL nested_proj )
               -> ^( NESTED_CMD identifier_plus nested_proj )
                | identifier_plus EQUAL expr
               -> ^( NESTED_CMD_ASSI identifier_plus expr )
                | identifier_plus EQUAL nested_op
               -> ^( NESTED_CMD identifier_plus nested_op )
 ;

 // Operators permitted inside a nested block.
 nested_op : nested_filter
           | nested_sort
           | nested_distinct
           | nested_limit
           | nested_cross
           | nested_foreach
 ;

 // "col.cols" -> ^( NESTED_PROJ col cols )
 nested_proj : col_ref PERIOD col_ref_list
            -> ^( NESTED_PROJ col_ref col_ref_list )
 ;

 // A single column or a parenthesised, comma-separated column list; only the columns are kept.
 col_ref_list : ( col_ref | ( LEFT_PAREN col_ref ( COMMA col_ref )* RIGHT_PAREN ) )
             -> col_ref+
 ;

 // FILTER input BY cond
 nested_filter : FILTER^ nested_op_input BY! cond
 ;

 // ORDER input BY columns (USING func)?
 nested_sort : ORDER^ nested_op_input BY!  order_by_clause ( USING! func_clause )?
 ;

 // DISTINCT input
 nested_distinct : DISTINCT^ nested_op_input
 ;

 // LIMIT input n-or-expr. A syntactic predicate picks the plain INTEGER alternative when the
 // integer is directly followed by SEMI_COLON; anything else is parsed as an expression.
 nested_limit : LIMIT^ nested_op_input ( (INTEGER SEMI_COLON) => INTEGER | expr )
 ;

 // CROSS input, input, ...
 nested_cross : CROSS^ nested_op_input_list
 ;

 // FOREACH input GENERATE items -> ^( FOREACH input ^( GENERATE items ) )
 nested_foreach: FOREACH nested_op_input GENERATE flatten_generated_item ( COMMA flatten_generated_item )*
     -> ^( FOREACH nested_op_input ^( GENERATE flatten_generated_item+ ) )
 ;

 // Input of a nested operator: a column or a projection of a column.
 nested_op_input : col_ref | nested_proj
 ;

 // Comma-separated nested operator inputs; the commas are dropped.
 nested_op_input_list : nested_op_input ( COMMA nested_op_input )*
         -> nested_op_input+
 ;

 // IDENTIFIERS

 // extended identifier, handling the keyword and identifier conflicts. Ugly but there is no other choice.
 // Keywords that may be used as (the first part of) a function name. This list deliberately
 // excludes IDENTIFIER and the other tokens added by eid, which cast_expr handles through
 // separate alternatives (see func_name_without_columns).
 eid_without_columns : rel_str_op
     | IMPORT
     | REGISTER
     | RETURNS
     | DEFINE
     | LOAD
     | FILTER
     | FOREACH
     | ROLLUP
     | ORDER
     | DISTINCT
     | COGROUP
     | JOIN
     | CROSS
     | UNION
     | SPLIT
     | INTO
     | IF
     | ALL
     | AS
     | BY
     | USING
     | INNER
     | OUTER
     | PARALLEL
     | PARTITION
     | AND
     | OR
     | GENERATE
     | ASC
     | DESC
     | BOOL
     | BIGINTEGER
     | BIGDECIMAL
     | DATETIME
     | CHARARRAY
     | BYTEARRAY
     | IS
     | STREAM
     | THROUGH
     | STORE
     | MAPREDUCE
     | SHIP
     | CACHE
     | INPUT
     | OUTPUT
     | STDERROR
     | STDIN
     | STDOUT
     | LIMIT
     | SAMPLE
     | LEFT
     | RIGHT
     | FULL
     | REALIAS
     | BOOL_COND
     | ASSERT
 ;

 // Full extended identifier: everything in eid_without_columns plus IDENTIFIER and the
 // remaining keywords listed here.
 eid : eid_without_columns
     | IDENTIFIER
     | GROUP
     | CUBE
     | TRUE
     | FALSE
     | INT
     | LONG
     | FLOAT
     | DOUBLE
     | NULL
     | NOT
     | FLATTEN
     | BAG
     | TUPLE
     | MAP
 ;

 // relational operator
 // Any comparison operator: the string-style operators of rel_str_op or the NUM_OP_* tokens.
 rel_op : rel_str_op
        | NUM_OP_EQ
        | NUM_OP_NE
        | NUM_OP_GT
        | NUM_OP_GTE
        | NUM_OP_LT
        | NUM_OP_LTE
 ;

 // The STR_OP_* comparison tokens. Kept as a separate rule because they are also accepted as
 // identifiers through eid_without_columns.
 rel_str_op : STR_OP_EQ
            | STR_OP_NE
            | STR_OP_GT
            | STR_OP_LT
            | STR_OP_GTE
            | STR_OP_LTE
            | STR_OP_MATCHES
 ;

 // Reserved words that identifier_plus still accepts as identifiers; identifier_plus converts
 // them to IDENTIFIER nodes carrying the keyword's text.
 reserved_identifier_whitelist : RANK
                               | CUBE
                               | IN
                               | WHEN
                               | THEN
                               | ELSE
                               | END
 ;