blob: 0090ac8314ec78427c0dbb4b2695435df258f0e5 [file] [log] [blame]
/*
* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
/**
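* Grammar file for Pig tree parser (visitor for AST validation: undefined alias references,
* duplicated schema field names, duplicated command options and mismatched join/group arity).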
*
* NOTE: THIS FILE IS BASED ON QueryParser.g, SO IF YOU CHANGE THAT FILE, YOU WILL
* PROBABLY NEED TO MAKE CORRESPONDING CHANGES TO THIS FILE AS WELL.
*/
tree grammar AstValidator;
options {
// Take the token types from the QueryParser vocabulary.
tokenVocab=QueryParser;
// Tree nodes referenced in actions are typed as CommonTree.
ASTLabelType=CommonTree;
// This tree walker produces an AST as its output.
output=AST;
// Allow backtracking where fixed lookahead cannot choose between alternatives.
backtrack=true;
}
@header {
package org.apache.pig.parser;
import org.apache.pig.data.DataType;
import org.apache.pig.impl.util.NumValCarrier;
import java.util.HashSet;
import java.util.Set;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
}
@members {
/** Logger shared by all instances of the generated validator; never reassigned. */
private static final Log log = LogFactory.getLog( AstValidator.class );
/**
 * Replaces token-level recovery with an immediate failure.
 *
 * @param input  stream on which the mismatch was detected
 * @param ttype  token type that was expected
 * @param follow follow set passed in by the recognizer; not consulted here
 * @return never returns normally
 * @throws RecognitionException always, in the form of a MismatchedTokenException
 *         built from ttype and input
 */
@Override
protected Object recoverFromMismatchedToken(IntStream input, int ttype, BitSet follow)
        throws RecognitionException {
    final MismatchedTokenException mismatch = new MismatchedTokenException( ttype, input );
    throw mismatch;
}
/**
 * Replaces set-level recovery with an immediate failure.
 *
 * @param input  stream on which the mismatch was detected; not consulted here
 * @param e      the recognition error that triggered recovery
 * @param follow follow set passed in by the recognizer; not consulted here
 * @return never returns normally
 * @throws RecognitionException always; the exception received as e is rethrown unchanged
 */
@Override
public Object recoverFromMismatchedSet(IntStream input, RecognitionException e, BitSet follow)
throws RecognitionException {
throw e;
}
/**
 * Records a schema field name and rejects a name that has been recorded before.
 *
 * @param fieldNames set of names seen so far; name is inserted when it is new
 * @param node       tree node used to build the source location of the error
 * @param name       field name to check
 * @throws DuplicatedSchemaAliasException if name is already contained in fieldNames
 */
private void validateSchemaAliasName(Set<String> fieldNames, CommonTree node, String name)
        throws DuplicatedSchemaAliasException {
    // Set.add returns false when the element was already present, so a single
    // call both records a new name and detects a repeated one.
    final boolean firstOccurrence = fieldNames.add( name );
    if( !firstOccurrence ) {
        throw new DuplicatedSchemaAliasException( input,
            new SourceLocation( (PigParserNode)node ), name );
    }
}
/**
 * Checks that a referenced alias has been defined.
 *
 * @param aliases set of aliases known so far
 * @param node    tree node used to build the source location of the error
 * @param alias   alias being referenced
 * @throws UndefinedAliasException if alias is not contained in aliases
 */
private void validateAliasRef(Set<String> aliases, CommonTree node, String alias)
        throws UndefinedAliasException {
    if( aliases.contains( alias ) ) {
        return;
    }
    throw new UndefinedAliasException( input, new SourceLocation( (PigParserNode)node ), alias );
}
/**
 * Rejects a command option that has been given more than once.
 *
 * @param count number of times the option has been seen, including the current occurrence
 * @param node  tree node used to build the source location of the error
 * @throws ParserValidationException if count is greater than one
 */
private void checkDuplication(int count, CommonTree node) throws ParserValidationException {
    if( count <= 1 ) {
        return;
    }
    throw new ParserValidationException( input, new SourceLocation( (PigParserNode)node ),
        "Duplicated command option" );
}
/** Alias most recently added to the alias set, or null while none has been added. */
private String lastRel = null;

/**
 * Returns the alias most recently added to the alias set.
 *
 * @param node tree node used to build the source location of the error
 * @return the last added alias
 * @throws UndefinedAliasException reported for the name "@" when no alias has been added yet
 */
private String getLastRel(CommonTree node) throws UndefinedAliasException {
    if (lastRel == null) {
        throw new UndefinedAliasException( input, new SourceLocation((PigParserNode)node), "@");
    }
    return lastRel;
}
/**
 * Aliases defined so far. Every call to add also stores the added alias in lastRel,
 * whether or not the alias was already present in the set.
 */
private Set<String> aliases = new HashSet<String>() {
    @Override
    public boolean add(String alias) {
        // Remember the alias before delegating, so lastRel always names the latest addition.
        lastRel = alias;
        return super.add(alias);
    }
};
} // End of @members
@rulecatch {
catch(RecognitionException re) {
// Rethrow the recognition error to the caller of the rule.
throw re;
}
}
// Top rule: a QUERY node whose children are zero or more statements.
query : ^( QUERY statement* )
;
// A statement is exactly one of the five statement forms listed below.
statement : general_statement
| split_statement
| realias_statement
| register_statement
| assert_statement
;
// A split statement is just a split_clause.
split_statement : split_clause
;
// A realias statement is just a realias_clause.
realias_statement : realias_clause
;
// REGISTER node: a quoted string, optionally followed by USING IDENTIFIER AS IDENTIFIER.
register_statement : ^( REGISTER QUOTEDSTRING (USING IDENTIFIER AS IDENTIFIER)? )
;
// An assert statement is just an assert_clause.
assert_statement : assert_clause
;
// STATEMENT node: an optional alias, an operator clause and an optional parallel clause.
// The alias is added to the known aliases as soon as it is matched, i.e. before
// op_clause is walked.
general_statement : ^( STATEMENT ( alias { aliases.add( $alias.name ); } )? op_clause parallel_clause? )
;
// REALIAS node: an alias followed by an IDENTIFIER. After the node is matched the
// alias is added to the known aliases; the trailing IDENTIFIER is not checked here.
realias_clause : ^(REALIAS alias IDENTIFIER)
{
aliases.add( $alias.name );
}
;
// PARALLEL node with a single INTEGER child.
parallel_clause : ^( PARALLEL INTEGER )
;
// An alias is a single IDENTIFIER. Returns the identifier text as name and the
// identifier node itself as node.
alias returns[String name, CommonTree node]
: IDENTIFIER
{
$name = $IDENTIFIER.text;
$node = $IDENTIFIER;
}
;
// Reference to the previous relation, written as the ARROBA token. The returned name is
// whatever getLastRel yields (the alias most recently added to the alias set); getLastRel
// throws UndefinedAliasException when no alias has been added yet.
previous_rel returns[String name, CommonTree node]
: ARROBA
{
$name = getLastRel($ARROBA);
$node = $ARROBA;
}
;
// Any operator clause. Used by general_statement and, for inlined operators, by rel.
op_clause : define_clause
| load_clause
| group_clause
| store_clause
| filter_clause
| distinct_clause
| limit_clause
| sample_clause
| order_clause
| rank_clause
| cross_clause
| join_clause
| union_clause
| stream_clause
| mr_clause
| split_clause
| foreach_clause
| cube_clause
| assert_clause
;
// DEFINE node: an alias bound to either a command or a function clause.
// The alias is not added to the known aliases by this rule.
define_clause : ^( DEFINE alias ( cmd | func_clause ) )
;
// EXECCOMMAND node with any number of ship, cache, input, output and error options in
// any order. One counter per option kind is incremented on each occurrence and passed
// to checkDuplication, which throws ParserValidationException once a counter exceeds one.
cmd
@init {
int ship = 0;
int cache = 0;
int in = 0;
int out = 0;
int error = 0;
}
: ^( EXECCOMMAND ( ship_clause { checkDuplication( ++ship, $ship_clause.start ); }
| cache_clause { checkDuplication( ++cache, $cache_clause.start ); }
| input_clause { checkDuplication( ++in, $input_clause.start ); }
| output_clause { checkDuplication( ++out, $output_clause.start ); }
| error_clause { checkDuplication( ++error, $error_clause.start ); }
)*
)
;
// SHIP node with an optional path list.
ship_clause : ^( SHIP path_list? )
;
// One or more quoted strings.
path_list : QUOTEDSTRING+
;
// CACHE node; unlike SHIP, the path list is required.
cache_clause : ^( CACHE path_list )
;
// INPUT node with one or more stream specifications.
input_clause : ^( INPUT stream_cmd+ )
;
// A stream specification rooted at STDIN, STDOUT or a quoted string, each with an
// optional function clause.
stream_cmd : ^( STDIN func_clause? )
| ^( STDOUT func_clause? )
| ^( QUOTEDSTRING func_clause? )
;
// OUTPUT node with one or more stream specifications.
output_clause : ^( OUTPUT stream_cmd+ )
;
// STDERROR node: optionally a quoted string, which may itself be followed by an INTEGER.
error_clause : ^( STDERROR ( QUOTEDSTRING INTEGER? )? )
;
// LOAD node: a file name, then an optional function clause and an optional AS clause.
load_clause : ^( LOAD filename func_clause? as_clause? )
;
// A file name is a quoted string.
filename : QUOTEDSTRING
;
// AS node wrapping a list of field definitions.
as_clause: ^( AS field_def_list )
;
// A single field definition. fieldNames holds the names already used in the enclosing
// list; validateSchemaAliasName adds the new name or throws DuplicatedSchemaAliasException
// when it is already present.
//  - FIELD_DEF: the name is the IDENTIFIER text; it is validated before the optional type.
//  - FIELD_DEF_WITHOUT_IDENTIFIER: the name is obtained from nvc.makeNameFromDataType
//    applied to the type code returned by the type rule.
field_def[Set<String> fieldNames, NumValCarrier nvc] throws DuplicatedSchemaAliasException
: ^( FIELD_DEF IDENTIFIER { validateSchemaAliasName( fieldNames, $IDENTIFIER, $IDENTIFIER.text ); } type? )
| ^( FIELD_DEF_WITHOUT_IDENTIFIER type { validateSchemaAliasName ( fieldNames, $FIELD_DEF_WITHOUT_IDENTIFIER, $nvc.makeNameFromDataType ( $type.typev ) ); } )
;
// One or more field definitions. Each invocation of this rule starts with a fresh,
// empty name set and a fresh NumValCarrier, both handed to every field_def in the list,
// so duplicate detection is per list.
field_def_list throws DuplicatedSchemaAliasException
scope{
Set<String> fieldNames;
NumValCarrier nvc;
}
@init {
$field_def_list::fieldNames = new HashSet<String>();
$field_def_list::nvc = new NumValCarrier();
}
: ( field_def[$field_def_list::fieldNames, $field_def_list::nvc] )+
;
// Any type. Returns the DataType code: the code of the simple type, or
// DataType.TUPLE, DataType.BAG or DataType.MAP for the complex types.
type returns [byte typev]
: simple_type { $typev = $simple_type.typev; }
| tuple_type { $typev = DataType.TUPLE; }
| bag_type { $typev = DataType.BAG; }
| map_type { $typev = DataType.MAP; }
;
// A simple type keyword. Returns the matching DataType constant; note that the INT
// token maps to DataType.INTEGER.
simple_type returns [byte typev]
: BOOLEAN { $typev = DataType.BOOLEAN; }
| INT { $typev = DataType.INTEGER; }
| LONG { $typev = DataType.LONG; }
| FLOAT { $typev = DataType.FLOAT; }
| DOUBLE { $typev = DataType.DOUBLE; }
| BIGINTEGER { $typev = DataType.BIGINTEGER; }
| BIGDECIMAL { $typev = DataType.BIGDECIMAL; }
| DATETIME { $typev = DataType.DATETIME; }
| CHARARRAY { $typev = DataType.CHARARRAY; }
| BYTEARRAY { $typev = DataType.BYTEARRAY; }
;
// TUPLE_TYPE node with an optional list of field definitions.
tuple_type : ^( TUPLE_TYPE field_def_list? )
;
// BAG_TYPE node: an optional IDENTIFIER followed by an optional tuple type.
bag_type : ^( BAG_TYPE IDENTIFIER? tuple_type? )
;
// MAP_TYPE node: an optional IDENTIFIER followed by an optional type.
map_type : ^( MAP_TYPE IDENTIFIER? type? )
;
// A function clause: either a FUNC_REF node holding only a function name, or a FUNC
// node holding a function name and optional arguments.
func_clause : ^( FUNC_REF func_name )
| ^( FUNC func_name func_args? )
;
// A function name: extended identifiers separated by PERIOD or DOLLAR.
func_name : eid ( ( PERIOD | DOLLAR ) eid )*
;
// A single function argument: a quoted string or a multi-line quoted string.
func_args_string : QUOTEDSTRING | MULTILINE_QUOTEDSTRING
;
// One or more function arguments.
func_args : func_args_string+
;
// CUBE node wrapping a single cube item.
cube_clause
: ^( CUBE cube_item )
;
// A relation followed by its BY clause.
cube_item
: rel ( cube_by_clause )
;
// BY node wrapping the cube/rollup specifications.
cube_by_clause
: ^( BY cube_or_rollup )
;
// One or more cube/rollup lists.
cube_or_rollup
: cube_rollup_list+
;
// A node rooted at CUBE or ROLLUP whose children are the expressions to cube or roll up by.
cube_rollup_list
: ^( ( CUBE | ROLLUP ) cube_by_expr_list )
;
// One or more cube-by expressions.
cube_by_expr_list
: cube_by_expr+
;
// A cube-by expression: a column range, an expression or STAR.
cube_by_expr
: col_range | expr | STAR
;
// Node rooted at GROUP or COGROUP: one or more group items, then an optional group type
// and an optional partition clause. The scoped arity starts at 0 for every group clause;
// group_item stores the BY expression count of the first input there and compares the
// counts of later inputs against it.
group_clause
scope {
int arity;
}
@init {
$group_clause::arity = 0;
}
: ^( ( GROUP | COGROUP ) group_item+ group_type? partition_clause? )
;
// The group type is a quoted string.
group_type : QUOTEDSTRING
;
// One input of a group clause: a relation, then a BY clause or ALL or ANY, then an
// optional INNER or OUTER. While the arity recorded in the enclosing group_clause is
// still 0 it is set to the BY expression count of this item; afterwards an item with a
// different count raises ParserValidationException.
group_item
: rel ( join_group_by_clause | ALL | ANY ) ( INNER | OUTER )?
{
    // BY expression count of this input.
    int byCount = $join_group_by_clause.exprCount;
    if( $group_clause::arity != 0 ) {
        if( byCount != $group_clause::arity ) {
            throw new ParserValidationException( input, new SourceLocation( (PigParserNode)$group_item.start ),
                "The arity of the group by columns do not match." );
        }
    } else {
        // Nothing recorded yet: remember the count of this input.
        $group_clause::arity = byCount;
    }
}
;
// A relation: a named alias, a reference to the previous relation, or an inlined
// operator clause with an optional parallel clause. For the first two forms
// validateAliasRef throws UndefinedAliasException unless the name is among the known aliases.
rel : alias { validateAliasRef( aliases, $alias.node, $alias.name ); }
| previous_rel { validateAliasRef( aliases, $previous_rel.node, $previous_rel.name ); }
| op_clause parallel_clause?
;
// One generated item: a flatten clause, column range, expression or STAR, optionally
// followed by a list of field definitions.
flatten_generated_item : ( flatten_clause | col_range | expr | STAR ) field_def_list?
;
// FLATTEN node wrapping a single expression.
flatten_clause : ^( FLATTEN expr )
;
// STORE node: a relation, a file name and an optional function clause.
store_clause : ^( STORE rel filename func_clause? )
;
// ASSERT node: a relation, a condition and an optional comment.
assert_clause : ^( ASSERT rel cond comment? )
;
// The assert comment is a quoted string.
comment : QUOTEDSTRING
;
// FILTER node: a relation and a condition.
filter_clause : ^( FILTER rel cond )
;
// A condition: OR/AND of two conditions, NOT of one condition, a NULL node over an
// expression with an optional trailing NOT, a relational operator over two expressions,
// an IN evaluation, a function evaluation, or a BOOL_COND node wrapping an expression.
cond : ^( OR cond cond )
| ^( AND cond cond )
| ^( NOT cond )
| ^( NULL expr NOT? )
| ^( rel_op expr expr )
| in_eval
| func_eval
| ^( BOOL_COND expr )
;
// IN node holding one or more pairs of an IN_LHS expression and an IN_RHS expression.
in_eval: ^( IN ( ^( IN_LHS expr ) ^( IN_RHS expr ) )+ )
;
// A function evaluation: FUNC_EVAL with a function name and arguments, or
// INVOKER_FUNC_EVAL with a function name, an IDENTIFIER and arguments.
func_eval: ^( FUNC_EVAL func_name real_arg* ) | ^( INVOKER_FUNC_EVAL func_name IDENTIFIER real_arg* )
;
// A function argument: an expression, STAR or a column range.
real_arg : expr | STAR | col_range
;
// An expression: a binary arithmetic node (PLUS, MINUS, STAR, DIV, PERCENT), a cast,
// a constant, a variable expression, a NEG node or a parenthesised expression.
// NOTE(review): CAST_EXPR appears twice, once with type and once with type_cast; with
// backtrack=true the alternatives are tried in the order written. Confirm both are needed.
expr : ^( PLUS expr expr )
| ^( MINUS expr expr )
| ^( STAR expr expr )
| ^( DIV expr expr )
| ^( PERCENT expr expr )
| ^( CAST_EXPR type expr )
| const_expr
| var_expr
| ^( NEG expr )
| ^( CAST_EXPR type_cast expr )
| ^( EXPR_IN_PAREN expr )
;
// The target of a cast: a simple type, a map type, or a tuple or bag cast type.
type_cast : simple_type | map_type | tuple_type_cast | bag_type_cast
;
// TUPLE_TYPE_CAST node with zero or more nested cast types.
tuple_type_cast : ^( TUPLE_TYPE_CAST type_cast* )
;
// BAG_TYPE_CAST node with an optional tuple cast type.
bag_type_cast : ^( BAG_TYPE_CAST tuple_type_cast? )
;
// A projectable expression followed by any number of dot or pound projections.
var_expr : projectable_expr ( dot_proj | pound_proj )*
;
// Expressions that can be projected from.
projectable_expr: func_eval | col_ref | bin_expr | case_expr | case_cond
;
// PERIOD node with one or more column aliases or indexes.
dot_proj : ^( PERIOD col_alias_or_index+ )
;
col_alias_or_index : col_alias | col_index
;
// A column alias; the GROUP and CUBE tokens are accepted in addition to IDENTIFIER.
col_alias : GROUP | CUBE | IDENTIFIER
;
// A positional column reference.
col_index : DOLLARVAR
;
// COL_RANGE node: DOUBLE_PERIOD with an optional column reference on either side.
col_range : ^(COL_RANGE col_ref? DOUBLE_PERIOD col_ref?)
;
// POUND node whose key is a quoted string or NULL.
pound_proj : ^( POUND ( QUOTEDSTRING | NULL ) )
;
// BIN_EXPR node: a condition followed by two expressions.
bin_expr : ^( BIN_EXPR cond expr expr )
;
// CASE_EXPR node: one or more groups, each a CASE_EXPR_LHS expression followed by one
// or more CASE_EXPR_RHS expressions.
case_expr: ^( CASE_EXPR ( ^( CASE_EXPR_LHS expr ) ( ^( CASE_EXPR_RHS expr) )+ )+ )
;
// CASE_COND node: a WHEN node with one or more conditions and a THEN node with one or
// more expressions.
case_cond: ^( CASE_COND ^( WHEN cond+ ) ^( THEN expr+ ) )
;
// LIMIT node: a relation and a limit given as INTEGER, LONGINTEGER or an expression.
limit_clause : ^( LIMIT rel ( INTEGER | LONGINTEGER | expr ) )
;
// SAMPLE node: a relation and a sample size given as DOUBLENUMBER or an expression.
sample_clause : ^( SAMPLE rel ( DOUBLENUMBER | expr ) )
;
// RANK node: a relation and an optional BY statement.
rank_clause : ^( RANK rel ( rank_by_statement )? )
;
// BY node: the rank-by clause with an optional DENSE marker.
rank_by_statement : ^( BY rank_by_clause ( DENSE )? )
;
// Either STAR with an optional direction, or one or more rank columns.
rank_by_clause : STAR ( ASC | DESC )?
| rank_col+
;
// A rank column: a column range or column reference with an optional direction.
rank_col : col_range (ASC | DESC)?
| col_ref ( ASC | DESC )?
;
// ORDER node: a relation, the order-by clause and an optional function clause.
order_clause : ^( ORDER rel order_by_clause func_clause? )
;
// Either STAR with an optional direction, or one or more order columns.
order_by_clause : STAR ( ASC | DESC )?
| order_col+
;
// An order column: a column range or column reference with an optional direction.
order_col : col_range (ASC | DESC)?
| col_ref ( ASC | DESC )?
;
// DISTINCT node: a relation and an optional partition clause.
distinct_clause : ^( DISTINCT rel partition_clause? )
;
// PARTITION node naming a function.
partition_clause : ^( PARTITION func_name )
;
// CROSS node: a list of relations and an optional partition clause.
cross_clause : ^( CROSS rel_list partition_clause? )
;
// One or more relations.
rel_list : rel+
;
// JOIN node: the join inputs, then an optional join type and an optional partition
// clause. The scoped arity starts at 0 for every join clause; join_item stores the BY
// expression count of the first input there and compares later inputs against it.
join_clause
scope {
int arity;
}
@init {
$join_clause::arity = 0;
}
: ^( JOIN join_sub_clause join_type? partition_clause? )
;
// The join type is a quoted string.
join_type : QUOTEDSTRING
;
// The inputs of a join: either two items separated by LEFT, RIGHT or FULL with an
// optional OUTER, or a plain sequence of one or more items.
join_sub_clause
: join_item ( LEFT | RIGHT | FULL ) OUTER? join_item
| join_item+
;
// One input of a join: a JOIN_ITEM node holding a relation and its BY clause. While the
// arity recorded in the enclosing join_clause is still 0 it is set to the BY expression
// count of this item; afterwards an item with a different count raises
// ParserValidationException.
join_item
: ^( JOIN_ITEM rel join_group_by_clause )
{
    // BY expression count of this input.
    int byCount = $join_group_by_clause.exprCount;
    if( $join_clause::arity != 0 ) {
        if( byCount != $join_clause::arity ) {
            throw new ParserValidationException( input, new SourceLocation( (PigParserNode)$join_item.start ),
                "The arity of the join columns do not match." );
        }
    } else {
        // Nothing recorded yet: remember the count of this input.
        $join_clause::arity = byCount;
    }
}
;
// BY node with one or more expressions. Returns the number of expressions: exprCount is
// reset to 0 on entry and incremented once for every join_group_by_expr matched.
join_group_by_clause returns[int exprCount]
@init {
$exprCount = 0;
}
: ^( BY ( join_group_by_expr { $exprCount++; } )+ )
;
// A join/group key: a column range, an expression or STAR.
join_group_by_expr : col_range | expr | STAR
;
// UNION node: an optional ONSCHEMA marker followed by a list of relations.
union_clause : ^( UNION ONSCHEMA? rel_list )
;
// FOREACH node: a relation and its plan.
foreach_clause : ^( FOREACH rel foreach_plan )
;
// The plan is either simple (only a generate clause) or complex (a nested block).
foreach_plan : ^( FOREACH_PLAN_SIMPLE generate_clause )
| ^( FOREACH_PLAN_COMPLEX nested_blk )
;
// A nested block: zero or more nested commands followed by a generate clause. The
// scoped ids set starts empty for every block and receives the identifier of each
// nested command matched in it.
nested_blk
scope { Set<String> ids; }
@init{ $nested_blk::ids = new HashSet<String>(); }
: nested_command* generate_clause
;
// GENERATE node with one or more generated items.
generate_clause : ^( GENERATE flatten_generated_item+ )
;
// A command inside a nested block: NESTED_CMD binds an IDENTIFIER to a nested operator,
// NESTED_CMD_ASSI binds an IDENTIFIER to an expression. In both cases the identifier
// text is added to the ids set of the enclosing nested_blk after the node is matched.
nested_command
: ^( NESTED_CMD IDENTIFIER nested_op )
{
$nested_blk::ids.add( $IDENTIFIER.text );
}
| ^( NESTED_CMD_ASSI IDENTIFIER expr )
{
$nested_blk::ids.add( $IDENTIFIER.text );
}
;
// Any operator allowed inside a nested block.
nested_op : nested_proj
| nested_filter
| nested_sort
| nested_distinct
| nested_limit
| nested_cross
| nested_foreach
;
// NESTED_PROJ node: a column reference followed by one or more column references.
nested_proj : ^( NESTED_PROJ col_ref col_ref+ )
;
// FILTER node: a nested input and a condition.
nested_filter
: ^( FILTER nested_op_input cond )
;
// ORDER node: a nested input, the order-by clause and an optional function clause.
nested_sort : ^( ORDER nested_op_input order_by_clause func_clause? )
;
// DISTINCT node over a nested input.
nested_distinct : ^( DISTINCT nested_op_input )
;
// LIMIT node: a nested input and a limit given as INTEGER or an expression.
nested_limit : ^( LIMIT nested_op_input ( INTEGER | expr ) )
;
// CROSS node over a list of nested inputs.
nested_cross : ^( CROSS nested_op_input_list )
;
// FOREACH node: a nested input and a generate clause.
nested_foreach : ^( FOREACH nested_op_input generate_clause )
;
// The input of a nested operator: a column reference or a nested projection.
nested_op_input : col_ref | nested_proj
;
// One or more nested inputs.
nested_op_input_list : nested_op_input+
;
// STREAM node: a relation, then EXECCOMMAND or an IDENTIFIER, then an optional AS clause.
stream_clause : ^( STREAM rel ( EXECCOMMAND | IDENTIFIER ) as_clause? )
;
// MAPREDUCE node: a quoted string, an optional path list, a store clause, a load clause
// and an optional EXECCOMMAND.
mr_clause : ^( MAPREDUCE QUOTEDSTRING path_list? store_clause load_clause EXECCOMMAND? )
;
// SPLIT node: a relation, one or more branches and an optional otherwise branch.
split_clause : ^( SPLIT rel split_branch+ split_otherwise? )
;
// SPLIT_BRANCH node: a target alias and its condition. The alias is added to the known
// aliases after the whole node, including the condition, has been matched.
split_branch
: ^( SPLIT_BRANCH alias cond )
{
aliases.add( $alias.name );
}
;
// OTHERWISE node: a target alias and an optional ALL marker. The alias is added to the
// known aliases after the node has been matched.
split_otherwise : ^( OTHERWISE alias ALL? )
{
aliases.add( $alias.name );
}
;
// A column reference, either by name or by position.
col_ref : alias_col_ref | dollar_col_ref
;
// A column referenced by name; the GROUP and CUBE tokens are accepted in addition to
// IDENTIFIER.
alias_col_ref : GROUP | CUBE | IDENTIFIER
;
// A column referenced by position.
dollar_col_ref : DOLLARVAR
;
// A constant expression is a literal.
const_expr : literal
;
// A literal: a scalar, map, bag or tuple value.
literal : scalar | map | bag | tuple
;
// A scalar: a number, a quoted string, NULL, TRUE or FALSE.
scalar : num_scalar | QUOTEDSTRING | NULL | TRUE | FALSE
;
// A numeric literal with an optional leading MINUS.
num_scalar : MINUS? ( INTEGER | LONGINTEGER | FLOATNUMBER | DOUBLENUMBER | BIGINTEGERNUMBER | BIGDECIMALNUMBER )
;
// MAP_VAL node with zero or more key/value pairs.
map : ^( MAP_VAL keyvalue* )
;
// KEY_VAL_PAIR node: a map key and a constant expression as its value.
keyvalue : ^( KEY_VAL_PAIR map_key const_expr )
;
// A map key is a quoted string.
map_key : QUOTEDSTRING
;
// BAG_VAL node with zero or more tuples.
bag : ^( BAG_VAL tuple* )
;
// TUPLE_VAL node with zero or more literals.
tuple : ^( TUPLE_VAL literal* )
;
// extended identifier, handling the keyword and identifier conflicts. Ugly but there is no other choice.
// Accepts a plain IDENTIFIER, any of the keyword tokens listed below, or a string
// relational operator (rel_str_op). Used by func_name for each name segment.
eid : rel_str_op
| IMPORT
| RETURNS
| DEFINE
| LOAD
| FILTER
| FOREACH
| CUBE
| ROLLUP
| MATCHES
| ORDER
| RANK
| DISTINCT
| COGROUP
| JOIN
| CROSS
| UNION
| SPLIT
| INTO
| IF
| ALL
| AS
| BY
| USING
| INNER
| OUTER
| PARALLEL
| PARTITION
| GROUP
| AND
| OR
| NOT
| GENERATE
| FLATTEN
| EVAL
| ASC
| DESC
| BOOLEAN
| INT
| LONG
| FLOAT
| DOUBLE
| BIGINTEGER
| BIGDECIMAL
| DATETIME
| CHARARRAY
| BYTEARRAY
| BAG
| TUPLE
| MAP
| IS
| NULL
| TRUE
| FALSE
| STREAM
| THROUGH
| STORE
| MAPREDUCE
| SHIP
| CACHE
| INPUT
| OUTPUT
| STDERROR
| STDIN
| STDOUT
| LIMIT
| SAMPLE
| LEFT
| RIGHT
| FULL
| IDENTIFIER
| TOBAG
| TOMAP
| TOTUPLE
| ASSERT
;
// relational operator
// Any comparison operator usable in cond, plus STR_OP_MATCHES.
rel_op : rel_op_eq
| rel_op_ne
| rel_op_gt
| rel_op_gte
| rel_op_lt
| rel_op_lte
| STR_OP_MATCHES
;
// Each comparison below accepts both its STR_OP_ and its NUM_OP_ token.
rel_op_eq : STR_OP_EQ | NUM_OP_EQ
;
rel_op_ne : STR_OP_NE | NUM_OP_NE
;
rel_op_gt : STR_OP_GT | NUM_OP_GT
;
rel_op_gte : STR_OP_GTE | NUM_OP_GTE
;
rel_op_lt : STR_OP_LT | NUM_OP_LT
;
rel_op_lte : STR_OP_LTE | NUM_OP_LTE
;
// The STR_OP_ operator tokens only; referenced by eid so that they can be used as
// identifiers.
rel_str_op : STR_OP_EQ
| STR_OP_NE
| STR_OP_GT
| STR_OP_LT
| STR_OP_GTE
| STR_OP_LTE
| STR_OP_MATCHES
;