spark/v4.0/spark-extensions/src/main/antlr/org.apache.spark.sql.catalyst.parser.extensions/IcebergSqlExtensions.g4 - iceberg - Git at Google

 /*
  * Licensed to the Apache Software Foundation (ASF) under one
  * or more contributor license agreements.  See the NOTICE file
  * distributed with this work for additional information
  * regarding copyright ownership.  The ASF licenses this file
  * to you under the Apache License, Version 2.0 (the
  * "License"); you may not use this file except in compliance
  * with the License.  You may obtain a copy of the License at
  *
  *   http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing,
  * software distributed under the License is distributed on an
  * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
  * KIND, either express or implied.  See the License for the
  * specific language governing permissions and limitations
  * under the License.
  *
  * This file is an adaptation of Presto's and Spark's grammar files.
  */

 grammar IcebergSqlExtensions;

 @lexer::members {
   /**
    * Decides whether a decimal token (one that contains a dot) may end at the current position.
    * The token is accepted only when the character right after it is not an upper-case ASCII
    * letter, not a digit and not an underscore.
    *
    * Examples:
    *   "2.3"   - "2." is rejected because it is followed by the digit '3'.
    *   "2.3_"  - "2.3" is rejected because it is followed by '_'.
    *   "2.3W"  - "2.3" is rejected because it is followed by 'W'.
    *   "12.0D 34.E2+0.12 " - 12.0D is accepted because a space follows it, and 34.E2 is
    *   accepted because the symbol '+' follows it.
    *
    * @return true when the following character does not continue an identifier-like word
    */
   public boolean isValidDecimal() {
     final int lookahead = _input.LA(1);
     final boolean upperLetter = lookahead >= 'A' && lookahead <= 'Z';
     final boolean digit = lookahead >= '0' && lookahead <= '9';
     return !(upperLetter || digit || lookahead == '_');
   }

   /**
    * Consulted right after '/*' has been read while trying to match a bracketed comment.
    * When the very next character is '+', the text is meant to be parsed as a hint later,
    * so it must not be matched as a bracketed comment.
    *
    * @return true when the next character is '+'
    */
   public boolean isHint() {
     return _input.LA(1) == '+';
   }
 }

 // Entry rule: exactly one statement followed by end of input.
 singleStatement
     : statement EOF
     ;

 // Statements recognised by this grammar: a CALL with an optional, comma-separated
 // argument list, plus ALTER TABLE forms for partition fields, write distribution and
 // ordering, identifier fields, branches and tags. Every alternative is labelled (#name).
 statement
     : CALL multipartIdentifier '(' (callArgument (',' callArgument)*)? ')'                  #call
     | ALTER TABLE multipartIdentifier ADD PARTITION FIELD transform (AS name=identifier)?   #addPartitionField
     | ALTER TABLE multipartIdentifier DROP PARTITION FIELD transform                        #dropPartitionField
     | ALTER TABLE multipartIdentifier REPLACE PARTITION FIELD transform WITH transform (AS name=identifier)? #replacePartitionField
     | ALTER TABLE multipartIdentifier WRITE writeSpec                                       #setWriteDistributionAndOrdering
     | ALTER TABLE multipartIdentifier SET IDENTIFIER_KW FIELDS fieldList                    #setIdentifierFields
     | ALTER TABLE multipartIdentifier DROP IDENTIFIER_KW FIELDS fieldList                   #dropIdentifierFields
     | ALTER TABLE multipartIdentifier createReplaceBranchClause                             #createOrReplaceBranch
     | ALTER TABLE multipartIdentifier createReplaceTagClause                                #createOrReplaceTag
     | ALTER TABLE multipartIdentifier DROP BRANCH (IF EXISTS)? identifier                   #dropBranch
     | ALTER TABLE multipartIdentifier DROP TAG (IF EXISTS)? identifier                      #dropTag
     ;

 // Tag clause: either [CREATE OR] REPLACE TAG, or CREATE TAG with an optional IF NOT EXISTS.
 // Both forms end with the tag identifier followed by tagOptions.
 createReplaceTagClause
     : (CREATE OR)? REPLACE TAG identifier tagOptions
     | CREATE TAG (IF NOT EXISTS)? identifier tagOptions
     ;

 // Branch clause with the same two shapes as the tag clause, ending in branchOptions.
 createReplaceBranchClause
     : (CREATE OR)? REPLACE BRANCH identifier branchOptions
     | CREATE BRANCH (IF NOT EXISTS)? identifier branchOptions
     ;

 // Optional "AS OF VERSION <snapshotId>" followed by an optional RETAIN clause.
 // Both parts may be absent, so this rule can match nothing.
 tagOptions
     : (AS OF VERSION snapshotId)? (refRetain)?
     ;

 // Same as tagOptions, with an additional optional snapshot retention clause at the end.
 branchOptions
     : (AS OF VERSION snapshotId)? (refRetain)? (snapshotRetention)?
     ;

 // WITH SNAPSHOT RETENTION followed by a snapshot count, a time span, or a snapshot count
 // and then a time span (in that order).
 snapshotRetention
     : WITH SNAPSHOT RETENTION minSnapshotsToKeep
     | WITH SNAPSHOT RETENTION maxSnapshotAge
     | WITH SNAPSHOT RETENTION minSnapshotsToKeep maxSnapshotAge
     ;

 // RETAIN <number> <timeUnit>
 refRetain
     : RETAIN number timeUnit
     ;

 // <number> <timeUnit>
 maxSnapshotAge
     : number timeUnit
     ;

 // <number> SNAPSHOTS
 minSnapshotsToKeep
     : number SNAPSHOTS
     ;

 // Zero or more distribution / ordering clauses in any order; the list may be empty.
 writeSpec
     : (writeDistributionSpec | writeOrderingSpec)*
     ;

 // The only distribution clause accepted here is DISTRIBUTED BY PARTITION.
 writeDistributionSpec
     : DISTRIBUTED BY PARTITION
     ;

 // Either [LOCALLY] ORDERED BY <order>, or the single keyword UNORDERED.
 writeOrderingSpec
     : LOCALLY? ORDERED BY order
     | UNORDERED
     ;

 // A CALL argument: a bare expression (positional) or "name => expression" (named).
 callArgument
     : expression                    #positionalArgument
     | identifier '=>' expression    #namedArgument
     ;

 // Entry rule: exactly one order specification followed by end of input.
 singleOrder
     : order EOF
     ;

 // One or more comma-separated order fields, optionally wrapped in parentheses.
 // Every field is collected in the "fields" list.
 order
     : fields+=orderField (',' fields+=orderField)*
     | '(' fields+=orderField (',' fields+=orderField)* ')'
     ;

 // A transform with an optional direction (ASC or DESC) and an optional
 // NULLS FIRST / NULLS LAST placement.
 orderField
     : transform direction=(ASC | DESC)? (NULLS nullOrder=(FIRST | LAST))?
     ;

 // Either a plain (possibly dotted) identifier, or a call-like form "name(arg, ...)"
 // that requires at least one argument.
 transform
     : multipartIdentifier                                                       #identityTransform
     | transformName=identifier
       '(' arguments+=transformArgument (',' arguments+=transformArgument)* ')'  #applyTransform
     ;

 // A transform argument is an identifier path or a constant.
 transformArgument
     : multipartIdentifier
     | constant
     ;

 // An expression is a constant, a MAP(...) literal or an ARRAY(...) literal.
 expression
     : constant
     | stringMap
     | stringArray
     ;

 // Literal values: a number, a boolean, one or more adjacent STRING tokens, or an
 // identifier immediately followed by a single STRING (labelled typeConstructor).
 constant
     : number                          #numericLiteral
     | booleanValue                    #booleanLiteral
     | STRING+                         #stringLiteral
     | identifier STRING               #typeConstructor
     ;

 // MAP( constant, constant, ... ) with at least one constant; the grammar itself does
 // not require the constants to come in pairs.
 stringMap
     : MAP '(' constant (',' constant)* ')'
     ;

 // ARRAY( constant, constant, ... ) with at least one constant.
 stringArray
     : ARRAY '(' constant (',' constant)* ')'
     ;

 booleanValue
     : TRUE | FALSE
     ;

 // A numeric literal: an optional leading MINUS followed by exactly one numeric token.
 // Each token kind has its own label. PLUS is not accepted as a sign here.
 number
     : MINUS? EXPONENT_VALUE           #exponentLiteral
     | MINUS? DECIMAL_VALUE            #decimalLiteral
     | MINUS? INTEGER_VALUE            #integerLiteral
     | MINUS? BIGINT_LITERAL           #bigIntLiteral
     | MINUS? SMALLINT_LITERAL         #smallIntLiteral
     | MINUS? TINYINT_LITERAL          #tinyIntLiteral
     | MINUS? DOUBLE_LITERAL           #doubleLiteral
     | MINUS? FLOAT_LITERAL            #floatLiteral
     | MINUS? BIGDECIMAL_LITERAL       #bigDecimalLiteral
     ;

 // One or more identifiers separated by dots; each part is collected in "parts".
 multipartIdentifier
     : parts+=identifier ('.' parts+=identifier)*
     ;

 // A single identifier: an IDENTIFIER token, a backquoted identifier, or a keyword listed
 // in nonReserved. The first and third alternatives share the #unquotedIdentifier label.
 identifier
     : IDENTIFIER              #unquotedIdentifier
     | quotedIdentifier        #quotedIdentifierAlternative
     | nonReserved             #unquotedIdentifier
     ;

 quotedIdentifier
     : BACKQUOTED_IDENTIFIER
     ;

 // One or more comma-separated identifier paths, collected in "fields".
 fieldList
     : fields+=multipartIdentifier (',' fields+=multipartIdentifier)*
     ;

 // Keywords that may also be used as unquoted identifiers (through the nonReserved
 // alternative of the identifier rule). The keyword tokens are declared before IDENTIFIER,
 // so a keyword's text always lexes as its keyword token; any keyword missing from this
 // list therefore cannot be used as an unquoted identifier. ARRAY and RETENTION are listed
 // so that they are not reserved unintentionally.
 nonReserved
     : ADD | ALTER | AS | ASC | BRANCH | BY | CALL | CREATE | DAYS | DESC | DROP | EXISTS | FIELD | FIRST | HOURS | IF | LAST | NOT | NULLS | OF | OR | ORDERED | PARTITION | TABLE | WRITE
     | DISTRIBUTED | LOCALLY | MINUTES | MONTHS | UNORDERED | REPLACE | RETAIN | VERSION | WITH | IDENTIFIER_KW | FIELDS | SET | SNAPSHOT | SNAPSHOTS
     | TAG | TRUE | FALSE
     | MAP | ARRAY | RETENTION
     ;

 // A snapshot id is written as a number literal.
 snapshotId
     : number
     ;

 // NOTE(review): no other rule in this grammar file appears to reference numSnapshots;
 // confirm whether external code depends on it before removing.
 numSnapshots
     : number
     ;

 // Time units accepted after a number. MONTHS is declared as a token but is not
 // accepted here.
 timeUnit
     : DAYS
     | HOURS
     | MINUTES
     ;

 // Keyword tokens. They are declared before IDENTIFIER, so when a keyword and IDENTIFIER
 // match the same text the keyword token is chosen.
 // NOTE(review): the literals are upper case only; presumably the character stream is
 // upper-cased before lexing - confirm against the code that drives this lexer.
 ADD: 'ADD';
 ALTER: 'ALTER';
 AS: 'AS';
 ASC: 'ASC';
 BRANCH: 'BRANCH';
 BY: 'BY';
 CALL: 'CALL';
 DAYS: 'DAYS';
 DESC: 'DESC';
 DISTRIBUTED: 'DISTRIBUTED';
 DROP: 'DROP';
 EXISTS: 'EXISTS';
 FIELD: 'FIELD';
 FIELDS: 'FIELDS';
 FIRST: 'FIRST';
 HOURS: 'HOURS';
 IF : 'IF';
 LAST: 'LAST';
 LOCALLY: 'LOCALLY';
 MINUTES: 'MINUTES';
 MONTHS: 'MONTHS';
 CREATE: 'CREATE';
 NOT: 'NOT';
 NULLS: 'NULLS';
 OF: 'OF';
 OR: 'OR';
 ORDERED: 'ORDERED';
 PARTITION: 'PARTITION';
 REPLACE: 'REPLACE';
 RETAIN: 'RETAIN';
 RETENTION: 'RETENTION';
 // Named IDENTIFIER_KW because IDENTIFIER is the name of the generic identifier token.
 IDENTIFIER_KW: 'IDENTIFIER';
 SET: 'SET';
 SNAPSHOT: 'SNAPSHOT';
 SNAPSHOTS: 'SNAPSHOTS';
 TABLE: 'TABLE';
 TAG: 'TAG';
 UNORDERED: 'UNORDERED';
 VERSION: 'VERSION';
 WITH: 'WITH';
 WRITE: 'WRITE';

 // Boolean literal keywords.
 TRUE: 'TRUE';
 FALSE: 'FALSE';

 // Keywords that introduce the MAP(...) and ARRAY(...) literals.
 MAP: 'MAP';
 ARRAY: 'ARRAY';

 // Sign tokens; only MINUS is referenced by the number rule.
 PLUS: '+';
 MINUS: '-';

 // A string literal delimited by single or double quotes. A backslash escapes whatever
 // single character follows it, including the delimiter and another backslash.
 STRING
     : '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
     | '"' ( ~('"'|'\\') | ('\\' .) )* '"'
     ;

 // Integer digits with an 'L' suffix.
 BIGINT_LITERAL
     : DIGIT+ 'L'
     ;

 // Integer digits with an 'S' suffix.
 SMALLINT_LITERAL
     : DIGIT+ 'S'
     ;

 // Integer digits with a 'Y' suffix.
 TINYINT_LITERAL
     : DIGIT+ 'Y'
     ;

 // Plain integer digits with no suffix.
 INTEGER_VALUE
     : DIGIT+
     ;

 // A number with an exponent part. The dotted form is accepted only when
 // isValidDecimal() approves the character that follows the token.
 EXPONENT_VALUE
     : DIGIT+ EXPONENT
     | DECIMAL_DIGITS EXPONENT {isValidDecimal()}?
     ;

 // A dotted number without exponent or suffix, guarded by isValidDecimal().
 DECIMAL_VALUE
     : DECIMAL_DIGITS {isValidDecimal()}?
     ;

 // A number (optionally with exponent) carrying an 'F' suffix.
 FLOAT_LITERAL
     : DIGIT+ EXPONENT? 'F'
     | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}?
     ;

 // A number (optionally with exponent) carrying a 'D' suffix.
 DOUBLE_LITERAL
     : DIGIT+ EXPONENT? 'D'
     | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
     ;

 // A number (optionally with exponent) carrying a 'BD' suffix.
 BIGDECIMAL_LITERAL
     : DIGIT+ EXPONENT? 'BD'
     | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
     ;

 // One or more upper-case letters, digits or underscores. The keyword and numeric
 // token rules are declared earlier in the file, so they are chosen when they match
 // the same text.
 IDENTIFIER
     : (LETTER | DIGIT | '_')+
     ;

 // A backquote-delimited identifier; a doubled backquote stands for a literal backquote.
 BACKQUOTED_IDENTIFIER
     : '`' ( ~'`' | '``' )* '`'
     ;

 // Digits containing a dot: "1.", "1.5" or ".5" (at least one digit on one side).
 fragment DECIMAL_DIGITS
     : DIGIT+ '.' DIGIT*
     | '.' DIGIT+
     ;

 // Exponent part: 'E', an optional sign, and at least one digit.
 fragment EXPONENT
     : 'E' [+-]? DIGIT+
     ;

 fragment DIGIT
     : [0-9]
     ;

 // Upper-case ASCII letters only.
 fragment LETTER
     : [A-Z]
     ;

 // A line comment starting with two dashes and running to the end of the line. A
 // backslash directly followed by a newline continues the comment on the next line.
 // The token is sent to the hidden channel.
 SIMPLE_COMMENT
     : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN)
     ;

 // A block comment, possibly nested, sent to the hidden channel. The isHint() predicate
 // stops the match when a '+' directly follows the opening delimiter.
 BRACKETED_COMMENT
     : '/*' {!isHint()}? (BRACKETED_COMMENT|.)*? '*/' -> channel(HIDDEN)
     ;

 // Runs of spaces, carriage returns, newlines and tabs go to the hidden channel.
 WS
     : [ \r\n\t]+ -> channel(HIDDEN)
     ;

 // Catch-all for anything we can't recognize.
 // We use this to be able to ignore and recover all the text
 // when splitting statements with DelimiterLexer
 UNRECOGNIZED
     : .
     ;
	/*
	* Licensed to the Apache Software Foundation (ASF) under one
	* or more contributor license agreements. See the NOTICE file
	* distributed with this work for additional information
	* regarding copyright ownership. The ASF licenses this file
	* to you under the Apache License, Version 2.0 (the
	* "License"); you may not use this file except in compliance
	* with the License. You may obtain a copy of the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing,
	* software distributed under the License is distributed on an
	* "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
	* KIND, either express or implied. See the License for the
	* specific language governing permissions and limitations
	* under the License.
	*
	* This file is an adaptation of Presto's and Spark's grammar files.
	*/

	grammar IcebergSqlExtensions;

	@lexer::members {
	  /**
	   * Verify whether current token is a valid decimal token (which contains dot).
	   * Returns true if the character that follows the token is not a digit, an upper-case
	   * letter or an underscore.
	   *
	   * For example:
	   * For char stream "2.3", "2." is not a valid decimal token, because it is followed by digit '3'.
	   * For char stream "2.3_", "2.3" is not a valid decimal token, because it is followed by '_'.
	   * For char stream "2.3W", "2.3" is not a valid decimal token, because it is followed by 'W'.
	   * For char stream "12.0D 34.E2+0.12 " 12.0D is a valid decimal token because it is followed
	   * by a space. 34.E2 is a valid decimal token because it is followed by symbol '+'
	   * which is not a digit or letter or underscore.
	   */
	  public boolean isValidDecimal() {
	    int nextChar = _input.LA(1);
	    // The logical-or operators had been escaped as "\|\|", which is not valid Java.
	    if (nextChar >= 'A' && nextChar <= 'Z' || nextChar >= '0' && nextChar <= '9' ||
	      nextChar == '_') {
	      return false;
	    } else {
	      return true;
	    }
	  }

	  /**
	   * This method will be called when we see '/*' and try to match it as a bracketed comment.
	   * If the next character is '+', it should be parsed as hint later, and we cannot match
	   * it as a bracketed comment.
	   *
	   * Returns true if the next character is '+'.
	   */
	  public boolean isHint() {
	    int nextChar = _input.LA(1);
	    if (nextChar == '+') {
	      return true;
	    } else {
	      return false;
	    }
	  }
	}

	// Entry rule: exactly one statement followed by end of input.
	singleStatement
	    : statement EOF
	    ;

	// Statements recognised by this grammar: a CALL with an optional, comma-separated
	// argument list, plus ALTER TABLE forms for partition fields, write distribution and
	// ordering, identifier fields, branches and tags. Every alternative is labelled (#name).
	statement
	    : CALL multipartIdentifier '(' (callArgument (',' callArgument)*)? ')'                  #call
	    | ALTER TABLE multipartIdentifier ADD PARTITION FIELD transform (AS name=identifier)?   #addPartitionField
	    | ALTER TABLE multipartIdentifier DROP PARTITION FIELD transform                        #dropPartitionField
	    | ALTER TABLE multipartIdentifier REPLACE PARTITION FIELD transform WITH transform (AS name=identifier)? #replacePartitionField
	    | ALTER TABLE multipartIdentifier WRITE writeSpec                                       #setWriteDistributionAndOrdering
	    | ALTER TABLE multipartIdentifier SET IDENTIFIER_KW FIELDS fieldList                    #setIdentifierFields
	    | ALTER TABLE multipartIdentifier DROP IDENTIFIER_KW FIELDS fieldList                   #dropIdentifierFields
	    | ALTER TABLE multipartIdentifier createReplaceBranchClause                             #createOrReplaceBranch
	    | ALTER TABLE multipartIdentifier createReplaceTagClause                                #createOrReplaceTag
	    | ALTER TABLE multipartIdentifier DROP BRANCH (IF EXISTS)? identifier                   #dropBranch
	    | ALTER TABLE multipartIdentifier DROP TAG (IF EXISTS)? identifier                      #dropTag
	    ;

	// Tag clause: either [CREATE OR] REPLACE TAG, or CREATE TAG with an optional IF NOT EXISTS.
	// Both forms end with the tag identifier followed by tagOptions.
	createReplaceTagClause
	    : (CREATE OR)? REPLACE TAG identifier tagOptions
	    | CREATE TAG (IF NOT EXISTS)? identifier tagOptions
	    ;

	// Branch clause with the same two shapes as the tag clause, ending in branchOptions.
	createReplaceBranchClause
	    : (CREATE OR)? REPLACE BRANCH identifier branchOptions
	    | CREATE BRANCH (IF NOT EXISTS)? identifier branchOptions
	    ;

	// Optional "AS OF VERSION <snapshotId>" followed by an optional RETAIN clause.
	tagOptions
	    : (AS OF VERSION snapshotId)? (refRetain)?
	    ;

	// Same as tagOptions, with an additional optional snapshot retention clause at the end.
	branchOptions
	    : (AS OF VERSION snapshotId)? (refRetain)? (snapshotRetention)?
	    ;

	// WITH SNAPSHOT RETENTION followed by a snapshot count, a time span, or a snapshot count
	// and then a time span (in that order).
	snapshotRetention
	    : WITH SNAPSHOT RETENTION minSnapshotsToKeep
	    | WITH SNAPSHOT RETENTION maxSnapshotAge
	    | WITH SNAPSHOT RETENTION minSnapshotsToKeep maxSnapshotAge
	    ;

	// RETAIN <number> <timeUnit>
	refRetain
	    : RETAIN number timeUnit
	    ;

	// <number> <timeUnit>
	maxSnapshotAge
	    : number timeUnit
	    ;

	// <number> SNAPSHOTS
	minSnapshotsToKeep
	    : number SNAPSHOTS
	    ;

	// Zero or more distribution / ordering clauses in any order; the list may be empty.
	writeSpec
	    : (writeDistributionSpec | writeOrderingSpec)*
	    ;

	// The only distribution clause accepted here is DISTRIBUTED BY PARTITION.
	writeDistributionSpec
	    : DISTRIBUTED BY PARTITION
	    ;

	// Either [LOCALLY] ORDERED BY <order>, or the single keyword UNORDERED.
	writeOrderingSpec
	    : LOCALLY? ORDERED BY order
	    | UNORDERED
	    ;

	// A CALL argument: a bare expression (positional) or "name => expression" (named).
	callArgument
	    : expression                    #positionalArgument
	    | identifier '=>' expression    #namedArgument
	    ;

	// Entry rule: exactly one order specification followed by end of input.
	singleOrder
	    : order EOF
	    ;

	// One or more comma-separated order fields, optionally wrapped in parentheses.
	// Every field is collected in the "fields" list.
	order
	    : fields+=orderField (',' fields+=orderField)*
	    | '(' fields+=orderField (',' fields+=orderField)* ')'
	    ;

	// A transform with an optional direction (ASC or DESC) and an optional
	// NULLS FIRST / NULLS LAST placement.
	orderField
	    : transform direction=(ASC | DESC)? (NULLS nullOrder=(FIRST | LAST))?
	    ;

	// Either a plain (possibly dotted) identifier, or a call-like form "name(arg, ...)"
	// that requires at least one argument.
	transform
	    : multipartIdentifier                                                       #identityTransform
	    | transformName=identifier
	      '(' arguments+=transformArgument (',' arguments+=transformArgument)* ')'  #applyTransform
	    ;

	// A transform argument is an identifier path or a constant.
	transformArgument
	    : multipartIdentifier
	    | constant
	    ;

	// An expression is a constant, a MAP(...) literal or an ARRAY(...) literal.
	expression
	    : constant
	    | stringMap
	    | stringArray
	    ;

	// Literal values: a number, a boolean, one or more adjacent STRING tokens, or an
	// identifier immediately followed by a single STRING (labelled typeConstructor).
	constant
	    : number                          #numericLiteral
	    | booleanValue                    #booleanLiteral
	    | STRING+                         #stringLiteral
	    | identifier STRING               #typeConstructor
	    ;

	// MAP( constant, constant, ... ) with at least one constant.
	stringMap
	    : MAP '(' constant (',' constant)* ')'
	    ;

	// ARRAY( constant, constant, ... ) with at least one constant.
	stringArray
	    : ARRAY '(' constant (',' constant)* ')'
	    ;

	booleanValue
	    : TRUE | FALSE
	    ;

	// A numeric literal: an optional leading MINUS followed by exactly one numeric token.
	number
	    : MINUS? EXPONENT_VALUE           #exponentLiteral
	    | MINUS? DECIMAL_VALUE            #decimalLiteral
	    | MINUS? INTEGER_VALUE            #integerLiteral
	    | MINUS? BIGINT_LITERAL           #bigIntLiteral
	    | MINUS? SMALLINT_LITERAL         #smallIntLiteral
	    | MINUS? TINYINT_LITERAL          #tinyIntLiteral
	    | MINUS? DOUBLE_LITERAL           #doubleLiteral
	    | MINUS? FLOAT_LITERAL            #floatLiteral
	    | MINUS? BIGDECIMAL_LITERAL       #bigDecimalLiteral
	    ;

	// One or more identifiers separated by dots; each part is collected in "parts".
	multipartIdentifier
	    : parts+=identifier ('.' parts+=identifier)*
	    ;

	// A single identifier: an IDENTIFIER token, a backquoted identifier, or a keyword listed
	// in nonReserved. The first and third alternatives share the #unquotedIdentifier label.
	identifier
	    : IDENTIFIER              #unquotedIdentifier
	    | quotedIdentifier        #quotedIdentifierAlternative
	    | nonReserved             #unquotedIdentifier
	    ;

	quotedIdentifier
	    : BACKQUOTED_IDENTIFIER
	    ;

	// One or more comma-separated identifier paths, collected in "fields".
	fieldList
	    : fields+=multipartIdentifier (',' fields+=multipartIdentifier)*
	    ;

	// Keywords that may also be used as unquoted identifiers. The keyword tokens are declared
	// before IDENTIFIER, so a keyword missing from this list cannot be used as an unquoted
	// identifier; ARRAY and RETENTION are listed so they are not reserved unintentionally.
	nonReserved
	    : ADD | ALTER | AS | ASC | BRANCH | BY | CALL | CREATE | DAYS | DESC | DROP | EXISTS | FIELD | FIRST | HOURS | IF | LAST | NOT | NULLS | OF | OR | ORDERED | PARTITION | TABLE | WRITE
	    | DISTRIBUTED | LOCALLY | MINUTES | MONTHS | UNORDERED | REPLACE | RETAIN | VERSION | WITH | IDENTIFIER_KW | FIELDS | SET | SNAPSHOT | SNAPSHOTS
	    | TAG | TRUE | FALSE
	    | MAP | ARRAY | RETENTION
	    ;

	// A snapshot id is written as a number literal.
	snapshotId
	    : number
	    ;

	// NOTE(review): no other rule in this grammar file appears to reference numSnapshots.
	numSnapshots
	    : number
	    ;

	// Time units accepted after a number. MONTHS is declared as a token but is not
	// accepted here.
	timeUnit
	    : DAYS
	    | HOURS
	    | MINUTES
	    ;

	// Keyword tokens. They are declared before IDENTIFIER, so when a keyword and IDENTIFIER
	// match the same text the keyword token is chosen.
	// NOTE(review): the literals are upper case only; presumably the character stream is
	// upper-cased before lexing - confirm against the code that drives this lexer.
	ADD: 'ADD';
	ALTER: 'ALTER';
	AS: 'AS';
	ASC: 'ASC';
	BRANCH: 'BRANCH';
	BY: 'BY';
	CALL: 'CALL';
	DAYS: 'DAYS';
	DESC: 'DESC';
	DISTRIBUTED: 'DISTRIBUTED';
	DROP: 'DROP';
	EXISTS: 'EXISTS';
	FIELD: 'FIELD';
	FIELDS: 'FIELDS';
	FIRST: 'FIRST';
	HOURS: 'HOURS';
	IF : 'IF';
	LAST: 'LAST';
	LOCALLY: 'LOCALLY';
	MINUTES: 'MINUTES';
	MONTHS: 'MONTHS';
	CREATE: 'CREATE';
	NOT: 'NOT';
	NULLS: 'NULLS';
	OF: 'OF';
	OR: 'OR';
	ORDERED: 'ORDERED';
	PARTITION: 'PARTITION';
	REPLACE: 'REPLACE';
	RETAIN: 'RETAIN';
	RETENTION: 'RETENTION';
	// Named IDENTIFIER_KW because IDENTIFIER is the name of the generic identifier token.
	IDENTIFIER_KW: 'IDENTIFIER';
	SET: 'SET';
	SNAPSHOT: 'SNAPSHOT';
	SNAPSHOTS: 'SNAPSHOTS';
	TABLE: 'TABLE';
	TAG: 'TAG';
	UNORDERED: 'UNORDERED';
	VERSION: 'VERSION';
	WITH: 'WITH';
	WRITE: 'WRITE';

	// Boolean literal keywords.
	TRUE: 'TRUE';
	FALSE: 'FALSE';

	// Keywords that introduce the MAP(...) and ARRAY(...) literals.
	MAP: 'MAP';
	ARRAY: 'ARRAY';

	// Sign tokens; only MINUS is referenced by the number rule.
	PLUS: '+';
	MINUS: '-';

	// A string literal delimited by single or double quotes. A backslash escapes whatever
	// single character follows it, including the delimiter and another backslash.
	STRING
	    : '\'' ( ~('\''|'\\') | ('\\' .) )* '\''
	    | '"' ( ~('"'|'\\') | ('\\' .) )* '"'
	    ;

	// Integer digits with an 'L' suffix.
	BIGINT_LITERAL
	    : DIGIT+ 'L'
	    ;

	// Integer digits with an 'S' suffix.
	SMALLINT_LITERAL
	    : DIGIT+ 'S'
	    ;

	// Integer digits with a 'Y' suffix.
	TINYINT_LITERAL
	    : DIGIT+ 'Y'
	    ;

	// Plain integer digits with no suffix.
	INTEGER_VALUE
	    : DIGIT+
	    ;

	// A number with an exponent part. The dotted form is accepted only when
	// isValidDecimal() approves the character that follows the token.
	EXPONENT_VALUE
	    : DIGIT+ EXPONENT
	    | DECIMAL_DIGITS EXPONENT {isValidDecimal()}?
	    ;

	// A dotted number without exponent or suffix, guarded by isValidDecimal().
	DECIMAL_VALUE
	    : DECIMAL_DIGITS {isValidDecimal()}?
	    ;

	// A number (optionally with exponent) carrying an 'F' suffix.
	FLOAT_LITERAL
	    : DIGIT+ EXPONENT? 'F'
	    | DECIMAL_DIGITS EXPONENT? 'F' {isValidDecimal()}?
	    ;

	// A number (optionally with exponent) carrying a 'D' suffix.
	DOUBLE_LITERAL
	    : DIGIT+ EXPONENT? 'D'
	    | DECIMAL_DIGITS EXPONENT? 'D' {isValidDecimal()}?
	    ;

	// A number (optionally with exponent) carrying a 'BD' suffix.
	BIGDECIMAL_LITERAL
	    : DIGIT+ EXPONENT? 'BD'
	    | DECIMAL_DIGITS EXPONENT? 'BD' {isValidDecimal()}?
	    ;

	// One or more upper-case letters, digits or underscores. The keyword and numeric
	// token rules are declared earlier, so they are chosen when they match the same text.
	IDENTIFIER
	    : (LETTER | DIGIT | '_')+
	    ;

	// A backquote-delimited identifier; a doubled backquote stands for a literal backquote.
	BACKQUOTED_IDENTIFIER
	    : '`' ( ~'`' | '``' )* '`'
	    ;

	// Digits containing a dot: "1.", "1.5" or ".5" (at least one digit on one side).
	fragment DECIMAL_DIGITS
	    : DIGIT+ '.' DIGIT*
	    | '.' DIGIT+
	    ;

	// Exponent part: 'E', an optional sign, and at least one digit.
	fragment EXPONENT
	    : 'E' [+-]? DIGIT+
	    ;

	fragment DIGIT
	    : [0-9]
	    ;

	// Upper-case ASCII letters only.
	fragment LETTER
	    : [A-Z]
	    ;

	// A line comment starting with two dashes and running to the end of the line. A
	// backslash directly followed by a newline continues the comment on the next line.
	// The token is sent to the hidden channel.
	SIMPLE_COMMENT
	    : '--' ('\\\n' | ~[\r\n])* '\r'? '\n'? -> channel(HIDDEN)
	    ;

	// A block comment, possibly nested, sent to the hidden channel. The isHint() predicate
	// stops the match when a '+' directly follows the opening delimiter. The body is matched
	// non-greedily so the comment ends at the first closing delimiter.
	BRACKETED_COMMENT
	    : '/*' {!isHint()}? (BRACKETED_COMMENT|.)*? '*/' -> channel(HIDDEN)
	    ;

	// Runs of spaces, carriage returns, newlines and tabs go to the hidden channel.
	WS
	    : [ \r\n\t]+ -> channel(HIDDEN)
	    ;

	// Catch-all for anything we can't recognize.
	// We use this to be able to ignore and recover all the text
	// when splitting statements with DelimiterLexer
	UNRECOGNIZED
	    : .
	    ;