* Licensed to the Apache Software Foundation (ASF) under one
* or more contributor license agreements. See the NOTICE file
* distributed with this work for additional information
* regarding copyright ownership. The ASF licenses this file
* to you under the Apache License, Version 2.0 (the
* "License"); you may not use this file except in compliance
* with the License. You may obtain a copy of the License at
* Unless required by applicable law or agreed to in writing,
* software distributed under the License is distributed on an
* KIND, either express or implied. See the License for the
* specific language governing permissions and limitations
* under the License.
grammar Pydml;
// This antlr grammar is based on Python 3.3 language reference:
tokens { INDENT, DEDENT }
@lexer::members {
// Guards the indentation-debugging println calls in the NEWLINE lexer action.
private boolean debugIndentRules = false;
// Indentation level stack: the top entry is compared against the indentation
// count of each new line to decide between INDENT, DEDENT or neither.
private java.util.Stack<Integer> indents = new java.util.Stack<Integer>();
// Extra tokens queue (see the NEWLINE rule). nextToken() hands out queued
// tokens before the token it has just lexed.
private java.util.Queue<Token> tokens = new java.util.LinkedList<Token>();
// Number of currently opened brackets and parentheses; maintained by the
// OPEN_BRACK/CLOSE_BRACK and OPEN_PAREN/CLOSE_PAREN lexer rules.
private int opened = 0;
// Last token seen on the default channel.
// This is only used to set the line number for dedent.
private Token lastToken = null;
// Hands a token out of the lexer; called from the NEWLINE action and from
// nextToken() when synthetic NEWLINE/EOF tokens are injected.
// NOTE(review): nextToken() polls the `tokens` queue, so this method is
// presumably expected to enqueue `t` there as well — confirm.
public void emit(Token t) {
// Trace every emitted token on stdout.
System.out.println("Emitted token:" + t);
// Returns the next token for the parser.
// When the end of input is reached while indentation levels are still open,
// an artificial NEWLINE, the outstanding DEDENT tokens and a fresh EOF are
// emitted first. Tokens waiting in the `tokens` queue are returned before the
// token that was just lexed.
public Token nextToken() {
// End of input reached with unclosed indentation levels.
if (_input.LA(1) == EOF && !this.indents.isEmpty()) {
System.out.println("EOF reached and expecting some DEDENTS, so emitting them");
// Artificial line break that terminates the last statement.
this.emit(commonToken(PydmlParser.NEWLINE, "\n"));
// Now emit as many DEDENT tokens as needed.
// NOTE(review): each pass has to pop `indents` for this loop to end — confirm.
while (!indents.isEmpty()) {
System.out.println("Emitting (inserted) DEDENTS");
// Put the EOF back on the token stream.
this.emit(commonToken(PydmlParser.EOF, "<EOF>"));
// Let the generated lexer produce the next regular token.
Token next = super.nextToken();
if (next.getChannel() == Token.DEFAULT_CHANNEL) {
// Keep track of the last token on the default channel.
this.lastToken = next;
// Queued extra tokens take precedence over the freshly lexed one.
Token retVal = tokens.isEmpty() ? next : tokens.poll();
System.out.println("Returning nextToken: [" + retVal + "]<<" + tokens.isEmpty());
return retVal;
// Creates a DEDENT token with empty text.
// NOTE(review): `lastToken` is documented as the source of the DEDENT line
// number; confirm that the line is set on the token before it is returned.
private Token createDedent() {
CommonToken dedent = commonToken(PydmlParser.DEDENT, "");
return dedent;
// Builds a CommonToken from a token type and its text only.
private CommonToken commonToken(int type, String text) {
// Main change: the position-based construction below (start/stop indices taken
// from the character stream) emitted three additional characters, so it is
// commented out for now.
// int start = this.getCharIndex();
// int stop = start + text.length();
// return new CommonToken(this._tokenFactorySourcePair, type, DEFAULT_TOKEN_CHANNEL, start, stop);
return new CommonToken(type, text); // Main change
// Calculates the indentation level from the spaces:
// "Tabs are replaced (from left to right) by one to eight spaces
// such that the total number of characters up to and including
// the replacement is a multiple of eight [...]"
static int getIndentationCount(String spaces) {
int count = 0;
for (char ch : spaces.toCharArray()) {
switch (ch) {
case '\t':
count += 8 - (count % 8);
// A normal space char.
return count;
// 2. Modify this g4 by comparing it with Java:
// -
// -
// -
// -
// Entry rule: a DML program is a list of statements and function definitions.
// For now, we only allow global function definitions (not nested or inside a while block).
// `blocks` collects the top-level statements and `functionBlocks` the function
// definitions; any number of NEWLINE tokens may precede EOF.
programroot: (blocks+=statement | functionBlocks+=functionStatement)* NEWLINE* EOF;
// A single statement. Every alternative carries a label (# Name), and the rule
// returns a StatementInfo that is created up front in @init.
// Block bodies are delimited by the INDENT/DEDENT tokens declared in `tokens { ... }`.
statement returns [ org.apache.sysml.parser.common.StatementInfo info ]
@init {
// This action occurs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.StatementInfo();
} :
// ------------------------------------------
// ImportStatement: source("file") as namespace
'source' OPEN_PAREN filePath = STRING CLOSE_PAREN 'as' namespace=ID NEWLINE # ImportStatement
// PathStatement: setwd("path")
| 'setwd' OPEN_PAREN pathValue = STRING CLOSE_PAREN NEWLINE # PathStatement
// ------------------------------------------
// AssignmentStatement
// target = ifdef(commandLineParam, defaultExpression)
| targetList=dataIdentifier '=' 'ifdef' OPEN_PAREN commandLineParam=dataIdentifier ',' source=expression CLOSE_PAREN NEWLINE # IfdefAssignmentStatement
// ------------------------------------------
// Treat function call as AssignmentStatement or MultiAssignmentStatement
// For backward compatibility and also since the behavior of foo() * A + foo() ... where foo returns A
// Convert FunctionCallIdentifier(paramExprs, ..) -> source
| // TODO: Throw an informative error if the user does not provide the optional assignment
( targetList=dataIdentifier '=' )? name=ID OPEN_PAREN (paramExprs+=parameterizedExpression (',' paramExprs+=parameterizedExpression)* )? CLOSE_PAREN NEWLINE # FunctionCallAssignmentStatement
// [a, b, ...] = f(...)
| OPEN_BRACK targetList+=dataIdentifier (',' targetList+=dataIdentifier)* CLOSE_BRACK '=' name=ID OPEN_PAREN (paramExprs+=parameterizedExpression (',' paramExprs+=parameterizedExpression)* )? CLOSE_PAREN NEWLINE # FunctionCallMultiAssignmentStatement
// {notifyErrorListeners("Too many parentheses");}
// We don't support block statement
// | '{' body+=expression ';'* ( body+=expression ';'* )* '}' # BlockStatement
// ------------------------------------------
// Plain assignment. It is listed after the function-call alternatives, so an
// input that matches both is presumably resolved to the earlier alternative.
| targetList=dataIdentifier '=' source=expression NEWLINE # AssignmentStatement
// IfStatement: the predicate may be parenthesized or not; zero or more elif
// branches and an optional else block follow the if body.
// | 'if' OPEN_PAREN predicate=expression CLOSE_PAREN (ifBody+=statement ';'* | NEWLINE INDENT (ifBody+=statement)+ DEDENT ) ('else' (elseBody+=statement ';'* | '{' (elseBody+=statement ';'*)* '}'))? # IfStatement
| 'if' (OPEN_PAREN predicate=expression CLOSE_PAREN | predicate=expression) ':' NEWLINE INDENT (ifBody+=statement)+ DEDENT (elifBranches += elifBranch)* ('else' ':' NEWLINE INDENT (elseBody+=statement)+ DEDENT )? # IfStatement
// ------------------------------------------
// ForStatement & ParForStatement: the loop header may be parenthesized or not;
// additional name=value parameters may follow the iterable predicate.
| 'for' (OPEN_PAREN iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* CLOSE_PAREN | iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* ) ':' NEWLINE INDENT (body+=statement)+ DEDENT # ForStatement
// Convert strictParameterizedExpression to HashMap<String, String> for parForParams
| 'parfor' (OPEN_PAREN iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* CLOSE_PAREN | iterVar=ID 'in' iterPred=iterablePredicate (',' parForParams+=strictParameterizedExpression)* ) ':' NEWLINE INDENT (body+=statement)+ DEDENT # ParForStatement
// WhileStatement: the predicate may be parenthesized or not.
| 'while' ( OPEN_PAREN predicate=expression CLOSE_PAREN | predicate=expression ) ':' NEWLINE INDENT (body+=statement)+ DEDENT # WhileStatement
// ------------------------------------------
// A lone NEWLINE is accepted as an empty statement.
| NEWLINE #IgnoreNewLine
// One 'elif' branch of an if statement: a predicate (parenthesized or not)
// followed by an indented body of one or more statements.
elifBranch returns [ org.apache.sysml.parser.common.StatementInfo info ]
@init {
// This action occurs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.StatementInfo();
} :
'elif' (OPEN_PAREN predicate=expression CLOSE_PAREN | predicate=expression) ':' NEWLINE INDENT (elifBody+=statement)+ DEDENT
// The range a for/parfor loop iterates over. It returns an ExpressionInfo that
// is created up front in @init.
iterablePredicate returns [ org.apache.sysml.parser.common.ExpressionInfo info ]
@init {
// This action occurs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.ExpressionInfo();
} :
// Colon form: from:to
from=expression ':' to=expression #IterablePredicateColonExpression
// Call form: name(from, to[, increment]); the grammar accepts any ID as the name.
| ID OPEN_PAREN from=expression ',' to=expression (',' increment=expression)? CLOSE_PAREN #IterablePredicateSeqExpression
// A function definition, either internal ('def' with an indented body) or
// external ('defExternal' ... 'implemented' 'in' (...)). The output parameter
// list after '->' is optional in both forms.
functionStatement returns [ org.apache.sysml.parser.common.StatementInfo info ]
@init {
// This action occurs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.StatementInfo();
} :
// ------------------------------------------
// FunctionStatement & ExternalFunctionStatement
// small change: only allow typed arguments here ... instead of data identifier
'def' name=ID OPEN_PAREN ( inputParams+=typedArgNoAssign (',' inputParams+=typedArgNoAssign)* )? CLOSE_PAREN ( '->' OPEN_PAREN ( outputParams+=typedArgNoAssign (',' outputParams+=typedArgNoAssign)* )? CLOSE_PAREN )? ':' NEWLINE INDENT (body+=statement)+ DEDENT # InternalFunctionDefExpression
// External definition: no body; the implementation is described by name="value" pairs.
| 'defExternal' name=ID OPEN_PAREN ( inputParams+=typedArgNoAssign (',' inputParams+=typedArgNoAssign)* )? CLOSE_PAREN ( '->' OPEN_PAREN ( outputParams+=typedArgNoAssign (',' outputParams+=typedArgNoAssign)* )? CLOSE_PAREN )? 'implemented' 'in' OPEN_PAREN ( otherParams+=strictParameterizedKeyValueString (',' otherParams+=strictParameterizedKeyValueString)* )? CLOSE_PAREN NEWLINE # ExternalFunctionDefExpression
// ------------------------------------------
// Other data identifiers are typedArgNoAssign, parameterizedExpression and strictParameterizedExpression
// A reference to data: an indexed identifier, a plain identifier, or a
// command-line parameter (named or positional). It returns an ExpressionInfo
// that is created up front in @init.
dataIdentifier returns [ org.apache.sysml.parser.common.ExpressionInfo dataInfo ]
@init {
// This action occurs regardless of which alternative of this rule is matched
$dataInfo = new org.apache.sysml.parser.common.ExpressionInfo();
// $dataInfo.expr = new org.apache.sysml.parser.DataIdentifier();
} :
// ------------------------------------------
// IndexedIdentifier -- allows implicit lower and upper bounds:
// every bound is optional, and the row/column ':' tokens are captured in
// rowImplicitSlice / colImplicitSlice.
name=ID OPEN_BRACK (rowLower=expression)? (rowImplicitSlice=':' (rowUpper=expression)?)? (',' (colLower=expression)? (colImplicitSlice=':' (colUpper=expression)?)?)? CLOSE_BRACK # IndexedExpression
// ------------------------------------------
| ID # SimpleDataIdentifierExpression
| COMMANDLINE_NAMED_ID # CommandlineParamExpression
| COMMANDLINE_POSITION_ID # CommandlinePositionExpression
// Expressions. This is a left-recursive ANTLR 4 rule: operator alternatives
// that appear earlier bind tighter, so the order of the alternatives below
// defines operator precedence and must not be changed casually.
expression returns [ org.apache.sysml.parser.common.ExpressionInfo info ]
@init {
// This action occurs regardless of which alternative of this rule is matched
$info = new org.apache.sysml.parser.common.ExpressionInfo();
// $info.expr = new org.apache.sysml.parser.BinaryExpression(org.apache.sysml.parser.Expression.BinaryOp.INVALID);
} :
// ------------------------------------------
// BinaryExpression
// power (right-associative)
<assoc=right> left=expression op='**' right=expression # PowerExpression
// unary plus and minus
| op=('-'|'+') left=expression # UnaryExpression
// sequence - since we are only using this in for loops => Array not supported
//| left=expression op=':' right=expression # SequenceExpression
// matrix multiply
// | left=expression op='*' right=expression # MatrixMulExpression
// modulus and integer division
| left=expression op=('//' | '%' ) right=expression # ModIntDivExpression
// arithmetic multiply and divide
| left=expression op=('*'|'/') right=expression # MultDivExpression
// arithmetic addition and subtraction
| left=expression op=('+'|'-') right=expression # AddSubExpression
// ------------------------------------------
// RelationalExpression
| left=expression op=('>'|'>='|'<'|'<='|'=='|'!=') right=expression # RelationalExpression
// ------------------------------------------
// BooleanExpression
// boolean not
| op='!' left=expression # BooleanNotExpression
// boolean and
| left=expression op=('&'|'and') right=expression # BooleanAndExpression
// boolean or
| left=expression op=('|'|'or') right=expression # BooleanOrExpression
// ---------------------------------
// only applicable for builtin function expressions
// Add following additional functions and check number of parameters:
// power, full, matrix, reshape, dot
// Also take care of calls such as y.transpose(): the prefix is sometimes a
// matrix object and sometimes a namespace.
// Any number of trailing ';' tokens is accepted after the call.
| name=ID OPEN_PAREN (paramExprs+=parameterizedExpression (',' paramExprs+=parameterizedExpression)* )? CLOSE_PAREN ';'* # BuiltinFunctionExpression
// 4. Atomic
| OPEN_PAREN left=expression CLOSE_PAREN # AtomicExpression
// Should you allow indexed expression here ?
// | OPEN_BRACK targetList+=expression (',' targetList+=expression)* CLOSE_BRACK # MultiIdExpression
// | BOOLEAN # ConstBooleanIdExpression
// Constants and identifiers
| 'True' # ConstTrueExpression
| 'False' # ConstFalseExpression
| INT # ConstIntIdExpression
| DOUBLE # ConstDoubleIdExpression
| STRING # ConstStringIdExpression
| dataIdentifier # DataIdExpression
// Special
// | 'NULL' | 'NA' | 'Inf' | 'NaN'
// Typed function argument without a default value: name ':' type
typedArgNoAssign : paramName=ID ':' paramType=ml_type ;
// Call argument, optionally named: [name '='] expression
parameterizedExpression : (paramName=ID '=')? paramVal=expression;
// Named argument whose name is mandatory: name '=' expression
strictParameterizedExpression : paramName=ID '=' paramVal=expression ;
// Named argument whose value must be a string literal: name '=' STRING
strictParameterizedKeyValueString : paramName=ID '=' paramVal=STRING ;
// Sometimes this is a matrix object and sometimes it is a namespace.
// Special ID cases:
// | 'matrix' // --> This is a special case which causes lot of headache
// | 'scalar' | 'float' | 'int' | 'bool' // corresponds to as.scalar, as.double, as.integer and as.logical
| 'index.return'
// Unfortunately, we have a datatype name clashing with a builtin function name: matrix :(
// Therefore, an ugly workaround is used for checking the datatype.
// A type is either a bare value type or a data type with its value type in
// brackets: dataType '[' valueType ']'.
ml_type : valueType | dataType OPEN_BRACK valueType CLOSE_BRACK;
// Note to reduce number of keywords, these are case-sensitive,
// To allow case-insensitive matching, 'int' becomes: ('i' | 'I') ('n' | 'N') ('t' | 'T')
ID # ValueDataTypeCheck
// 'int' | 'str' | 'bool' | 'float'
// 'scalar' # ScalarDataTypeDummyCheck
// |
ID # MatrixDataTypeCheck //{ if($ID.text.compareTo("matrix") != 0) { notifyErrorListeners("incorrect datatype"); } }
//| 'matrix' //---> See ID, this causes lot of headache
// Integer literal: one or more digits with an optional 'L'/'l' suffix.
INT : DIGIT+ [Ll]?;
// Digits with an optional exponent and optional 'L'/'l' suffix.
| DIGIT+ EXP? [Ll]?
// Leading '.' followed by digits, optional exponent and optional 'L'/'l' suffix.
| '.' DIGIT+ EXP? [Ll]?
// A single decimal digit.
DIGIT: '0'..'9';
// A single ASCII letter.
ALPHABET : [a-zA-Z] ;
// Exponent part: 'E' or 'e', an optional sign, then an INT.
fragment EXP : ('E' | 'e') ('+' | '-')? INT ;
// supports single and double quoted string with escape characters
STRING: '"' ( ESC | ~[\\"] )*? '"' | '\'' ( ESC | ~[\\'] )*? '\'';
// Escape sequence: a backslash followed by one of b, t, n, f, r, ", ' or backslash.
fragment ESC : '\\' [btnfr"'\\] ;
// Comments, whitespaces and new line
// LINE_COMMENT : '#' .*? '\r'? '\n' -> skip ;
// MULTILINE_BLOCK_COMMENT : '/*' .*? '*/' -> skip ;
// WHITESPACE : (' ' | '\r' | '\n')+ -> skip ;
// Bracket and parenthesis tokens. Each opening token increments the lexer's
// `opened` counter and each closing token decrements it; the NEWLINE rule
// skips line breaks and indentation handling while `opened > 0`.
OPEN_BRACK : '[' {opened++;};
CLOSE_BRACK : ']' {opened--;};
OPEN_PAREN : '(' {opened++;};
CLOSE_PAREN : ')' {opened--;};
// OPEN_BRACE : '{' {opened++;};
// CLOSE_BRACE : '}' {opened--;};
// One or more blanks or tabs.
fragment SPACES : [ \t]+ ;
// '#' up to, but not including, the end of the line.
fragment COMMENT : '#' ~[\r\n]* ;
// Backslash, optional blanks/tabs, then a line break (\r\n, \n or \r).
fragment LINE_JOINING : '\\' SPACES? ( '\r'? '\n' | '\r' ) ;
NEWLINE : ( '\r'? '\n' | '\r' ) SPACES?
String newLine = getText().replaceAll("[^\r\n]+", "");
String spaces = getText().replaceAll("[\r\n]+", "");
int next = _input.LA(1);
if (opened > 0 || next == '\r' || next == '\n' || next == '#') {
// If we're inside a list or on a blank line, ignore all indents,
// dedents and line breaks.
if(debugIndentRules) {
if(next == '\r' || next == '\n') {
System.out.println("4.1 Skipping (blank lines)");
else if(next == '#') {
System.out.println("4.2 Skipping (comment)");
else {
System.out.println("4.2 Skipping something else");
else {
emit(commonToken(NEWLINE, newLine));
int indent = getIndentationCount(spaces);
int previous = indents.isEmpty() ? 0 : indents.peek();
if (indent == previous) {
System.out.println("3. Skipping identation as of same size:" + next);
// skip indents of the same size as the present indent-size
else if (indent > previous) {
System.out.println("1. Indent:" + next);
emit(commonToken(PydmlParser.INDENT, spaces));
else {
// Possibly emit more than 1 DEDENT token.
while(!indents.isEmpty() && indents.peek() > indent) {
System.out.println("2. Dedent:" + next);