blob: 551569c14cea0e9e602e1590a978ac7b9e139444 [file] [log] [blame]
/*
 * Licensed to the Apache Software Foundation (ASF) under one
 * or more contributor license agreements.  See the NOTICE file
 * distributed with this work for additional information
 * regarding copyright ownership.  The ASF licenses this file
 * to you under the Apache License, Version 2.0 (the
 * "License"); you may not use this file except in compliance
 * with the License.  You may obtain a copy of the License at
 *
 *   http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing,
 * software distributed under the License is distributed on an
 * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
 * KIND, either express or implied.  See the License for the
 * specific language governing permissions and limitations
 * under the License.
 */
package org.apache.sysds.parser.dml;
import java.util.Map;
import org.antlr.v4.runtime.ANTLRInputStream;
import org.antlr.v4.runtime.BailErrorStrategy;
import org.antlr.v4.runtime.CommonTokenStream;
import org.antlr.v4.runtime.DefaultErrorStrategy;
import org.antlr.v4.runtime.atn.PredictionMode;
import org.antlr.v4.runtime.misc.ParseCancellationException;
import org.antlr.v4.runtime.tree.ParseTree;
import org.antlr.v4.runtime.tree.ParseTreeWalker;
import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;
import org.apache.sysds.api.DMLScript;
import org.apache.sysds.parser.DMLProgram;
import org.apache.sysds.parser.FunctionDictionary;
import org.apache.sysds.parser.FunctionStatementBlock;
import org.apache.sysds.parser.ImportStatement;
import org.apache.sysds.parser.LanguageException;
import org.apache.sysds.parser.ParseException;
import org.apache.sysds.parser.ParserWrapper;
import org.apache.sysds.parser.Statement;
import org.apache.sysds.parser.dml.DmlParser.FunctionStatementContext;
import org.apache.sysds.parser.dml.DmlParser.ProgramrootContext;
import org.apache.sysds.parser.dml.DmlParser.StatementContext;
/**
 * This is the main entry point for the Antlr4 parser.
 * Dml.g4 is the grammar file which enforces the syntactic structure of a DML program.
 * DmlSyntacticValidator, on the other hand, captures a little bit of semantics and also does the job of translating the Antlr AST to a DMLProgram.
 * At a high level, DmlSyntacticValidator implements callback methods that are called by walker.walk(validator, tree).
 * The callback methods are of two types: enterSomeASTNode() and exitSomeASTNode().
 * It is important to note that almost every node in the AST has either an ExpressionInfo or a StatementInfo object associated with it.
 * The key design decision is that while "exiting" the node (i.e. callback to exitSomeASTNode), we use the information in the given AST node to construct an object of type Statement or Expression and put it in StatementInfo or ExpressionInfo respectively.
 * This way it avoids any bugs due to lookahead and one only has to "think as an AST node", thereby making any changes to the parse code much simpler :)
 * Note: to add an additional builtin function, one only needs to modify DmlSyntacticValidator (which is a java file and provides full Eclipse tooling support), not the g4 grammar.
 * To separate the logic of semantic validation, DmlSyntacticValidatorHelper contains functions that do semantic validation. Currently, there is no semantic validation as most of it is delegated to the subsequent validation phase.
 * Whenever there is a parse error, it goes through CustomErrorListener. This allows us to pipe the error messages to any future pipeline as well as control the format in an elegant manner.
 * There are three types of messages passed:
 * - Syntactic errors: When the passed DML script does not conform to the syntactic structure enforced by Dml.g4
 * - Validation errors: Errors due to translation of the AST to a DMLProgram
 * - Validation warnings: Messages to inform users that there might be a potential bug in their program
 * As of this moment, Antlr4ParserWrapper is stateful and cannot be multithreaded. This is not a big deal because each user calls SystemDS in a different process.
 * If in the future we intend to make it multi-threaded, look at the cleanUpState method and resolve the dependency accordingly.
 */
public class DMLParserWrapper extends ParserWrapper
// Shared logger for this wrapper.
// NOTE(review): the logger is created with DMLScript's class name rather than
// DMLParserWrapper's, so messages from this class are emitted under the DMLScript
// logger category -- confirm this is intentional.
private static final Log LOG = LogFactory.getLog(DMLScript.class.getName());
/**
 * Parses the passed file with command line parameters. You can either pass both (local file) or just dmlScript (hdfs) or just file name (import command)
 * @param fileName either full path or null --> only used for better error handling
 * @param dmlScript script file contents
 * @param argVals script arguments
 * @return dml program, or null if error
 */
public DMLProgram parse(String fileName, String dmlScript, Map<String,String> argVals) {
// Top-level entry point: delegates to doParse with a null sourceNamespace.
return doParse(fileName, dmlScript, null, argVals);
/**
 * This function is supposed to be called directly only from DmlSyntacticValidator when it encounters 'import'
 * @param fileName script file name
 * @param dmlScript script file contents
 * @param sourceNamespace namespace from source statement
 * @param argVals script arguments
 * @return dml program, or null if at least one error
 */
public DMLProgram doParse(String fileName, String dmlScript, String sourceNamespace, Map<String,String> argVals) {
// Overview: (1) obtain the script text and wrap it in an ANTLR input stream,
// (2) lex/parse it into a parse tree, (3) walk the tree with the preprocessor and
// the syntactic validator, (4) report collected issues, (5) build the DMLProgram.
// NOTE(review): several closing braces and statements appear to be missing from this
// copy of the file; the comments below describe only what is visible.
DMLProgram dmlPgm = null;
ANTLRInputStream in;
try {
// No script text supplied: load it through readDMLScript using the file name.
if(dmlScript == null) {
dmlScript = readDMLScript(fileName, LOG);
// NOTE(review): getBytes() without an explicit charset encodes with the platform
// default charset -- confirm this is acceptable for scripts with non-ASCII content.
InputStream stream = new ByteArrayInputStream(dmlScript.getBytes());
in = new ANTLRInputStream(stream);
// Every failure while obtaining the script is rethrown as a ParseException that
// keeps the original exception as its cause.
} catch (FileNotFoundException e) {
throw new ParseException("Cannot find file/resource: " + fileName, e);
} catch (IOException e) {
throw new ParseException("Cannot open file: " + fileName, e);
} catch (LanguageException e) {
throw new ParseException(e.getMessage(), e);
ProgramrootContext ast = null;
// Collects the errors and warnings reported by the preprocessor and validator below.
CustomErrorListener errorListener = new CustomErrorListener();
try {
DmlLexer lexer = new DmlLexer(in);
CommonTokenStream tokens = new CommonTokenStream(lexer);
DmlParser antlr4Parser = new DmlParser(tokens);
// The flag is hard-coded to false, so the SLL branch below is currently never taken.
boolean tryOptimizedParsing = false; // For now no optimization, since it is not able to parse integer value.
if(tryOptimizedParsing) {
// Try faster and simpler SLL
antlr4Parser.setErrorHandler(new BailErrorStrategy());
ast = antlr4Parser.programroot();
// If successful, no need to try out full LL(*) ... SLL was enough
catch(ParseCancellationException ex) {
// Error occurred, so now try full LL(*) for better error messages
// NOTE(review): the bodies of the two branches below appear to be missing from this copy.
if(fileName != null) {
else {
// Set our custom error listener
antlr4Parser.setErrorHandler(new DefaultErrorStrategy());
ast = antlr4Parser.programroot();
else {
// Set our custom error listener
// NOTE(review): the statement that installs the listener is not visible in this copy.
// Now do the parsing
ast = antlr4Parser.programroot();
// Any exception raised while lexing/parsing is wrapped in a ParseException
// that names the file and keeps the cause.
catch(Exception e) {
throw new ParseException("ERROR: Cannot parse the program:" + fileName, e);
// Now convert the parse tree into DMLProgram
// Do syntactic validation while converting
ParseTree tree = ast;
// And also do syntactic validation
ParseTreeWalker walker = new ParseTreeWalker();
// Get list of function definitions which take precedence over built-in functions if same name
DmlPreprocessor prep = new DmlPreprocessor(errorListener);
walker.walk(prep, tree);
// Syntactic validation
// The validator receives the function definitions gathered by the preprocessor pass.
DmlSyntacticValidator validator = new DmlSyntacticValidator(errorListener, argVals, sourceNamespace, prep.getFunctionDefs());
walker.walk(validator, tree);
// Copy the listener's results into this wrapper's fields.
this.parseIssues = errorListener.getParseIssues();
this.atLeastOneWarning = errorListener.isAtLeastOneWarning();
this.atLeastOneError = errorListener.isAtLeastOneError();
// Errors abort with a ParseException carrying all issues and the script text;
// warnings are only logged.
// NOTE(review): errors found here surface as an exception rather than a null return;
// a null result can still come from createDMLProgram.
if (atLeastOneError) {
throw new ParseException(parseIssues, dmlScript);
if (atLeastOneWarning) {
LOG.warn(CustomErrorListener.generateParseIssuesMessage(dmlScript, parseIssues));
dmlPgm = createDMLProgram(ast, sourceNamespace);
return dmlPgm;
/**
 * Builds a DMLProgram from the validated parse tree.
 * An empty FunctionDictionary is registered under the effective namespace, then
 * ast.functionBlocks and ast.blocks are iterated in that order.
 * NOTE(review): several expressions and closing braces appear to be missing from this
 * copy of the file; truncated lines are left untouched and marked below.
 *
 * @param ast root context of the parse tree
 * @param sourceNamespace namespace to register functions under; when null or empty,
 *        DMLProgram.DEFAULT_NAMESPACE is used instead
 * @return the populated program, or null after logging an error when a function block,
 *         statement, or import could not be processed
 */
private static DMLProgram createDMLProgram(ProgramrootContext ast, String sourceNamespace)
DMLProgram dmlPgm = new DMLProgram();
// Fall back to the default namespace when no (non-empty) source namespace is given.
String namespace = (sourceNamespace != null && sourceNamespace.length() > 0)
? sourceNamespace : DMLProgram.DEFAULT_NAMESPACE;
dmlPgm.getNamespaces().put(namespace, new FunctionDictionary<>());
// add all functions from the main script file
for(FunctionStatementContext fn : ast.functionBlocks) {
FunctionStatementBlock functionStmtBlk = new FunctionStatementBlock();
try {
// NOTE(review): the middle argument is missing in this copy of the file.
dmlPgm.addFunctionStatementBlock(namespace,, functionStmtBlk);
} catch (LanguageException e) {
// Log the source position of the failing function and give up on the whole program.
// NOTE(review): the trailing operand of the message concatenation is missing in this copy.
LOG.error("line: " + fn.start.getLine() + ":" + fn.start.getCharPositionInLine()
+ " error processing function " +;
return null;
// add statements from main script file, as well as
// functions from imports and dml-bodied builtin functions
for(StatementContext stmtCtx : ast.blocks) {
// NOTE(review): the right-hand side of this assignment is missing in this copy.
Statement current =;
// A null statement is logged with its source position and aborts program creation.
if(current == null) {
LOG.error("line: " + stmtCtx.start.getLine() + ":"
+ stmtCtx.start.getCharPositionInLine() + " error processing statement");
return null;
if(current instanceof ImportStatement) {
// Handle import statements separately
// NOTE(review): the operand tested against null is missing in this copy.
if( != null) {
// Add the DMLProgram entries into current program
// NOTE(review): the iterated collection is missing in this copy.
for(Map.Entry<String, FunctionDictionary<FunctionStatementBlock>> e : {
addFunctions(dmlPgm, e.getKey(), e.getValue());
else {
// The else branch logs the import's source position and aborts program creation.
LOG.error("line: " + stmtCtx.start.getLine() + ":"
+ stmtCtx.start.getCharPositionInLine() + " error processing import");
return null;
// Now wrap statement into individual statement block
// merge statement will take care of merging these blocks
// NOTE(review): the statements these two comments describe are not visible in this copy.
return dmlPgm;
/**
 * Registers a function dictionary under the given namespace of the program.
 * A null dictionary is ignored. The entry is stored with put on the program's
 * namespace collection; per the TODO below, an existing entry for the same
 * namespace is currently overwritten rather than merged.
 *
 * @param dmlPgm program whose namespaces are updated
 * @param namespace namespace key to register the dictionary under
 * @param dict function dictionary to register; may be null
 */
private static void addFunctions(DMLProgram dmlPgm, String namespace, FunctionDictionary<FunctionStatementBlock> dict) {
// TODO handle namespace key already exists for different program value instead of overwriting
if (dict != null)
dmlPgm.getNamespaces().put(namespace, dict);