blob: ea546e3b43083acabe933af45851b0eae747af29 [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "Common.h"
#include "BuildGrammar.h"
#include "Util.h"
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <assert.h>
#define MAX_CHILDREN 64
typedef struct Builder Builder;
struct Builder {
Grammar *gram;
const char *input;
int len;
};
static Expression *buildExpression(Builder *builder, Term *term);
static char *unescapeLiteral(const char *escaped)
{
size_t escapedLen = strlen(escaped);
size_t escapedPos = 0;
size_t unescapedLen = 0;
char *unescaped = (char *)malloc(escapedLen+1);
while (escapedPos < escapedLen) {
char c = escaped[escapedPos++];
if ((c == '\\') && (escapedPos < escapedLen)) {
c = escaped[escapedPos++];
switch (c) {
case 'n':
unescaped[unescapedLen++] = '\n';
break;
case 'r':
unescaped[unescapedLen++] = '\r';
break;
case 't':
unescaped[unescapedLen++] = '\t';
break;
default:
unescaped[unescapedLen++] = c;
break;
}
}
else {
unescaped[unescapedLen++] = c;
}
}
unescaped[unescapedLen] = '\0';
return unescaped;
}
// Sanity checking functions
//
// These are just used to verify that a Term we're about to use is of the expected type.
// Since the format of the parse tree is likely to change in these early stages of development,
// this will catch cases where the code in this file has not been adapted to those changes
//
// The large number of assert statements in this file, most of which use these functions, will
// likely only be necessary in the short term. Ideally we should be able to get to a point where
// one can safely write code that consumes a parse tree without having so many sanity checks.
static int isTerm(Term *term, ExprKind kind, int count)
{
return ((TermKind(term) == kind) && (TermCount(term) == count));
}
static int isSequence(Term *term, int count)
{
return isTerm(term,SequenceExpr,count);
}
static int isIdent(Term *term, const char *name)
{
return (isTerm(term,IdentExpr,1) && !strcmp(name,ExprIdentValue(TermType(term))));
}
// Extract a substring of the input based on a specific term, or one or more children of a term.
//
// Each Term object has a start and end field, which specify which portion of the input string
// the term covers (that is, the portion of the input consumed durin parsing of that term). In
// some cases we want all the text matched by a term, and in others we only want the text matched
// by specific children. An example of the latter is a "pseudo-terminal" like Identifier where
// we want all characters except the trailing whitespace.
static char *termString(Builder *builder, Term *term)
{
assert(term->start >= 0);
assert(term->end <= builder->len);
int len = term->end - term->start;
char *str = malloc(len+1);
memcpy(str,&builder->input[term->start],len);
str[len] = '\0';
return str;
}
static char *identifierString(Builder *builder, Term *term)
{
assert(isIdent(term,"Identifier"));
Term *body = TermChildAt(term,0);
assert(isSequence(body,2));
Term *content = TermChildAt(body,0);
Term *spacing = TermChildAt(body,1);
assert(TermKind(content) == StringExpr);
assert(isIdent(spacing,"Spacing"));
return termString(builder,content);
}
// For Terms of type ChoiceExpr, determine which of the choices the corresponding term is
static int choiceIndex(Term *body)
{
assert(TermKind(body) == ChoiceExpr);
assert(TermCount(body) == 1);
Term *choice = TermChildAt(body,0);
int match = -1;
int choiceCount = ExprChoiceCount(TermType(body));
for (int i = 0; i < choiceCount; i++) {
if (TermType(choice) == ExpressionChildAt(TermType(body),i))
match = i;
}
return match;
}
// Expression building functions (build*)
//
// There is one of these for each type of expression that can be present in the grammar. The
// supplied Term object is, in all cases, of an IdentExpr type, whose single child is the body
// of the corresponding grammar rule.
//
// For example, buildPrimary is called with an ExpressionType of IdentExpr "Primary", and the
// body is a ChoiceExpr which can contain one of five possible child types, as given in the
// PEG grammar.
//
// To see the exact expression tree of a given rule, look at its definition in Builtin.c
static Expression *buildIdentifier(Builder *builder, Term *term)
{
char *str = identifierString(builder,term);
Expression *result = ExpressionNewIdent(str);
free(str);
return result;
}
static Expression *buildLiteral(Builder *builder, Term *term)
{
// The Literal rule in the built-in PEG grammar contains two choices - one for single quotes
// and the other for double quotes - which otherwise have the same structure. There are four
// children, where child 0 and child 2 represent the quotes themselves, child 1 represents the
// escaped representation of a string, and child 3 is any trailing whitespace.
//
// All we're interested in is the unescaped string, so we get child 2 and convert any escape
// sequences (e.g. \" or \n) to the characters they represent, and return a new literal
// expression with the unescaped string as its value.
assert(isIdent(term,"Literal"));
Term *body = TermChildAt(term,0);
assert(TermKind(body) == ChoiceExpr);
int index = choiceIndex(body);
assert((index == 0) || (index == 1));
Term *choice = TermChildAt(body,0);
assert(isSequence(choice,4));
Term *content = TermChildAt(choice,1);
char *escaped = termString(builder,content);
char *unescaped = unescapeLiteral(escaped);
Expression *result = ExpressionNewLit(unescaped);
free(escaped);
free(unescaped);
return result;
}
static int decodeRangeChar(Builder *builder, Term *charTerm)
{
// FIXME: Handle non-ASCII chars encoded as UTF-8, as well as numeric escape sequences
// We don't need to allocate a string for unescaped here, we should just do it directly
assert(isIdent(charTerm,"Char"));
char *escaped = termString(builder,charTerm);
char *unescaped = unescapeLiteral(escaped);
int result;
if (strlen(unescaped) == 0)
result = '?';
else if (unescaped[0] < 0) // Start of UTF-8 multibyte sequence
result = '?';
else
result = unescaped[0];
free(escaped);
free(unescaped);
return result;
}
static Expression *buildRange(Builder *builder, Term *term)
{
// A range expression, one or more of which appear inside a [...] character class expression,
// matches a character in a range between a start and end value. We use a single expression
// type to represent both exact matches (where the minimum and maximum are the same), and
// "true" ranges (which match two or more possible characters).
//
// Note that the representation we use for Range expressions is a (start,end) pair, where a
// character match must satisfy the condition start <= c < end (that is, the range includes the
// start, but does *not* include the end). This representation is for the convenience of other
// code that works with ranges. The ExpressionNewRange function however takes the minimum and
// maximum values - that is, a matching character c must satisfy min <= c <= max.
assert(isIdent(term,"Range"));
Term *body = TermChildAt(term,0);
int index = choiceIndex(body);
Term *choice = TermChildAt(body,0);
assert((index == 0) || (index == 1));
if (index == 0) {
assert(isSequence(choice,3));
Term *minChar = TermChildAt(choice,0);
Term *maxChar = TermChildAt(choice,2);
assert(isIdent(minChar,"Char"));
assert(isIdent(maxChar,"Char"));
int min = decodeRangeChar(builder,minChar);
int max = decodeRangeChar(builder,maxChar);
return ExpressionNewRange(min,max);
}
else {
assert(isIdent(choice,"Char"));
int value = decodeRangeChar(builder,choice);
return ExpressionNewRange(value,value);
}
}
static Expression *buildClass(Builder *builder, Term *term)
{
// A character class expression is the same as a choice expression, except that it is only
// supposed to contain range expressions.
assert(isIdent(term,"Class"));
Term *body = TermChildAt(term,0);
assert(isSequence(body,4));
Term *star = TermChildAt(body,1);
assert(TermKind(star) == StarExpr);
Expression *children[MAX_CHILDREN];
int count = 0;
for (TermList *item = TermChildren(star); (item != NULL) && (count < MAX_CHILDREN); item = item->next) {
assert(isSequence(item->term,2));
Term *rangeTerm = TermChildAt(item->term,1);
Expression *rangeExpr = buildRange(builder,rangeTerm);
children[count++] = rangeExpr;
}
return ExpressionNewClass(count,children);
}
static Expression *buildDot(Builder *builder, Term *term)
{
assert(isIdent(term,"DOT"));
return ExpressionNewDot();
}
static Expression *buildPrimary(Builder *builder, Term *term)
{
// A primary expression can be one of five possibilities: An identifier (reference to another
// rule in the grammar), a parenthesised expresion, a literal, a character class, or a dot
// (which matches any character).
//
// The type of the term is an Expression with kind == ChoiceExpr, and the term has exactly
// one child. We use choiceIndex to determine which of the five possible expression types
// the choice matches, and then call through to the relevant function to build the appropriate
// type of Expression object.
assert(isIdent(term,"Primary"));
Term *body = TermChildAt(term,0);
assert(TermKind(body) == ChoiceExpr);
assert(TermCount(body) == 1);
Term *choice = TermChildAt(body,0);
switch (choiceIndex(body)) {
case 0: {
assert(isSequence(choice,2));
Term *identifier = TermChildAt(choice,0);
return buildIdentifier(builder,identifier);
}
case 1: {
assert(isSequence(choice,4));
Term *expression = TermChildAt(choice,2);
Expression *result = buildExpression(builder,expression);
return ExpressionNewString(result);
}
case 2: {
assert(isSequence(choice,3));
Term *expression = TermChildAt(choice,1);
return buildExpression(builder,expression);
}
case 3:
return buildLiteral(builder,choice);
case 4:
return buildClass(builder,choice);
case 5:
return buildDot(builder,choice);
default:
assert(!"Invalid choice for Primary");
return NULL;
}
}
static Expression *buildSuffix(Builder *builder, Term *term)
{
// A suffix expression has two children. The second is optional, and is one of either QUESTION,
// STAR, or PLUS - which indicate that the first child may occur zero or one times, zero or more
// times, or one or more times. The first child is always present, and represents a primary
// expression with no prefix or suffix.
//
// If a QUESTION, STAR, or PLUS suffix is given, we wrap the constructed primary expression
// inside another Expression object of type OptExpr, StarExpr, or PlusExpr. Otherwise, we just
// return the primary expression directly.
assert(isIdent(term,"Suffix"));
Term *body = TermChildAt(term,0);
assert(isSequence(body,2));
Term *primary = TermChildAt(body,0);
Term *suffix = TermChildAt(body,1);
assert(isIdent(primary,"Primary"));
assert(TermKind(suffix) == OptExpr);
Expression *primaryExpr = buildPrimary(builder,primary);
assert((TermCount(suffix) == 0) || (TermCount(suffix) == 1));
if (TermCount(suffix) == 1) {
Term *suffixChild = TermChildAt(suffix,0);
int index = choiceIndex(suffixChild);
Term *choice = TermChildAt(suffixChild,0);
switch (index) {
case 0:
assert(isIdent(choice,"QUESTION"));
return ExpressionNewOpt(primaryExpr);
case 1:
assert(isIdent(choice,"STAR"));
return ExpressionNewStar(primaryExpr);
case 2:
assert(isIdent(choice,"PLUS"));
return ExpressionNewPlus(primaryExpr);
default:
assert(!"Invalid choice for Suffix");
break;
}
}
return primaryExpr;
}
static Expression *buildPrefix(Builder *builder, Term *term)
{
// A prefix expression has two children. The first is optional, and is one of either AND or
// NOT - which indicate a positive or negative lookahead assertion. The second is not optional,
// and represents a primary expression with an optional suffix.
//
// If an AND or NOT prefx is given, we wrap the constructed primary expression inside another
// Expression object of type AndExpr or NotExpr. Otherwise, we just return the primary
// expression directly.
assert(isIdent(term,"Prefix"));
Term *body = TermChildAt(term,0);
assert(isSequence(body,2));
Term *prefix = TermChildAt(body,0);
Term *suffix = TermChildAt(body,1);
assert(TermKind(prefix) == OptExpr);
assert(isIdent(suffix,"Suffix"));
Expression *suffixExpr = buildSuffix(builder,suffix);
assert((TermCount(prefix) == 0) || (TermCount(prefix) == 1));
if (TermCount(prefix) == 1) {
Term *prefixChild = TermChildAt(prefix,0);
int index = choiceIndex(prefixChild);
Term *choice = TermChildAt(prefixChild,0);
switch (index) {
case 0:
assert(isIdent(choice,"AND"));
return ExpressionNewAnd(suffixExpr);
case 1:
assert(isIdent(choice,"NOT"));
return ExpressionNewNot(suffixExpr);
default:
assert(!"Invalid choice for Prefix");
break;
}
}
return suffixExpr;
}
static Expression *buildSequence(Builder *builder, Term *term)
{
// An Sequence consists of one or more expressions, each of which has an optional prefix
// and suffix
assert(isIdent(term,"Sequence"));
Term *body = TermChildAt(term,0);
assert(TermKind(body) == StarExpr);
Expression *children[MAX_CHILDREN];
int count = 0;
for (TermList *item = TermChildren(body); (item != NULL) && (count < MAX_CHILDREN); item = item->next) {
children[count] = buildPrefix(builder,item->term);
count++;
}
if (count == 1)
return children[0];
else
return ExpressionNewSequence(count,children);
}
static Expression *buildExpression(Builder *builder, Term *term)
{
// An Expression consists of one or more choices
assert(isIdent(term,"Expression"));
Term *body = TermChildAt(term,0);
assert(isSequence(body,2));
Term *child0 = TermChildAt(body,0);
Term *child1 = TermChildAt(body,1);
assert(isIdent(child0,"Sequence"));
assert(TermKind(child1) == StarExpr);
Expression *initial = buildSequence(builder,child0);
if (TermCount(child1) == 0)
return initial;
Expression *children[MAX_CHILDREN];
children[0] = initial;
int count = 1;
for (TermList *item = TermChildren(child1); (item != NULL) && (count < MAX_CHILDREN); item = item->next) {
assert(isSequence(item->term,2));
children[count] = buildSequence(builder,TermChildAt(item->term,1));
count++;
}
return ExpressionNewChoice(count,children);
}
static void buildGrammar(Builder *builder, Term *term)
{
assert(isSequence(term,3));
Term *plus = TermChildAt(term,1);
assert(TermKind(plus) == PlusExpr);
for (TermList *plusItem = plus->children; plusItem != NULL; plusItem = plusItem->next) {
Term *defIdent = plusItem->term;
assert(isIdent(defIdent,"Definition"));
Term *defSeq = TermChildAt(defIdent,0);
assert(isSequence(defSeq,4));
Term *identTerm = TermChildAt(defSeq,0);
Term *exprTerm = TermChildAt(defSeq,2);
assert(isIdent(identTerm,"Identifier"));
assert(isIdent(exprTerm,"Expression"));
char *ruleName = identifierString(builder,identTerm);
Expression *ruleExpr = buildExpression(builder,exprTerm);
GrammarDefine(builder->gram,ruleName,ruleExpr);
free(ruleName);
}
GrammarResolve(builder->gram);
}
// This function creates a new Grammar object from the result of parsing a file usin the built-in
// PEG grammar. The resulting Grammar object can then be used to parse other files which are written
// in another language that is accepted by that grammar. The Term objects are constructed by the
// parse function defined in Parser.c.
Grammar *grammarFromTerm(Term *term, const char *input)
{
Grammar *gram = GrammarNew();
Builder *builder = (Builder *)calloc(1,sizeof(Builder));
builder->gram = gram;
builder->input = input;
builder->len = strlen(input);
buildGrammar(builder,term);
free(builder);
return gram;
}