| // Licensed to the Apache Software Foundation (ASF) under one |
| // or more contributor license agreements. See the NOTICE file |
| // distributed with this work for additional information |
| // regarding copyright ownership. The ASF licenses this file |
| // to you under the Apache License, Version 2.0 (the |
| // "License"); you may not use this file except in compliance |
| // with the License. You may obtain a copy of the License at |
| // |
| // http://www.apache.org/licenses/LICENSE-2.0 |
| // |
| // Unless required by applicable law or agreed to in writing, |
| // software distributed under the License is distributed on an |
| // "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY |
| // KIND, either express or implied. See the License for the |
| // specific language governing permissions and limitations |
| // under the License. |
| |
| #include "Common.h" |
| #include "BuildGrammar.h" |
| #include "Util.h" |
| #include <stdlib.h> |
| #include <stdio.h> |
| #include <string.h> |
| #include <assert.h> |
| |
| #define MAX_CHILDREN 64 |
| |
| typedef struct Builder Builder; |
| |
| struct Builder { |
| Grammar *gram; |
| const char *input; |
| int len; |
| }; |
| |
| static Expression *buildExpression(Builder *builder, Term *term); |
| |
| static char *unescapeLiteral(const char *escaped) |
| { |
| size_t escapedLen = strlen(escaped); |
| size_t escapedPos = 0; |
| size_t unescapedLen = 0; |
| char *unescaped = (char *)malloc(escapedLen+1); |
| |
| while (escapedPos < escapedLen) { |
| char c = escaped[escapedPos++]; |
| if ((c == '\\') && (escapedPos < escapedLen)) { |
| c = escaped[escapedPos++]; |
| switch (c) { |
| case 'n': |
| unescaped[unescapedLen++] = '\n'; |
| break; |
| case 'r': |
| unescaped[unescapedLen++] = '\r'; |
| break; |
| case 't': |
| unescaped[unescapedLen++] = '\t'; |
| break; |
| default: |
| unescaped[unescapedLen++] = c; |
| break; |
| } |
| } |
| else { |
| unescaped[unescapedLen++] = c; |
| } |
| } |
| |
| unescaped[unescapedLen] = '\0'; |
| return unescaped; |
| } |
| |
| // Sanity checking functions |
| // |
| // These are just used to verify that a Term we're about to use is of the expected type. |
| // Since the format of the parse tree is likely to change in these early stages of development, |
| // this will catch cases where the code in this file has not been adapted to those changes |
| // |
| // The large number of assert statements in this file, most of which use these functions, will |
| // likely only be necessary in the short term. Ideally we should be able to get to a point where |
| // one can safely write code that consumes a parse tree without having so many sanity checks. |
| |
| static int isTerm(Term *term, ExprKind kind, int count) |
| { |
| return ((TermKind(term) == kind) && (TermCount(term) == count)); |
| } |
| |
| static int isSequence(Term *term, int count) |
| { |
| return isTerm(term,SequenceExpr,count); |
| } |
| |
| static int isIdent(Term *term, const char *name) |
| { |
| return (isTerm(term,IdentExpr,1) && !strcmp(name,ExprIdentValue(TermType(term)))); |
| } |
| |
| // Extract a substring of the input based on a specific term, or one or more children of a term. |
| // |
| // Each Term object has a start and end field, which specify which portion of the input string |
| // the term covers (that is, the portion of the input consumed durin parsing of that term). In |
| // some cases we want all the text matched by a term, and in others we only want the text matched |
| // by specific children. An example of the latter is a "pseudo-terminal" like Identifier where |
| // we want all characters except the trailing whitespace. |
| |
| static char *termString(Builder *builder, Term *term) |
| { |
| assert(term->start >= 0); |
| assert(term->end <= builder->len); |
| int len = term->end - term->start; |
| char *str = malloc(len+1); |
| memcpy(str,&builder->input[term->start],len); |
| str[len] = '\0'; |
| return str; |
| } |
| |
| static char *identifierString(Builder *builder, Term *term) |
| { |
| assert(isIdent(term,"Identifier")); |
| Term *body = TermChildAt(term,0); |
| assert(isSequence(body,2)); |
| Term *content = TermChildAt(body,0); |
| Term *spacing = TermChildAt(body,1); |
| assert(TermKind(content) == StringExpr); |
| assert(isIdent(spacing,"Spacing")); |
| return termString(builder,content); |
| } |
| |
| // For Terms of type ChoiceExpr, determine which of the choices the corresponding term is |
| |
| static int choiceIndex(Term *body) |
| { |
| assert(TermKind(body) == ChoiceExpr); |
| assert(TermCount(body) == 1); |
| Term *choice = TermChildAt(body,0); |
| int match = -1; |
| int choiceCount = ExprChoiceCount(TermType(body)); |
| for (int i = 0; i < choiceCount; i++) { |
| if (TermType(choice) == ExpressionChildAt(TermType(body),i)) |
| match = i; |
| } |
| return match; |
| } |
| |
| // Expression building functions (build*) |
| // |
| // There is one of these for each type of expression that can be present in the grammar. The |
| // supplied Term object is, in all cases, of an IdentExpr type, whose single child is the body |
| // of the corresponding grammar rule. |
| // |
| // For example, buildPrimary is called with an ExpressionType of IdentExpr "Primary", and the |
| // body is a ChoiceExpr which can contain one of five possible child types, as given in the |
| // PEG grammar. |
| // |
| // To see the exact expression tree of a given rule, look at its definition in Builtin.c |
| |
| static Expression *buildIdentifier(Builder *builder, Term *term) |
| { |
| char *str = identifierString(builder,term); |
| Expression *result = ExpressionNewIdent(str); |
| free(str); |
| return result; |
| } |
| |
| static Expression *buildLiteral(Builder *builder, Term *term) |
| { |
| // The Literal rule in the built-in PEG grammar contains two choices - one for single quotes |
| // and the other for double quotes - which otherwise have the same structure. There are four |
| // children, where child 0 and child 2 represent the quotes themselves, child 1 represents the |
| // escaped representation of a string, and child 3 is any trailing whitespace. |
| // |
| // All we're interested in is the unescaped string, so we get child 2 and convert any escape |
| // sequences (e.g. \" or \n) to the characters they represent, and return a new literal |
| // expression with the unescaped string as its value. |
| |
| assert(isIdent(term,"Literal")); |
| Term *body = TermChildAt(term,0); |
| assert(TermKind(body) == ChoiceExpr); |
| int index = choiceIndex(body); |
| assert((index == 0) || (index == 1)); |
| Term *choice = TermChildAt(body,0); |
| assert(isSequence(choice,4)); |
| Term *content = TermChildAt(choice,1); |
| char *escaped = termString(builder,content); |
| char *unescaped = unescapeLiteral(escaped); |
| Expression *result = ExpressionNewLit(unescaped); |
| free(escaped); |
| free(unescaped); |
| return result; |
| } |
| |
| static int decodeRangeChar(Builder *builder, Term *charTerm) |
| { |
| // FIXME: Handle non-ASCII chars encoded as UTF-8, as well as numeric escape sequences |
| // We don't need to allocate a string for unescaped here, we should just do it directly |
| |
| assert(isIdent(charTerm,"Char")); |
| char *escaped = termString(builder,charTerm); |
| char *unescaped = unescapeLiteral(escaped); |
| int result; |
| if (strlen(unescaped) == 0) |
| result = '?'; |
| else if (unescaped[0] < 0) // Start of UTF-8 multibyte sequence |
| result = '?'; |
| else |
| result = unescaped[0]; |
| free(escaped); |
| free(unescaped); |
| return result; |
| } |
| |
| static Expression *buildRange(Builder *builder, Term *term) |
| { |
| // A range expression, one or more of which appear inside a [...] character class expression, |
| // matches a character in a range between a start and end value. We use a single expression |
| // type to represent both exact matches (where the minimum and maximum are the same), and |
| // "true" ranges (which match two or more possible characters). |
| // |
| // Note that the representation we use for Range expressions is a (start,end) pair, where a |
| // character match must satisfy the condition start <= c < end (that is, the range includes the |
| // start, but does *not* include the end). This representation is for the convenience of other |
| // code that works with ranges. The ExpressionNewRange function however takes the minimum and |
| // maximum values - that is, a matching character c must satisfy min <= c <= max. |
| |
| assert(isIdent(term,"Range")); |
| Term *body = TermChildAt(term,0); |
| int index = choiceIndex(body); |
| Term *choice = TermChildAt(body,0); |
| assert((index == 0) || (index == 1)); |
| if (index == 0) { |
| assert(isSequence(choice,3)); |
| Term *minChar = TermChildAt(choice,0); |
| Term *maxChar = TermChildAt(choice,2); |
| assert(isIdent(minChar,"Char")); |
| assert(isIdent(maxChar,"Char")); |
| int min = decodeRangeChar(builder,minChar); |
| int max = decodeRangeChar(builder,maxChar); |
| return ExpressionNewRange(min,max); |
| } |
| else { |
| assert(isIdent(choice,"Char")); |
| int value = decodeRangeChar(builder,choice); |
| return ExpressionNewRange(value,value); |
| } |
| } |
| |
| static Expression *buildClass(Builder *builder, Term *term) |
| { |
| // A character class expression is the same as a choice expression, except that it is only |
| // supposed to contain range expressions. |
| |
| assert(isIdent(term,"Class")); |
| Term *body = TermChildAt(term,0); |
| assert(isSequence(body,4)); |
| Term *star = TermChildAt(body,1); |
| assert(TermKind(star) == StarExpr); |
| |
| Expression *children[MAX_CHILDREN]; |
| int count = 0; |
| for (TermList *item = TermChildren(star); (item != NULL) && (count < MAX_CHILDREN); item = item->next) { |
| assert(isSequence(item->term,2)); |
| Term *rangeTerm = TermChildAt(item->term,1); |
| Expression *rangeExpr = buildRange(builder,rangeTerm); |
| children[count++] = rangeExpr; |
| } |
| |
| return ExpressionNewClass(count,children); |
| } |
| |
| static Expression *buildDot(Builder *builder, Term *term) |
| { |
| assert(isIdent(term,"DOT")); |
| return ExpressionNewDot(); |
| } |
| |
| static Expression *buildPrimary(Builder *builder, Term *term) |
| { |
| // A primary expression can be one of five possibilities: An identifier (reference to another |
| // rule in the grammar), a parenthesised expresion, a literal, a character class, or a dot |
| // (which matches any character). |
| // |
| // The type of the term is an Expression with kind == ChoiceExpr, and the term has exactly |
| // one child. We use choiceIndex to determine which of the five possible expression types |
| // the choice matches, and then call through to the relevant function to build the appropriate |
| // type of Expression object. |
| |
| assert(isIdent(term,"Primary")); |
| Term *body = TermChildAt(term,0); |
| assert(TermKind(body) == ChoiceExpr); |
| assert(TermCount(body) == 1); |
| Term *choice = TermChildAt(body,0); |
| |
| switch (choiceIndex(body)) { |
| case 0: { |
| assert(isSequence(choice,2)); |
| Term *identifier = TermChildAt(choice,0); |
| return buildIdentifier(builder,identifier); |
| } |
| case 1: { |
| assert(isSequence(choice,4)); |
| Term *expression = TermChildAt(choice,2); |
| Expression *result = buildExpression(builder,expression); |
| return ExpressionNewString(result); |
| } |
| case 2: { |
| assert(isSequence(choice,3)); |
| Term *expression = TermChildAt(choice,1); |
| return buildExpression(builder,expression); |
| } |
| case 3: |
| return buildLiteral(builder,choice); |
| case 4: |
| return buildClass(builder,choice); |
| case 5: |
| return buildDot(builder,choice); |
| default: |
| assert(!"Invalid choice for Primary"); |
| return NULL; |
| } |
| } |
| |
| static Expression *buildSuffix(Builder *builder, Term *term) |
| { |
| // A suffix expression has two children. The second is optional, and is one of either QUESTION, |
| // STAR, or PLUS - which indicate that the first child may occur zero or one times, zero or more |
| // times, or one or more times. The first child is always present, and represents a primary |
| // expression with no prefix or suffix. |
| // |
| // If a QUESTION, STAR, or PLUS suffix is given, we wrap the constructed primary expression |
| // inside another Expression object of type OptExpr, StarExpr, or PlusExpr. Otherwise, we just |
| // return the primary expression directly. |
| |
| assert(isIdent(term,"Suffix")); |
| Term *body = TermChildAt(term,0); |
| assert(isSequence(body,2)); |
| Term *primary = TermChildAt(body,0); |
| Term *suffix = TermChildAt(body,1); |
| assert(isIdent(primary,"Primary")); |
| assert(TermKind(suffix) == OptExpr); |
| |
| Expression *primaryExpr = buildPrimary(builder,primary); |
| |
| assert((TermCount(suffix) == 0) || (TermCount(suffix) == 1)); |
| if (TermCount(suffix) == 1) { |
| Term *suffixChild = TermChildAt(suffix,0); |
| int index = choiceIndex(suffixChild); |
| Term *choice = TermChildAt(suffixChild,0); |
| switch (index) { |
| case 0: |
| assert(isIdent(choice,"QUESTION")); |
| return ExpressionNewOpt(primaryExpr); |
| case 1: |
| assert(isIdent(choice,"STAR")); |
| return ExpressionNewStar(primaryExpr); |
| case 2: |
| assert(isIdent(choice,"PLUS")); |
| return ExpressionNewPlus(primaryExpr); |
| default: |
| assert(!"Invalid choice for Suffix"); |
| break; |
| } |
| } |
| |
| return primaryExpr; |
| } |
| |
| static Expression *buildPrefix(Builder *builder, Term *term) |
| { |
| // A prefix expression has two children. The first is optional, and is one of either AND or |
| // NOT - which indicate a positive or negative lookahead assertion. The second is not optional, |
| // and represents a primary expression with an optional suffix. |
| // |
| // If an AND or NOT prefx is given, we wrap the constructed primary expression inside another |
| // Expression object of type AndExpr or NotExpr. Otherwise, we just return the primary |
| // expression directly. |
| |
| assert(isIdent(term,"Prefix")); |
| Term *body = TermChildAt(term,0); |
| assert(isSequence(body,2)); |
| Term *prefix = TermChildAt(body,0); |
| Term *suffix = TermChildAt(body,1); |
| assert(TermKind(prefix) == OptExpr); |
| assert(isIdent(suffix,"Suffix")); |
| |
| Expression *suffixExpr = buildSuffix(builder,suffix); |
| |
| assert((TermCount(prefix) == 0) || (TermCount(prefix) == 1)); |
| if (TermCount(prefix) == 1) { |
| Term *prefixChild = TermChildAt(prefix,0); |
| int index = choiceIndex(prefixChild); |
| Term *choice = TermChildAt(prefixChild,0); |
| switch (index) { |
| case 0: |
| assert(isIdent(choice,"AND")); |
| return ExpressionNewAnd(suffixExpr); |
| case 1: |
| assert(isIdent(choice,"NOT")); |
| return ExpressionNewNot(suffixExpr); |
| default: |
| assert(!"Invalid choice for Prefix"); |
| break; |
| } |
| } |
| |
| return suffixExpr; |
| } |
| |
| static Expression *buildSequence(Builder *builder, Term *term) |
| { |
| // An Sequence consists of one or more expressions, each of which has an optional prefix |
| // and suffix |
| assert(isIdent(term,"Sequence")); |
| Term *body = TermChildAt(term,0); |
| assert(TermKind(body) == StarExpr); |
| |
| Expression *children[MAX_CHILDREN]; |
| int count = 0; |
| for (TermList *item = TermChildren(body); (item != NULL) && (count < MAX_CHILDREN); item = item->next) { |
| children[count] = buildPrefix(builder,item->term); |
| count++; |
| } |
| if (count == 1) |
| return children[0]; |
| else |
| return ExpressionNewSequence(count,children); |
| } |
| |
| static Expression *buildExpression(Builder *builder, Term *term) |
| { |
| // An Expression consists of one or more choices |
| assert(isIdent(term,"Expression")); |
| Term *body = TermChildAt(term,0); |
| assert(isSequence(body,2)); |
| |
| Term *child0 = TermChildAt(body,0); |
| Term *child1 = TermChildAt(body,1); |
| |
| assert(isIdent(child0,"Sequence")); |
| assert(TermKind(child1) == StarExpr); |
| |
| Expression *initial = buildSequence(builder,child0); |
| if (TermCount(child1) == 0) |
| return initial; |
| |
| Expression *children[MAX_CHILDREN]; |
| children[0] = initial; |
| int count = 1; |
| for (TermList *item = TermChildren(child1); (item != NULL) && (count < MAX_CHILDREN); item = item->next) { |
| assert(isSequence(item->term,2)); |
| children[count] = buildSequence(builder,TermChildAt(item->term,1)); |
| count++; |
| } |
| return ExpressionNewChoice(count,children); |
| } |
| |
| static void buildGrammar(Builder *builder, Term *term) |
| { |
| assert(isSequence(term,3)); |
| Term *plus = TermChildAt(term,1); |
| assert(TermKind(plus) == PlusExpr); |
| |
| for (TermList *plusItem = plus->children; plusItem != NULL; plusItem = plusItem->next) { |
| Term *defIdent = plusItem->term; |
| assert(isIdent(defIdent,"Definition")); |
| Term *defSeq = TermChildAt(defIdent,0); |
| assert(isSequence(defSeq,4)); |
| Term *identTerm = TermChildAt(defSeq,0); |
| Term *exprTerm = TermChildAt(defSeq,2); |
| assert(isIdent(identTerm,"Identifier")); |
| assert(isIdent(exprTerm,"Expression")); |
| |
| char *ruleName = identifierString(builder,identTerm); |
| Expression *ruleExpr = buildExpression(builder,exprTerm); |
| GrammarDefine(builder->gram,ruleName,ruleExpr); |
| free(ruleName); |
| } |
| |
| GrammarResolve(builder->gram); |
| } |
| |
| // This function creates a new Grammar object from the result of parsing a file usin the built-in |
| // PEG grammar. The resulting Grammar object can then be used to parse other files which are written |
| // in another language that is accepted by that grammar. The Term objects are constructed by the |
| // parse function defined in Parser.c. |
| |
| Grammar *grammarFromTerm(Term *term, const char *input) |
| { |
| Grammar *gram = GrammarNew(); |
| |
| Builder *builder = (Builder *)calloc(1,sizeof(Builder)); |
| builder->gram = gram; |
| builder->input = input; |
| builder->len = strlen(input); |
| |
| buildGrammar(builder,term); |
| |
| free(builder); |
| |
| return gram; |
| } |