Flat: Basic recursive-descent parser Initial implementation of a recursive-descent parser that uses a Grammar object to product a tree of terms. The parser is very simplistic in nature; it does a tree walk of the expressions associated with the rules of the grammar, and does not perform any memoization or other optimizations. The resulting Term objects represent the parse tree of the input file. One Term object is created for each node in the expression tree, and in the case of character classes and . expressions, one for each character. The resulting parse trees are thus much larger than one would normally want, but this is at least a starting point. Terms and TermLists are not currently freed from memory - the intention is that these will eventually be handled via garbage collection, as they will form the set of values that are used for not just parsing but also any subsequent transformations.

commit: 329722fc1ef43989900b9b09b758987c20879e01 [log] [tgz]
author: Peter Kelly <peter@uxproductivity.com> Tue Jul 14 18:36:49 2015 +0700
committer: Peter Kelly <peter@uxproductivity.com> Tue Jul 14 18:36:49 2015 +0700
tree: 0338718de734d5ac724a075858524ef3dca5689c
parent: aa7176ab9b2a1ced56d32de2d6a30a9a0bbbb452 [diff]
diff --git a/experiments/flat/src/CMakeLists.txt b/experiments/flat/src/CMakeLists.txt
index a8e2877..eb969c1 100644
--- a/experiments/flat/src/CMakeLists.txt
+++ b/experiments/flat/src/CMakeLists.txt

@@ -29,6 +29,12 @@
     Expression.h
     Grammar.c
     Grammar.h
+    Parser.c
+    Parser.h
+    Term.c
+    Term.h
+    Util.c
+    Util.h
     flat.c)
 
 ###

diff --git a/experiments/flat/src/Grammar.c b/experiments/flat/src/Grammar.c
index 1fa2979..8201b89 100644
--- a/experiments/flat/src/Grammar.c
+++ b/experiments/flat/src/Grammar.c

@@ -73,11 +73,11 @@
     gram->nextDef = &def->next;
 }
 
-static Rule *GrammarLookup(Grammar *gram, const char *name)
+Expression *GrammarLookup(Grammar *gram, const char *name)
 {
     for (Rule *def = gram->defList; def != NULL; def = def->next) {
         if (!strcmp(def->name,name))
-            return def;
+            return def->expr;
     }
     return NULL;
 }
@@ -86,12 +86,12 @@
 {
     if (ExpressionKind(expr) == IdentExpr) {
         const char *targetName = ExprIdentValue(expr);
-        Rule *targetRule = GrammarLookup(gram,targetName);
-        if (targetRule == NULL) {
+        Expression *targetExpr = GrammarLookup(gram,targetName);
+        if (targetExpr == NULL) {
             fprintf(stderr,"%s: Cannot resolve reference %s\n",ruleName,targetName);
             exit(1);
         }
-        ExprIdentSetTarget(expr,targetRule->expr);
+        ExprIdentSetTarget(expr,targetExpr);
     }
     else {
         for (int i = 0; i < ExpressionCount(expr); i++)

diff --git a/experiments/flat/src/Grammar.h b/experiments/flat/src/Grammar.h
index ccbf012..e94e51b 100644
--- a/experiments/flat/src/Grammar.h
+++ b/experiments/flat/src/Grammar.h

@@ -25,5 +25,6 @@
 Grammar *GrammarNew(void);
 void GrammarFree(Grammar *gram);
 void GrammarDefine(Grammar *gram, const char *name, Expression *expr);
+Expression *GrammarLookup(Grammar *gram, const char *name);
 void GrammarResolve(Grammar *gram);
 void GrammarPrint(Grammar *gram);

diff --git a/experiments/flat/src/Parser.c b/experiments/flat/src/Parser.c
new file mode 100644
index 0000000..2fb7b13
--- /dev/null
+++ b/experiments/flat/src/Parser.c

@@ -0,0 +1,228 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "Parser.h"
+#include "Util.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+typedef struct Parser Parser;
+
+struct Parser {
+    Grammar *gram;
+    const char *input;
+    int start;
+    int end;
+    int pos;
+};
+
+static Term *parseExpr(Parser *p, Expression *expr)
+{
+    int startPos = p->pos;
+    switch (ExpressionKind(expr)) {
+        case ChoiceExpr:
+            for (int i = 0; i < ExprChoiceCount(expr); i++) {
+                Term *term = parseExpr(p,ExprChoiceChildAt(expr,i));
+
+                // If parsing of a choice succeeds, we return immediately.
+                if (term != NULL)
+                    return TermNew(expr,startPos,p->pos,TermListNew(term,NULL)); // Success
+
+                // If parsing of a choice fails, we reset the current position, and continue on with
+                // the next choice (if any). If there are no more choices, the loop complets and
+                // we return NULL.
+                p->pos = startPos;
+            }
+
+            // If we get here, none of the choices have matched, so evaluation of the
+            // choice expression as a whole fails.
+            return NULL; // Failure
+        case SequenceExpr: {
+            TermList *list = NULL;
+            TermList **listEnd = &list;
+            for (int i = 0; i < ExprSequenceCount(expr); i++) {
+                Term *term = parseExpr(p,ExprSequenceChildAt(expr,i));
+
+                // If parsing of a sequence item fails, the sequence as a whole fails. We reset
+                // the current position to the start.
+                if (term == NULL) {
+                    p->pos = startPos;
+                    return NULL; // Failure
+                }
+
+                // If parsing of a sequence item succeeds, we append it to the list of
+                // accumulated child terms, and continue with the next item.
+                TermListPtrAppend(&listEnd,term);
+            }
+
+            // If we get here, all items in the sequence have matched, and evaluation succeeds,
+            // returning a term with children comprising the parse results of all items.
+            return TermNew(expr,startPos,p->pos,list); // Success
+        }
+        case AndExpr: {
+            // Evaluate the child expression to see if it succeeds, but reset the position since
+            // this is a predicate which is not supposed to consume any input. AndExpr is a
+            // positive lookahead assertion, which means it succeeds if and only if the child
+            // evaluation succeeds.
+            Term *term = parseExpr(p,ExprAndChild(expr));
+            p->pos = startPos;
+            if (term != NULL)
+                return TermNew(expr,startPos,startPos,NULL); // Success
+            else
+                return NULL; // Failure
+        }
+        case NotExpr: {
+            // Evaluate the child expression to see if it succeeds, but reset the position since
+            // this is a predicate which is not supposed to consume any input. NotExpr is a
+            // *negative* lookahead assertion, which is the opposite of the above; it succeeds
+            // if and only if the child evaluation *fails*.
+            Term *term = parseExpr(p,ExprNotChild(expr));
+            p->pos = startPos;
+            if (term != NULL)
+                return NULL; // Failure
+            else
+                return TermNew(expr,startPos,startPos,NULL); // Success
+        }
+        case OptExpr: {
+            // An optional expression (? operator) succeeds regardless of whether or not the child
+            // expression succeeds. If the child did succeed, we return its result as a child of the
+            // OptExpr result.
+            Term *term = parseExpr(p,ExprOptChild(expr));
+            TermList *children;
+            if (term != NULL)
+                children = TermListNew(term,NULL);
+            else
+                children = NULL;
+            return TermNew(expr,startPos,p->pos,children); // Success
+        }
+        case StarExpr: {
+            // A zero-or-more expression (* operator) repeatedly matches is child as many times
+            // as possible, suceeding regardless of the number of matches.
+            TermList *list = NULL;
+            TermList **listEnd = &list;
+            for (;;) {
+                Term *term = parseExpr(p,ExprStarChild(expr));
+                if (term == NULL)
+                    break;
+                TermListPtrAppend(&listEnd,term);
+            }
+            return TermNew(expr,startPos,p->pos,list);
+        }
+        case PlusExpr: {
+            // A one-or-more expression (+ operator) operates like a zero-or-match, but fails if
+            // there is not at least one match
+            TermList *list = NULL;
+            TermList **listEnd = &list;
+
+            // First make sure we have at least one match. If this parse fails then we have zero
+            // matches, and thus the plus expression as a whole fails.
+            Term *term = parseExpr(p,ExprPlusChild(expr));
+            if (term == NULL)
+                return NULL; // Failure
+            TermListPtrAppend(&listEnd,term);
+
+            // Now parse any following matches
+            for (;;) {
+                Term *term = parseExpr(p,ExprPlusChild(expr));
+                if (term == NULL)
+                    break;
+                TermListPtrAppend(&listEnd,term);
+            }
+            return TermNew(expr,startPos,p->pos,list); // Success
+        }
+        case IdentExpr: {
+            Term *term = parseExpr(p,ExprIdentTarget(expr));
+            if (term != NULL)
+                return TermNew(expr,startPos,p->pos,TermListNew(term,NULL));
+            else
+                return NULL;
+        }
+        case LitExpr: {
+            const char *value = ExprLitValue(expr);
+            int len = (int)strlen(value);
+            if ((p->pos + len <= p->end) && !memcmp(&p->input[p->pos],value,len)) {
+                p->pos += len;
+                return TermNew(expr,startPos,p->pos,NULL);
+            }
+            else {
+                return NULL;
+            }
+        }
+        case ClassExpr:
+            // Actually identical to ChoiceExpr; we should really merge the two
+            for (int i = 0; i < ExprClassCount(expr); i++) {
+                Term *term = parseExpr(p,ExprClassChildAt(expr,i));
+
+                // If parsing of a choice succeeds, we return immediately.
+                if (term != NULL)
+                    return TermNew(expr,startPos,p->pos,TermListNew(term,NULL)); // Success
+
+                // If parsing of a choice fails, we reset the current position, and continue on with
+                // the next choice (if any). If there are no more choices, the loop complets and
+                // we return NULL.
+                p->pos = startPos;
+            }
+
+            // If we get here, none of the choices have matched, so evaluation of the
+            // choice expression as a whole fails.
+            return NULL; // Failure
+        case DotExpr: {
+            size_t offset = p->pos;
+            int c = UTF8NextChar(p->input,&offset); // FIXME: Should use uint32_t here
+            if (c == 0)
+                return NULL;
+            p->pos = offset;
+            return TermNew(expr,startPos,p->pos,NULL);
+        }
+        case RangeExpr: {
+            size_t offset = p->pos;
+            int c = UTF8NextChar(p->input,&offset); // FIXME: Should use uint32_t here
+            if (c == 0)
+                return NULL;
+            if ((c >= ExprRangeStart(expr)) && (c < ExprRangeEnd(expr))) {
+                p->pos = offset;
+                return TermNew(expr,startPos,p->pos,NULL);
+            }
+            else {
+                return NULL;
+            }
+        }
+    }
+    assert(!"unknown expression type");
+    return NULL;
+}
+
+Term *parse(Grammar *gram, const char *rule, const char *input, int start, int end)
+{
+    Expression *rootExpr = GrammarLookup(gram,rule);
+    if (rootExpr == NULL) {
+        fprintf(stderr,"%s: No such rule\n",rule);
+        exit(1);
+    }
+
+    Parser *p = (Parser *)calloc(1,sizeof(Parser));
+    p->gram = gram;
+    p->input = input;
+    p->start = start;
+    p->end = end;
+    p->pos = start;
+    Term *result = parseExpr(p,rootExpr);
+    free(p);
+    return result;
+}

diff --git a/experiments/flat/src/Parser.h b/experiments/flat/src/Parser.h
new file mode 100644
index 0000000..f5feb78
--- /dev/null
+++ b/experiments/flat/src/Parser.h

@@ -0,0 +1,23 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "Term.h"
+#include "Grammar.h"
+
+Term *parse(Grammar *gram, const char *rule, const char *input, int start, int end);

diff --git a/experiments/flat/src/Term.c b/experiments/flat/src/Term.c
new file mode 100644
index 0000000..735d609
--- /dev/null
+++ b/experiments/flat/src/Term.c

@@ -0,0 +1,52 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "Term.h"
+#include "Util.h"
+#include <stdlib.h>
+#include <string.h>
+#include <stdio.h>
+#include <assert.h>
+
+Term *TermNew(Expression *type, int start, int end, TermList *children)
+{
+    assert(type != NULL);
+    Term *term = (Term *)calloc(1,sizeof(Term));
+    term->type = type;
+    term->start = start;
+    term->end = end;
+    term->children = children;
+    return term;
+}
+
+TermList *TermListNew(Term *term, TermList *next)
+{
+    TermList *list = (TermList *)calloc(1,sizeof(TermList));
+    list->term = term;
+    list->next = next;
+    return list;
+}
+
+void TermListPtrAppend(TermList ***listPtr, Term *term)
+{
+    assert(term != NULL);
+    assert(listPtr != NULL);
+    assert(*listPtr != NULL);
+    assert(**listPtr == NULL);
+    **listPtr = TermListNew(term,NULL);
+    *listPtr = &(**listPtr)->next;
+}

diff --git a/experiments/flat/src/Term.h b/experiments/flat/src/Term.h
new file mode 100644
index 0000000..4b444c5
--- /dev/null
+++ b/experiments/flat/src/Term.h

@@ -0,0 +1,40 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include "Expression.h"
+
+typedef struct Term Term;
+typedef struct TermList TermList;
+
+struct Term {
+    Expression *type;
+    int start;
+    int end;
+    TermList *children;
+};
+
+struct TermList {
+    Term *term;
+    TermList *next;
+};
+
+Term *TermNew(Expression *type, int start, int end, TermList *children);
+
+TermList *TermListNew(Term *term, TermList *next);
+void TermListPtrAppend(TermList ***listPtr, Term *term);

diff --git a/experiments/flat/src/Util.c b/experiments/flat/src/Util.c
new file mode 100644
index 0000000..e8960a4
--- /dev/null
+++ b/experiments/flat/src/Util.c

@@ -0,0 +1,77 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#include "Util.h"
+#include <stdio.h>
+
+#define isstart1(c) (((c) & 0x80) == 0x00) // 0xxxxxxx
+#define isstart2(c) (((c) & 0xE0) == 0xC0) // 110xxxxx
+#define isstart3(c) (((c) & 0xF0) == 0xE0) // 1110xxxx
+#define isstart4(c) (((c) & 0xF8) == 0xF0) // 11110xxx
+#define iscont(c)   (((c) & 0xC0) == 0x80) // 10xxxxxx
+
+uint32_t UTF8NextChar(const char *str, size_t *offsetp)
+{
+    size_t pos = *offsetp;
+    const uint8_t *input = (const uint8_t *)str;
+
+    // If we're part-way through a multi-byte character, skip to the beginning of the next one
+    while ((input[pos] != 0) && iscont(input[pos]))
+        pos++;
+
+    // End of string
+    if (input[pos] == 0) {
+        *offsetp = pos;
+        return 0;
+    }
+
+    if (isstart1(input[pos])) {
+        // 1-byte character: 0xxxxxxx
+        uint32_t a = input[pos];
+        *offsetp = pos+1;
+        return a;
+    }
+    else if (isstart2(input[pos]) && iscont(input[pos+1])) {
+        // 2-byte character: 110xxxxx 10xxxxxx
+        uint32_t a = input[pos+1] & 0x3F; // a = bits 0-5
+        uint32_t b = input[pos+0] & 0x1F; // b = bits 6-10
+        *offsetp = pos+2;
+        return (a << 0) | (b << 6);
+    }
+    else if (isstart3(input[pos]) && iscont(input[pos+1]) && iscont(input[pos+2])) {
+        // 3-byte character: 1110xxxx 10xxxxxx 10xxxxxx
+        uint32_t a = input[pos+2] & 0x3F; // a = bits 0-5
+        uint32_t b = input[pos+1] & 0x3F; // b = bits 6-11
+        uint32_t c = input[pos+0] & 0x0F; // c = bits 12-15
+        *offsetp = pos+3;
+        return (a << 0) | (b << 6) | (c << 12);
+    }
+    else if (isstart4(input[pos]) && iscont(input[pos+1]) && iscont(input[pos+2]) && iscont(input[pos+3])) {
+        // 4-byte character: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+        uint32_t a = input[pos+3] & 0x3F; // a = bits 0-5
+        uint32_t b = input[pos+2] & 0x3F; // b = bits 6-11
+        uint32_t c = input[pos+1] & 0x3F; // c = bits 12-17
+        uint32_t d = input[pos+0] & 0x07; // d = bits 18-20
+        *offsetp = pos+4;
+        return (a << 0) | (b << 6) | (c << 12) | (d << 18);
+    }
+    else {
+        // Invalid UTF8 byte sequence
+        *offsetp = pos;
+        return 0;
+    }
+}

diff --git a/experiments/flat/src/Util.h b/experiments/flat/src/Util.h
new file mode 100644
index 0000000..eb46072
--- /dev/null
+++ b/experiments/flat/src/Util.h

@@ -0,0 +1,23 @@
+// Licensed to the Apache Software Foundation (ASF) under one
+// or more contributor license agreements.  See the NOTICE file
+// distributed with this work for additional information
+// regarding copyright ownership.  The ASF licenses this file
+// to you under the Apache License, Version 2.0 (the
+// "License"); you may not use this file except in compliance
+// with the License.  You may obtain a copy of the License at
+//
+//   http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing,
+// software distributed under the License is distributed on an
+// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+// KIND, either express or implied.  See the License for the
+// specific language governing permissions and limitations
+// under the License.
+
+#pragma once
+
+#include <sys/types.h>
+#include <stdint.h>
+
+uint32_t UTF8NextChar(const char *str, size_t *offsetp);
commit	329722fc1ef43989900b9b09b758987c20879e01	[log] [tgz]
author	Peter Kelly <peter@uxproductivity.com>	Tue Jul 14 18:36:49 2015 +0700
committer	Peter Kelly <peter@uxproductivity.com>	Tue Jul 14 18:36:49 2015 +0700
tree	0338718de734d5ac724a075858524ef3dca5689c
parent	aa7176ab9b2a1ced56d32de2d6a30a9a0bbbb452 [diff]