blob: 2fb7b13de5b3dd5672bf8a95c2759bf125afed5d [file] [log] [blame]
// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements. See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership. The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License. You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied. See the License for the
// specific language governing permissions and limitations
// under the License.
#include "Parser.h"
#include "Util.h"
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <assert.h>
typedef struct Parser Parser;
struct Parser {
Grammar *gram;
const char *input;
int start;
int end;
int pos;
};
static Term *parseExpr(Parser *p, Expression *expr)
{
int startPos = p->pos;
switch (ExpressionKind(expr)) {
case ChoiceExpr:
for (int i = 0; i < ExprChoiceCount(expr); i++) {
Term *term = parseExpr(p,ExprChoiceChildAt(expr,i));
// If parsing of a choice succeeds, we return immediately.
if (term != NULL)
return TermNew(expr,startPos,p->pos,TermListNew(term,NULL)); // Success
// If parsing of a choice fails, we reset the current position, and continue on with
// the next choice (if any). If there are no more choices, the loop complets and
// we return NULL.
p->pos = startPos;
}
// If we get here, none of the choices have matched, so evaluation of the
// choice expression as a whole fails.
return NULL; // Failure
case SequenceExpr: {
TermList *list = NULL;
TermList **listEnd = &list;
for (int i = 0; i < ExprSequenceCount(expr); i++) {
Term *term = parseExpr(p,ExprSequenceChildAt(expr,i));
// If parsing of a sequence item fails, the sequence as a whole fails. We reset
// the current position to the start.
if (term == NULL) {
p->pos = startPos;
return NULL; // Failure
}
// If parsing of a sequence item succeeds, we append it to the list of
// accumulated child terms, and continue with the next item.
TermListPtrAppend(&listEnd,term);
}
// If we get here, all items in the sequence have matched, and evaluation succeeds,
// returning a term with children comprising the parse results of all items.
return TermNew(expr,startPos,p->pos,list); // Success
}
case AndExpr: {
// Evaluate the child expression to see if it succeeds, but reset the position since
// this is a predicate which is not supposed to consume any input. AndExpr is a
// positive lookahead assertion, which means it succeeds if and only if the child
// evaluation succeeds.
Term *term = parseExpr(p,ExprAndChild(expr));
p->pos = startPos;
if (term != NULL)
return TermNew(expr,startPos,startPos,NULL); // Success
else
return NULL; // Failure
}
case NotExpr: {
// Evaluate the child expression to see if it succeeds, but reset the position since
// this is a predicate which is not supposed to consume any input. NotExpr is a
// *negative* lookahead assertion, which is the opposite of the above; it succeeds
// if and only if the child evaluation *fails*.
Term *term = parseExpr(p,ExprNotChild(expr));
p->pos = startPos;
if (term != NULL)
return NULL; // Failure
else
return TermNew(expr,startPos,startPos,NULL); // Success
}
case OptExpr: {
// An optional expression (? operator) succeeds regardless of whether or not the child
// expression succeeds. If the child did succeed, we return its result as a child of the
// OptExpr result.
Term *term = parseExpr(p,ExprOptChild(expr));
TermList *children;
if (term != NULL)
children = TermListNew(term,NULL);
else
children = NULL;
return TermNew(expr,startPos,p->pos,children); // Success
}
case StarExpr: {
// A zero-or-more expression (* operator) repeatedly matches is child as many times
// as possible, suceeding regardless of the number of matches.
TermList *list = NULL;
TermList **listEnd = &list;
for (;;) {
Term *term = parseExpr(p,ExprStarChild(expr));
if (term == NULL)
break;
TermListPtrAppend(&listEnd,term);
}
return TermNew(expr,startPos,p->pos,list);
}
case PlusExpr: {
// A one-or-more expression (+ operator) operates like a zero-or-match, but fails if
// there is not at least one match
TermList *list = NULL;
TermList **listEnd = &list;
// First make sure we have at least one match. If this parse fails then we have zero
// matches, and thus the plus expression as a whole fails.
Term *term = parseExpr(p,ExprPlusChild(expr));
if (term == NULL)
return NULL; // Failure
TermListPtrAppend(&listEnd,term);
// Now parse any following matches
for (;;) {
Term *term = parseExpr(p,ExprPlusChild(expr));
if (term == NULL)
break;
TermListPtrAppend(&listEnd,term);
}
return TermNew(expr,startPos,p->pos,list); // Success
}
case IdentExpr: {
Term *term = parseExpr(p,ExprIdentTarget(expr));
if (term != NULL)
return TermNew(expr,startPos,p->pos,TermListNew(term,NULL));
else
return NULL;
}
case LitExpr: {
const char *value = ExprLitValue(expr);
int len = (int)strlen(value);
if ((p->pos + len <= p->end) && !memcmp(&p->input[p->pos],value,len)) {
p->pos += len;
return TermNew(expr,startPos,p->pos,NULL);
}
else {
return NULL;
}
}
case ClassExpr:
// Actually identical to ChoiceExpr; we should really merge the two
for (int i = 0; i < ExprClassCount(expr); i++) {
Term *term = parseExpr(p,ExprClassChildAt(expr,i));
// If parsing of a choice succeeds, we return immediately.
if (term != NULL)
return TermNew(expr,startPos,p->pos,TermListNew(term,NULL)); // Success
// If parsing of a choice fails, we reset the current position, and continue on with
// the next choice (if any). If there are no more choices, the loop complets and
// we return NULL.
p->pos = startPos;
}
// If we get here, none of the choices have matched, so evaluation of the
// choice expression as a whole fails.
return NULL; // Failure
case DotExpr: {
size_t offset = p->pos;
int c = UTF8NextChar(p->input,&offset); // FIXME: Should use uint32_t here
if (c == 0)
return NULL;
p->pos = offset;
return TermNew(expr,startPos,p->pos,NULL);
}
case RangeExpr: {
size_t offset = p->pos;
int c = UTF8NextChar(p->input,&offset); // FIXME: Should use uint32_t here
if (c == 0)
return NULL;
if ((c >= ExprRangeStart(expr)) && (c < ExprRangeEnd(expr))) {
p->pos = offset;
return TermNew(expr,startPos,p->pos,NULL);
}
else {
return NULL;
}
}
}
assert(!"unknown expression type");
return NULL;
}
Term *parse(Grammar *gram, const char *rule, const char *input, int start, int end)
{
Expression *rootExpr = GrammarLookup(gram,rule);
if (rootExpr == NULL) {
fprintf(stderr,"%s: No such rule\n",rule);
exit(1);
}
Parser *p = (Parser *)calloc(1,sizeof(Parser));
p->gram = gram;
p->input = input;
p->start = start;
p->end = end;
p->pos = start;
Term *result = parseExpr(p,rootExpr);
free(p);
return result;
}