blob: a23068e334cfda14d3e016674fa3fa4f74c45302 [file] [log] [blame]
/***************************************************************************
*
* scanner.cpp
*
* $Id$
*
***************************************************************************
*
* Licensed to the Apache Software Foundation (ASF) under one or more
* contributor license agreements. See the NOTICE file distributed
* with this work for additional information regarding copyright
* ownership. The ASF licenses this file to you under the Apache
* License, Version 2.0 (the "License"); you may not use this file
* except in compliance with the License. You may obtain a copy of
* the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
* implied. See the License for the specific language governing
* permissions and limitations under the License.
*
* Copyright 2001-2006 Rogue Wave Software.
*
**************************************************************************/
#include "scanner.h"
#include "diagnostic.h"
#include "loc_exception.h"
#include <fstream>
#include <string>
#include <vector>
#include <cassert> // for assert()
#include <climits> // for UCHAR_MAX
#include <cstdlib> // for strtol()
#include <cstring> // for strcmp()
struct ScannerContext
{
ScannerContext (const char*, char = '#', char = '\\');
std::ifstream file; // file stream object
std::string filename; // filename
// comment and escape for current file
char comment_char;
char escape_char;
// current line and column for the scanner
int line;
// current line and position within it
std::string line_;
const char* pos_;
private:
// not defined (not copy constructible or assignable)
ScannerContext (const ScannerContext&);
void operator= (ScannerContext&);
};
/**************************************************************************/
// helpers
static void normal_path (std::string& s)
{
std::string::iterator it(s.begin ());
for (; it != s.end (); it++)
if (*it == '/' || *it == '\\') {
#if defined (_MSC_VER)
*it = '\\';
#else
*it = '/';
#endif
}
}
/**************************************************************************/
// ScannerContext class definitions
ScannerContext::
ScannerContext (const char* name, char cc, char ec)
: file (name), filename (name),
comment_char (cc), escape_char (ec),
line (0) // , column (0)
{
// update current position
pos_ = line_.c_str ();
if (!file.is_open ())
issue_diag (500, true, 0,
"%s could not be opened for reading\n", name);
issue_diag (I_OPENRD, false, 0, "reading %s\n", name);
}
/**************************************************************************/
// Scanner class definitions
Scanner::
Scanner ()
: context_ (0), nlines_ (0), ntokens_ (0), escaped_newline_ (false)
{
// no-op
}
Scanner::
~Scanner()
{
// empty the stack and destroy the current state
delete context_;
while (!context_stack_.empty ()) {
delete context_stack_.top ();
context_stack_.pop ();
}
}
char Scanner::
escape_char () const
{
return context_ ? context_->escape_char : 0;
}
void Scanner::
ignore_line ()
{
while (next_token ().token != tok_nl);
}
void Scanner::
open (std::string name, char cc, char ec)
{
normal_path (name);
if (context_)
context_stack_.push (context_);
try {
context_ = new ScannerContext (name.c_str (), cc, ec);
}
catch (loc_exception&) {
context_ = 0;
if (!context_stack_.empty ()) {
context_ = context_stack_.top ();
context_stack_.pop ();
}
throw;
}
nlines_ = 0;
ntokens_ = 0;
}
void Scanner::
close ()
{
assert (0 != context_);
issue_diag (I_OPENRD, false, 0,
"%s: %u tokens, %u lines\n",
context_->filename.c_str (), ntokens_, nlines_);
delete context_;
if (context_stack_.empty ())
context_ = 0;
else {
context_ = context_stack_.top ();
context_stack_.pop ();
}
}
Scanner::token_id Scanner::
process_token (const char* name)
{
assert (0 != name);
if (*name == context_->escape_char) {
switch (name [1]) {
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
case 'd':
case 'x':
// escaped numeric character value
return tok_char_value;
default:
break;
}
return tok_ndef;
}
// look for a predefined token
static const struct {
const char* name;
Scanner::token_id token;
} tok_map [] = {
// elements must be sorted in ascending order
{ "CHARMAP", tok_charmap },
{ "END", tok_end },
{ "IGNORE", tok_ignore },
{ "LC_ADDRESS", tok_addr },
{ "LC_COLLATE", tok_collate },
{ "LC_CTYPE", tok_ctype },
{ "LC_IDENTIFICATION", tok_ident },
{ "LC_MEASUREMENT", tok_measure },
{ "LC_MESSAGES", tok_messages },
{ "LC_MONETARY", tok_monetary },
{ "LC_NAME", tok_name },
{ "LC_NUMERIC", tok_numeric },
{ "LC_PAPER", tok_paper },
{ "LC_TELEPHONE", tok_phone },
{ "LC_TIME", tok_time },
{ "UNDEFINED", tok_undefined },
{ "WIDTH", tok_width },
{ "abday", tok_abday },
{ "abmon", tok_abmon },
{ "alpha", tok_alpha },
{ "alt_digits", tok_alt_digits },
{ "am_pm", tok_am_pm },
{ "backward", tok_backward },
{ "blank", tok_blank },
{ "cntrl", tok_cntrl },
{ "collating-element", tok_coll_elem },
{ "collating-symbol", tok_coll_sym },
{ "comment_char", tok_comment_char },
{ "copy", tok_copy },
{ "currency_symbol", tok_currency_symbol },
{ "d_fmt", tok_d_fmt },
{ "d_t_fmt", tok_d_t_fmt },
{ "day", tok_day },
{ "decimal_point", tok_decimal_point },
{ "digit", tok_digit },
{ "era", tok_era },
{ "era_d_fmt", tok_era_d_fmt },
{ "era_d_t_fmt", tok_era_d_t_fmt },
{ "era_t_fmt", tok_era_t_fmt },
{ "escape_char", tok_escape_char },
{ "falsename", tok_falsename },
{ "forward", tok_forward },
{ "frac_digits", tok_frac_digits },
{ "from", tok_from },
{ "graph", tok_graph },
{ "grouping", tok_grouping },
{ "include", tok_include },
{ "int_curr_symbol", tok_int_curr_symbol },
{ "int_frac_digits", tok_int_frac_digits },
{ "int_n_cs_precedes", tok_int_n_cs_precedes },
{ "int_n_sep_by_space", tok_int_n_sep_by_space },
{ "int_n_sign_posn", tok_int_n_sign_posn },
{ "int_p_cs_precedes", tok_int_p_cs_precedes },
{ "int_p_sep_by_space", tok_int_p_sep_by_space },
{ "int_p_sign_posn", tok_int_p_sign_posn },
{ "lower", tok_lower },
{ "mon", tok_mon },
{ "mon_decimal_point", tok_mon_decimal_point },
{ "mon_grouping", tok_mon_grouping },
{ "mon_thousands_sep", tok_mon_thousands_sep },
{ "n_cs_precedes", tok_n_cs_precedes },
{ "n_sep_by_space", tok_n_sep_by_space },
{ "n_sign_posn", tok_n_sign_posn },
{ "negative_sign", tok_negative_sign },
{ "noexpr", tok_noexpr },
{ "order_end", tok_order_end },
{ "order_start", tok_order_start },
{ "p_cs_precedes", tok_p_cs_precedes },
{ "p_sep_by_space", tok_p_sep_by_space },
{ "p_sign_posn", tok_p_sign_posn },
{ "position", tok_position },
{ "positive_sign", tok_positive_sign },
{ "print", tok_print },
{ "punct", tok_punct },
{ "reorder-after", tok_reorder },
{ "reorder-end", tok_reorder_end },
{ "reorder-section-after", tok_reorder_section },
{ "reorder-section-end", tok_reorder_section_end },
{ "script", tok_script },
{ "space", tok_space },
{ "t_fmt", tok_t_fmt },
{ "t_fmt_ampm", tok_t_fmt_ampm },
{ "thousands_sep", tok_thousands_sep },
{ "tolower", tok_tolower },
{ "toupper", tok_toupper },
{ "translit_end", tok_xlit_end },
{ "translit_start", tok_xlit_start },
{ "truename", tok_truename },
{ "upper", tok_upper },
{ "xdigit", tok_xdigit },
{ "yesexpr", tok_yesexpr }
};
int low = 0;
int high = sizeof tok_map / sizeof *tok_map - 1;
// this loop implements a binary search to find 'name' in the
// tok_map list and when found returns the token value.
while (low <= high) {
const int cur = (low + high) / 2;
const int cmp = std::strcmp (name, tok_map [cur].name);
if (0 == cmp)
return tok_map [cur].token;
if (cmp < 0)
high = cur - 1;
else
low = cur + 1;
}
return tok_ndef;
}
void Scanner::
read_line ()
{
context_->line_.clear ();
std::getline (context_->file, context_->line_);
context_->line_ += '\n';
context_->pos_ = context_->line_.c_str ();
++context_->line;
// context_->column = 0;
++nlines_;
assert (context_->line_.size ());
}
Scanner::token_t Scanner::
next_token ()
{
assert (0 != context_);
assert (context_->file.is_open ());
// token
token_t next_tok;
next_tok.name = "";
next_tok.token = tok_ndef;
next_tok.line = 0;
next_tok.column = 0;
next_tok.file = 0;
while (true) {
// store the *current* file name
next_tok.file = context_->filename.c_str ();
// the assert above for eof checks if the caller has lost it;
if (context_->file.eof ()) {
next_tok.token = tok_end_tokens;
return next_tok;
}
// if we exhausted the current line, advance
if ( context_->line_.size ()
<= std::size_t (context_->pos_ - context_->line_.c_str ())) {
read_line ();
}
// line and column for the token start; they are set at each
// iteration; the finding of a token breaks and next_tok leaves
// this loop having the line/col info
next_tok.line = context_->line;
next_tok.column = context_->pos_ - context_->line_.c_str ();
// plug in the pointer to current position
const char*& next = context_->pos_;
if (*next != context_->comment_char)
escaped_newline_ = false;
if (*next == '<') {
// beginning of a symbolic name or keyword
const char* tok_begin = next++;
for (; '>' != *next; ++next) {
// if has an escaped close angular, pass
if (*next == context_->escape_char) {
// append symbol name up to but not including the escape
next_tok.name.append (tok_begin, next - tok_begin);
// advance the next pointer to skip the escape
tok_begin = ++next;
}
else if ('\n' == *next) {
// past the end of the line
issue_diag (E_SYNTAX, true, &next_tok,
" unterminated symbolic name\n");
break;
}
}
next_tok.name.append (tok_begin, ++next - tok_begin);
// check the name fetched so far
if (next_tok.name == "<code_set_name>") {
next_tok.token = tok_code_set_name;
}
else if ( next_tok.name == "<escape_char>"
|| next_tok.name == "<comment_char>") {
// eat away spaces
while (' ' == *next || '\t' == *next) {
++next;
}
// test for end of line
if (*next == '\n')
issue_diag (E_SYNTAX, true, &next_tok,
"missing value for %s\n",
next_tok.name.c_str ());
// store character
if (next_tok.name == "<escape_char>")
context_->escape_char = *next;
else
context_->comment_char = *next;
// adjust positions;
context_->pos_ =
context_->line_.c_str () + context_->line_.size ();
// set token to a newline
next_tok.name = "";
next_tok.token = tok_nl;
}
else if (next_tok.name == "<mb_cur_max>") {
next_tok.token = tok_mb_cur_max;
}
else if (next_tok.name == "<mb_cur_min>") {
next_tok.token = tok_mb_cur_min;
}
else {
next_tok.token = tok_sym_name;
}
break;
}
else if (*next == ' ' || *next == '\t' || *next == ';') {
// ignore whitespace and separators
while (*next == ' ' || *next == '\t' || *next == ';') {
++next;
}
}
else if (*next == '\n') {
++next;
next_tok.token = tok_nl;
break;
}
else if (*next == context_->comment_char) {
// start of a comment - check as early as necessary
// adjust to end of line
context_->pos_ = context_->line_.c_str () + context_->line_.size ();
if (escaped_newline_)
continue;
next_tok.token = tok_nl;
next_tok.name = "\n";
break;
}
else if (*next == '(') {
// push open parenthesis
next_tok.name.push_back (*next++);
// start of a grouping
while (*next != ')') {
// contains a symbolic name
if (*next == '<') {
// push open angular parenthesis
next_tok.name.push_back (*next++);
while (*next != '\n') {
// if has an escaped close angular, pass
if (next [0] == context_->escape_char) {
next_tok.name.push_back (*next++);
next_tok.name.push_back (*next++);
continue;
}
// if we have reached the end of the sym name
if (*next == '>') {
next_tok.name.push_back (*next);
break;
}
// still inside the sym name/keyword
next_tok.name.push_back (*next++);
}
// check if we have gone past the end of the line
if (*next == '\n')
issue_diag (E_SYNTAX, true, &next_tok,
" unterminated symbolic name");
++next;
}
else {
// fetch the character
next_tok.name.push_back (*next++);
}
if (*next == '\n')
issue_diag (E_SYNTAX, true, &next_tok,
" unterminated grouping ");
}
next_tok.name.push_back (*next++);
next_tok.token = tok_grouping;
break;
}
else if (*next == '.') {
// ellipsis (see ISO/IEC TR 14652)
int ellipsis_count = 0;
// start of an interval
while (*next == '.') {
next_tok.name.push_back (*next++);
++ellipsis_count;
}
switch (ellipsis_count) {
case 2: {
const char* tmp = next;
if (*tmp++ == '(' && *tmp++ == '2' && *tmp++ == ')'
&& *tmp++ == '.' && *tmp++ == '.') {
// double increment hexadecimal symbolic ellipsis
next_tok.token = tok_dbl_ellipsis;
next = tmp;
}
else {
// hexadecimal symbolic ellipsis
next_tok.token = tok_hex_ellipsis;
}
break;
}
case 3:
// absolute symbolic ellipsis
next_tok.token = tok_abs_ellipsis;
break;
case 4:
// decimal symbolic ellipsis
next_tok.token = tok_dec_ellipsis;
break;
default:
issue_diag (E_SYNTAX, true, &next_tok, "illegal ellipsis\n");
}
break;
}
else if (*next == '\"') {
// start of a string
next_tok.name.push_back (*next++);
const char ec = context_->escape_char;
while (next[0] != '\n') {
// escaped newline; continue
if (next [0] == ec && next [1] == '\n') {
read_line ();
continue;
}
// escaped quote
if (next[0] == ec) {
next_tok.name.push_back (*next++);
next_tok.name.push_back (*next++);
continue;
}
if (next [0] == '\"') {
next_tok.name.push_back (*next);
break;
}
// still inside the string
next_tok.name.push_back (*next++);
}
// test for closure
if (*next == '\n')
issue_diag (E_SYNTAX, true, &next_tok, "unterminated string");
++next;
next_tok.token = tok_string;
break;
}
else if (*next == context_->escape_char) {
// start of an escape sequence
// escaped new line
if (next [1] == '\n') {
// adjust to end of line
context_->pos_ =
context_->line_.c_str () + context_->line_.size ();
escaped_newline_ = true;
continue;
}
// or
while ( *next != ' ' && *next != '\t'
&& *next != ';' && *next != '\n') {
next_tok.name.push_back (*next++);
}
// retrieve token based on value
next_tok.token = process_token (next_tok.name.c_str ());
break;
}
else {
// the rest of it
for (const char ec = context_->escape_char; ; ) {
// stop at esc-newline or at first "separator"
if ( (next [0] == ec && next [1] == '\n')
|| next [0] == ' '
|| next [0] == '\t'
|| next [0] == '\n'
|| next [0] == ';') {
// continuation of a line, separators
break;
}
// fetch characters
next_tok.name.push_back (*next++);
}
// assert length of input
assert (next_tok.name.size ());
// it wasn't a locale definition keyword so call process_token
// and add the result to the list
next_tok.token = process_token (next_tok.name.c_str ());
// an extra bit of processing since we keep comment and escape
// characters in the scanner for a faster processing
if ( next_tok.token == tok_escape_char
|| next_tok.token == tok_comment_char) {
// eat away spaces
while (' ' == *next || '\t' == *next) {
++next;
}
// test for end of line
if (*next == '\n')
issue_diag (E_SYNTAX, true, &next_tok,
"unterminated statement");
// store character
if (next_tok.token == tok_escape_char)
context_->escape_char = next [0];
else
context_->comment_char = next [0];
// adjust positions;
context_->pos_ =
context_->line_.c_str () + context_->line_.size ();
// return the token
next_tok.name = "";
next_tok.token = tok_nl;
}
break;
}
}
++ntokens_;
return next_tok;
}
unsigned long Scanner::
convert_escape (const char *esc,
const char **pend /* = 0 */,
bool multi /* = false */) const
{
assert (0 != esc);
const char escape = escape_char ();
if (escape != *esc)
issue_diag (E_SYNTAX, true, 0,
"expected the escape character ('%c'), got \"%s\"\n",
escape, esc);
unsigned long value = 0;
for (const char *s = esc; ; ) {
// escaped characters are octal by default
const char *basename = "octal";
int base = 8;
switch (*++s) {
case 'd': ++s; base = 10; basename = "decimal"; break;
case 'x': ++s; base = 16; basename = "hexadecimal"; break;
case 'o': ++s;
case '0': case '1': case '2': case '3':
case '4': case '5': case '6': case '7':
break;
default:
issue_diag (E_SYNTAX, true, 0,
"one of { 'o', 'd', 'x' } expected following "
"the escape character: %s\n", esc);
}
char *end = 0;
const unsigned long byte = std::strtoul (s, &end, base);
if (pend)
*pend = end;
// cast away constness below to work around an MSVC 7.0 bug:
// causing error C2446: '==' : no conversion from 'char ** '
// to 'const char ** ' Conversion loses qualifiers
if (!multi && _RWSTD_CONST_CAST (char**, pend) == &end && **pend)
issue_diag (E_SYNTAX, true, 0,
"%s constant expected: %s\n", basename, esc);
if (UCHAR_MAX < byte)
issue_diag (E_INVAL, true, 0,
"%s byte value must be in the range [0, %d]: %s\n",
basename, int (UCHAR_MAX), esc);
if (value >> (sizeof (unsigned long) - 1) * CHAR_BIT)
issue_diag (E_INVAL, true, 0, "integer overflow: %s\n", esc);
value = (value << CHAR_BIT) | byte;
if (**pend != escape || !multi)
break;
s = *pend;
}
return value;
}