util/scanner.cpp - stdcxx - Git at Google

 /***************************************************************************
  *
  * scanner.cpp
  *
  * $Id$
  *
  ***************************************************************************
  *
  * Licensed to the Apache Software  Foundation (ASF) under one or more
  * contributor  license agreements.  See  the NOTICE  file distributed
  * with  this  work  for  additional information  regarding  copyright
  * ownership.   The ASF  licenses this  file to  you under  the Apache
  * License, Version  2.0 (the  "License"); you may  not use  this file
  * except in  compliance with the License.   You may obtain  a copy of
  * the License at
  *
  * http://www.apache.org/licenses/LICENSE-2.0
  *
  * Unless required by applicable law or agreed to in writing, software
  * distributed under the  License is distributed on an  "AS IS" BASIS,
  * WITHOUT  WARRANTIES OR CONDITIONS  OF ANY  KIND, either  express or
  * implied.   See  the License  for  the  specific language  governing
  * permissions and limitations under the License.
  *
  * Copyright 2001-2006 Rogue Wave Software.
  *
  **************************************************************************/

 #include "scanner.h"

 #include "diagnostic.h"
 #include "loc_exception.h"

 #include <fstream>
 #include <string>
 #include <vector>

 #include <cassert>   // for assert()
 #include <climits>   // for UCHAR_MAX
 #include <cstdlib>   // for strtol()
 #include <cstring>   // for strcmp()


 struct ScannerContext
 {
     ScannerContext (const char*, char = '#', char = '\\');

     std::ifstream file;		// file stream object
     std::string   filename;	// filename

     // comment and escape for current file
     char comment_char;
     char escape_char;

     // current line and column for the scanner
     int line;

     // current line and position within it
     std::string line_;
     const char* pos_;

 private:
     // not defined (not copy constructible or assignable)
     ScannerContext (const ScannerContext&);
     void operator= (ScannerContext&);
 };

 /**************************************************************************/
 // helpers

 static void normal_path (std::string& s)
 {
     std::string::iterator it(s.begin ());

     for (; it != s.end (); it++)
         if (*it == '/' || *it == '\\') {
 #if defined (_MSC_VER)
             *it = '\\';
 #else
             *it = '/';
 #endif
         }
 }

 /**************************************************************************/
 // ScannerContext class definitions

 ScannerContext::
 ScannerContext (const char* name, char cc, char ec)
     : file (name),  filename (name),
       comment_char (cc), escape_char (ec),
     line (0) // , column (0)
 {
     // update current position
     pos_ = line_.c_str ();

     if (!file.is_open ())
         issue_diag (500, true, 0,
                     "%s could not be opened for reading\n", name);

     issue_diag (I_OPENRD, false, 0, "reading %s\n", name);
 }

 /**************************************************************************/
 // Scanner class definitions

 Scanner::
 Scanner ()
     : context_ (0), nlines_ (0), ntokens_ (0), escaped_newline_ (false)
 {
     // no-op
 }


 Scanner::
 ~Scanner()
 {
     // empty the stack and destroy the current state
     delete context_;

     while (!context_stack_.empty ()) {
         delete context_stack_.top ();
         context_stack_.pop ();
     }
 }


 char Scanner::
 escape_char () const
 {
     return context_ ? context_->escape_char : 0;
 }

 void Scanner::
 ignore_line ()
 {
     while (next_token ().token != tok_nl);
 }


 void Scanner::
 open (std::string name, char cc, char ec)
 {
     normal_path (name);

     if (context_)
         context_stack_.push (context_);

     try {
         context_ = new ScannerContext (name.c_str (), cc, ec);
     }
     catch (loc_exception&) {
         context_ = 0;

         if (!context_stack_.empty ()) {
             context_ = context_stack_.top ();
             context_stack_.pop ();
         }

         throw;
     }

     nlines_ = 0;
     ntokens_ = 0;
 }


 void Scanner::
 close ()
 {
     assert (0 != context_);

     issue_diag (I_OPENRD, false, 0,
                 "%s: %u tokens, %u lines\n",
                 context_->filename.c_str (), ntokens_, nlines_);

     delete context_;

     if (context_stack_.empty ())
         context_ = 0;
     else {
         context_ = context_stack_.top ();
         context_stack_.pop ();
     }
 }


 Scanner::token_id Scanner::
 process_token (const char* name)
 {
     assert (0 != name);

     if (*name == context_->escape_char) {
         switch (name [1]) {
         case '0': case '1': case '2': case '3':
         case '4': case '5': case '6': case '7':
         case 'd':
         case 'x':
             // escaped numeric character value
             return tok_char_value;
         default:
             break;
         }

         return tok_ndef;
     }

     // look for a predefined token

     static const struct {
         const char*       name;
         Scanner::token_id token;
     } tok_map [] = {
         // elements must be sorted in ascending order
         { "CHARMAP", tok_charmap },
         { "END", tok_end },
         { "IGNORE", tok_ignore },
         { "LC_ADDRESS", tok_addr },
         { "LC_COLLATE", tok_collate },
         { "LC_CTYPE", tok_ctype },
         { "LC_IDENTIFICATION", tok_ident },
         { "LC_MEASUREMENT", tok_measure },
         { "LC_MESSAGES", tok_messages },
         { "LC_MONETARY", tok_monetary },
         { "LC_NAME", tok_name },
         { "LC_NUMERIC", tok_numeric },
         { "LC_PAPER", tok_paper },
         { "LC_TELEPHONE", tok_phone },
         { "LC_TIME", tok_time },
         { "UNDEFINED", tok_undefined },
         { "WIDTH", tok_width },
         { "abday", tok_abday },
         { "abmon", tok_abmon },
         { "alpha", tok_alpha },
         { "alt_digits", tok_alt_digits },
         { "am_pm", tok_am_pm },
         { "backward", tok_backward },
         { "blank", tok_blank },
         { "cntrl", tok_cntrl },
         { "collating-element", tok_coll_elem },
         { "collating-symbol", tok_coll_sym },
         { "comment_char", tok_comment_char },
         { "copy", tok_copy },
         { "currency_symbol", tok_currency_symbol },
         { "d_fmt", tok_d_fmt },
         { "d_t_fmt", tok_d_t_fmt },
         { "day", tok_day },
         { "decimal_point", tok_decimal_point },
         { "digit", tok_digit },
         { "era", tok_era },
         { "era_d_fmt", tok_era_d_fmt },
         { "era_d_t_fmt", tok_era_d_t_fmt },
         { "era_t_fmt", tok_era_t_fmt },
         { "escape_char", tok_escape_char },
         { "falsename", tok_falsename },
         { "forward", tok_forward },
         { "frac_digits", tok_frac_digits },
         { "from", tok_from },
         { "graph", tok_graph },
         { "grouping", tok_grouping },
         { "include", tok_include },
         { "int_curr_symbol", tok_int_curr_symbol },
         { "int_frac_digits", tok_int_frac_digits },
         { "int_n_cs_precedes", tok_int_n_cs_precedes },
         { "int_n_sep_by_space", tok_int_n_sep_by_space },
         { "int_n_sign_posn", tok_int_n_sign_posn },
         { "int_p_cs_precedes", tok_int_p_cs_precedes },
         { "int_p_sep_by_space", tok_int_p_sep_by_space },
         { "int_p_sign_posn", tok_int_p_sign_posn },
         { "lower", tok_lower },
         { "mon", tok_mon },
         { "mon_decimal_point", tok_mon_decimal_point },
         { "mon_grouping", tok_mon_grouping },
         { "mon_thousands_sep", tok_mon_thousands_sep },
         { "n_cs_precedes", tok_n_cs_precedes },
         { "n_sep_by_space", tok_n_sep_by_space },
         { "n_sign_posn", tok_n_sign_posn },
         { "negative_sign", tok_negative_sign },
         { "noexpr", tok_noexpr },
         { "order_end", tok_order_end },
         { "order_start", tok_order_start },
         { "p_cs_precedes", tok_p_cs_precedes },
         { "p_sep_by_space", tok_p_sep_by_space },
         { "p_sign_posn", tok_p_sign_posn },
         { "position", tok_position },
         { "positive_sign", tok_positive_sign },
         { "print", tok_print },
         { "punct", tok_punct },
         { "reorder-after", tok_reorder },
         { "reorder-end", tok_reorder_end },
         { "reorder-section-after", tok_reorder_section },
         { "reorder-section-end", tok_reorder_section_end },
         { "script", tok_script },
         { "space", tok_space },
         { "t_fmt", tok_t_fmt },
         { "t_fmt_ampm", tok_t_fmt_ampm },
         { "thousands_sep", tok_thousands_sep },
         { "tolower", tok_tolower },
         { "toupper", tok_toupper },
         { "translit_end", tok_xlit_end },
         { "translit_start", tok_xlit_start },
         { "truename", tok_truename },
         { "upper", tok_upper },
         { "xdigit", tok_xdigit },
         { "yesexpr", tok_yesexpr }
     };

     int low  = 0;
     int high = sizeof tok_map / sizeof *tok_map - 1;

     // this loop implements a binary search to find 'name' in the
     // tok_map list and when found returns the token value.
     while (low <= high) {

         const int cur = (low + high) / 2;

         const int cmp = std::strcmp (name, tok_map [cur].name);
         if (0 == cmp)
             return tok_map [cur].token;

         if (cmp < 0)
             high = cur - 1;
         else
             low = cur + 1;
     }

     return tok_ndef;
 }


 void Scanner::
 read_line ()
 {
     context_->line_.clear ();

     std::getline (context_->file, context_->line_);

     context_->line_ += '\n';

     context_->pos_ = context_->line_.c_str ();

     ++context_->line;
     // context_->column = 0;

     ++nlines_;

     assert (context_->line_.size ());
 }


 Scanner::token_t Scanner::
 next_token ()
 {
     assert (0 != context_);
     assert (context_->file.is_open ());

     // token
     token_t next_tok;

     next_tok.name   = "";
     next_tok.token  = tok_ndef;
     next_tok.line   = 0;
     next_tok.column = 0;
     next_tok.file   = 0;

     while (true) {

         // store the *current* file name
         next_tok.file = context_->filename.c_str ();

         // the assert above for eof checks if the caller has lost it;
         if (context_->file.eof ()) {
             next_tok.token = tok_end_tokens;
             return next_tok;
         }

         // if we exhausted the current line, advance
         if (   context_->line_.size ()
             <= std::size_t (context_->pos_ - context_->line_.c_str ())) {
             read_line ();
         }

         // line and column for the token start; they are set at each
         // iteration; the finding of a token breaks and next_tok leaves
         // this loop having the line/col info
         next_tok.line   = context_->line;
         next_tok.column = context_->pos_ - context_->line_.c_str ();

         // plug in the pointer to current position
         const char*& next = context_->pos_;

         if (*next != context_->comment_char)
             escaped_newline_ = false;

         if (*next == '<') {
             // beginning of a symbolic name or keyword
             const char* tok_begin = next++;

             for (; '>' != *next; ++next) {

                 // if has an escaped close angular, pass
                 if (*next == context_->escape_char) {

                     // append symbol name up to but not including the escape
                     next_tok.name.append (tok_begin, next - tok_begin);

                     // advance the next pointer to skip the escape
                     tok_begin = ++next;
                 }
                 else if ('\n' == *next) {
                     // past the end of the line
                     issue_diag (E_SYNTAX, true, &next_tok,
                                 " unterminated symbolic name\n");
                     break;
                 }
             }

             next_tok.name.append (tok_begin, ++next - tok_begin);

             // check the name fetched so far
             if (next_tok.name == "<code_set_name>") {
                 next_tok.token = tok_code_set_name;
             }
             else if (   next_tok.name == "<escape_char>"
                      || next_tok.name == "<comment_char>") {

                 // eat away spaces
                 while (' ' == *next || '\t' == *next) {
                     ++next;
                 }

                 // test for end of line
                 if (*next == '\n')
                     issue_diag (E_SYNTAX, true, &next_tok,
                                 "missing value for %s\n",
                                 next_tok.name.c_str ());

                 // store character
                 if (next_tok.name == "<escape_char>")
                     context_->escape_char = *next;
                 else
                     context_->comment_char = *next;

                 // adjust positions;
                 context_->pos_ =
                     context_->line_.c_str () + context_->line_.size ();

                 // set token to a newline
                 next_tok.name = "";
                 next_tok.token = tok_nl;
             }
             else if (next_tok.name == "<mb_cur_max>") {
                 next_tok.token = tok_mb_cur_max;
             }
             else if (next_tok.name == "<mb_cur_min>") {
                 next_tok.token = tok_mb_cur_min;
             }
             else {
                 next_tok.token = tok_sym_name;
             }
             break;
         }
         else if (*next == ' ' || *next == '\t' || *next == ';') {
             // ignore whitespace and separators
             while (*next == ' ' || *next == '\t' || *next == ';') {
                 ++next;
             }
         }
         else if (*next == '\n') {
             ++next;
             next_tok.token = tok_nl;
             break;
         }
         else if (*next == context_->comment_char) {
             // start of a comment - check as early as necessary
             // adjust to end of line
             context_->pos_ = context_->line_.c_str () + context_->line_.size ();

             if (escaped_newline_)
                 continue;

             next_tok.token = tok_nl;
             next_tok.name = "\n";
             break;
         }
         else if (*next == '(') {
             // push open parenthesis
             next_tok.name.push_back (*next++);

             // start of a grouping
             while (*next != ')') {
                 // contains a symbolic name
                 if (*next == '<') {
                     // push open angular parenthesis
                     next_tok.name.push_back (*next++);

                     while (*next != '\n') {
                         // if has an escaped close angular, pass
                         if (next [0] == context_->escape_char) {
                             next_tok.name.push_back (*next++);
                             next_tok.name.push_back (*next++);
                             continue;
                         }

                         // if we have reached the end of the sym name
                         if (*next == '>') {
                             next_tok.name.push_back (*next);
                             break;
                         }

                         // still inside the sym name/keyword
                         next_tok.name.push_back (*next++);
                     }

                     // check if we have gone past the end of the line
                     if (*next == '\n')
                         issue_diag (E_SYNTAX, true, &next_tok,
                                     " unterminated symbolic name");

                     ++next;
                 }
                 else {
                     // fetch the character
                     next_tok.name.push_back (*next++);
                 }

                 if (*next == '\n')
                     issue_diag (E_SYNTAX, true, &next_tok,
                                 " unterminated grouping ");
             }

             next_tok.name.push_back (*next++);
             next_tok.token = tok_grouping;
             break;
         }
         else if (*next == '.') {
             // ellipsis (see ISO/IEC TR 14652)
             int ellipsis_count = 0;
             // start of an interval
             while (*next == '.') {
                 next_tok.name.push_back (*next++);
                 ++ellipsis_count;
             }

             switch (ellipsis_count) {
             case 2: {
                 const char* tmp = next;
                 if (*tmp++ == '(' && *tmp++ == '2' && *tmp++ == ')'
                     && *tmp++ == '.' && *tmp++ == '.') {
                     // double increment hexadecimal symbolic ellipsis
                     next_tok.token = tok_dbl_ellipsis;
                     next = tmp;
                 }
                 else {
                     // hexadecimal symbolic ellipsis
                     next_tok.token = tok_hex_ellipsis;
                 }
                 break;
             }

             case 3:
                 // absolute symbolic ellipsis
                 next_tok.token = tok_abs_ellipsis;
                 break;

             case 4:
                 // decimal symbolic ellipsis
                 next_tok.token = tok_dec_ellipsis;
                 break;

             default:
                 issue_diag (E_SYNTAX, true, &next_tok, "illegal ellipsis\n");
             }
             break;

         }
         else if (*next == '\"') {

             // start of a string
             next_tok.name.push_back (*next++);
             const char ec = context_->escape_char;

             while (next[0] != '\n') {

                 // escaped newline; continue
                 if (next [0] == ec && next [1] == '\n') {
                     read_line ();
                     continue;
                 }

                 // escaped quote
                 if (next[0] == ec) {
                     next_tok.name.push_back (*next++);
                     next_tok.name.push_back (*next++);
                     continue;
                 }

                 if (next [0] == '\"') {
                     next_tok.name.push_back (*next);
                     break;
                 }

                 // still inside the string
                 next_tok.name.push_back (*next++);
             }

             // test for closure
             if (*next == '\n')
                 issue_diag (E_SYNTAX, true, &next_tok, "unterminated string");

             ++next;
             next_tok.token = tok_string;
             break;

         }
         else if (*next == context_->escape_char) {
             // start of an escape sequence
             // escaped new line
             if (next [1] == '\n') {
                 // adjust to end of line
                 context_->pos_ =
                     context_->line_.c_str () + context_->line_.size ();

                 escaped_newline_ = true;
                 continue;
             }

             // or
             while (   *next != ' ' && *next != '\t'
                    && *next != ';' && *next != '\n') {
                 next_tok.name.push_back (*next++);
             }

             // retrieve token based on value
             next_tok.token = process_token (next_tok.name.c_str ());
             break;
         }
         else {
             // the rest of it
             for (const char ec = context_->escape_char; ; ) {

                 // stop at esc-newline or at first "separator"
                 if (   (next [0] == ec && next [1] == '\n')
                     || next [0] == ' '
                     || next [0] == '\t'
                     || next [0] == '\n'
                     || next [0] == ';') {
                     // continuation of a line, separators
                     break;
                 }

                 // fetch characters
                 next_tok.name.push_back (*next++);
             }

             // assert length of input
             assert (next_tok.name.size ());

             // it wasn't a locale definition keyword so call process_token
             // and add the result to the list
             next_tok.token = process_token (next_tok.name.c_str ());


             // an extra bit of processing since we keep comment and escape
             // characters in the scanner for a faster processing
             if (   next_tok.token == tok_escape_char
                 || next_tok.token == tok_comment_char) {

                 // eat away spaces
                 while (' ' == *next || '\t' == *next) {
                     ++next;
                 }

                 // test for end of line
                 if (*next == '\n')
                     issue_diag (E_SYNTAX, true, &next_tok,
                                 "unterminated statement");

                 // store character
                 if (next_tok.token == tok_escape_char)
                     context_->escape_char = next [0];
                 else
                     context_->comment_char = next [0];

                 // adjust positions;
                 context_->pos_ =
                     context_->line_.c_str () + context_->line_.size ();

                 // return the token
                 next_tok.name = "";
                 next_tok.token = tok_nl;
             }

             break;
         }
     }

     ++ntokens_;

     return next_tok;
 }


 unsigned long Scanner::
 convert_escape (const char  *esc,
                 const char **pend  /* = 0 */,
                 bool         multi /* = false */) const
 {
     assert (0 != esc);

     const char escape = escape_char ();

     if (escape != *esc)
         issue_diag (E_SYNTAX, true, 0,
                     "expected the escape character ('%c'), got \"%s\"\n",
                     escape, esc);

     unsigned long value = 0;

     for (const char *s = esc; ; ) {

         // escaped characters are octal by default
         const char *basename = "octal";
         int         base     = 8;

         switch (*++s) {
         case 'd': ++s; base = 10; basename = "decimal"; break;
         case 'x': ++s; base = 16; basename = "hexadecimal"; break;

         case 'o': ++s;
         case '0': case '1': case '2': case '3':
         case '4': case '5': case '6': case '7':
             break;

         default:
             issue_diag (E_SYNTAX, true, 0,
                         "one of { 'o', 'd', 'x' } expected following "
                         "the escape character: %s\n", esc);
         }

         char *end = 0;

         const unsigned long byte = std::strtoul (s, &end, base);

         if (pend)
             *pend = end;

         // cast away constness below to work around an MSVC 7.0 bug:
         // causing error C2446: '==' : no conversion from 'char ** '
         // to 'const char ** ' Conversion loses qualifiers
         if (!multi && _RWSTD_CONST_CAST (char**, pend) == &end && **pend)
             issue_diag (E_SYNTAX, true, 0,
                         "%s constant expected: %s\n", basename, esc);

         if (UCHAR_MAX < byte)
             issue_diag (E_INVAL, true, 0,
                         "%s byte value must be in the range [0, %d]: %s\n",
                         basename, int (UCHAR_MAX), esc);

         if (value >> (sizeof (unsigned long) - 1) * CHAR_BIT)
             issue_diag (E_INVAL, true, 0, "integer overflow: %s\n", esc);

         value = (value << CHAR_BIT) | byte;

         if (**pend != escape || !multi)
             break;

         s = *pend;
     }

     return value;
 }
	/***************************************************************************
	*
	* scanner.cpp
	*
	* $Id$
	*
	***************************************************************************
	*
	* Licensed to the Apache Software Foundation (ASF) under one or more
	* contributor license agreements. See the NOTICE file distributed
	* with this work for additional information regarding copyright
	* ownership. The ASF licenses this file to you under the Apache
	* License, Version 2.0 (the "License"); you may not use this file
	* except in compliance with the License. You may obtain a copy of
	* the License at
	*
	* http://www.apache.org/licenses/LICENSE-2.0
	*
	* Unless required by applicable law or agreed to in writing, software
	* distributed under the License is distributed on an "AS IS" BASIS,
	* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
	* implied. See the License for the specific language governing
	* permissions and limitations under the License.
	*
	* Copyright 2001-2006 Rogue Wave Software.
	*
	**************************************************************************/

	#include "scanner.h"

	#include "diagnostic.h"
	#include "loc_exception.h"

	#include <fstream>
	#include <string>
	#include <vector>

	#include <cassert> // for assert()
	#include <climits> // for UCHAR_MAX
	#include <cstdlib> // for strtol()
	#include <cstring> // for strcmp()


	struct ScannerContext
	{
	ScannerContext (const char*, char = '#', char = '\\');

	std::ifstream file; // file stream object
	std::string filename; // filename

	// comment and escape for current file
	char comment_char;
	char escape_char;

	// current line and column for the scanner
	int line;

	// current line and position within it
	std::string line_;
	const char* pos_;

	private:
	// not defined (not copy constructible or assignable)
	ScannerContext (const ScannerContext&);
	void operator= (ScannerContext&);
	};

	/**************************************************************************/
	// helpers

	static void normal_path (std::string& s)
	{
	std::string::iterator it(s.begin ());

	for (; it != s.end (); it++)
	if (it == '/' \|\| it == '\\') {
	#if defined (_MSC_VER)
	*it = '\\';
	#else
	*it = '/';
	#endif
	}
	}

	/**************************************************************************/
	// ScannerContext class definitions

	ScannerContext::
	ScannerContext (const char* name, char cc, char ec)
	: file (name), filename (name),
	comment_char (cc), escape_char (ec),
	line (0) // , column (0)
	{
	// update current position
	pos_ = line_.c_str ();

	if (!file.is_open ())
	issue_diag (500, true, 0,
	"%s could not be opened for reading\n", name);

	issue_diag (I_OPENRD, false, 0, "reading %s\n", name);
	}

	/**************************************************************************/
	// Scanner class definitions

	Scanner::
	Scanner ()
	: context_ (0), nlines_ (0), ntokens_ (0), escaped_newline_ (false)
	{
	// no-op
	}


	Scanner::
	~Scanner()
	{
	// empty the stack and destroy the current state
	delete context_;

	while (!context_stack_.empty ()) {
	delete context_stack_.top ();
	context_stack_.pop ();
	}
	}


	char Scanner::
	escape_char () const
	{
	return context_ ? context_->escape_char : 0;
	}

	void Scanner::
	ignore_line ()
	{
	while (next_token ().token != tok_nl);
	}


	void Scanner::
	open (std::string name, char cc, char ec)
	{
	normal_path (name);

	if (context_)
	context_stack_.push (context_);

	try {
	context_ = new ScannerContext (name.c_str (), cc, ec);
	}
	catch (loc_exception&) {
	context_ = 0;

	if (!context_stack_.empty ()) {
	context_ = context_stack_.top ();
	context_stack_.pop ();
	}

	throw;
	}

	nlines_ = 0;
	ntokens_ = 0;
	}


	void Scanner::
	close ()
	{
	assert (0 != context_);

	issue_diag (I_OPENRD, false, 0,
	"%s: %u tokens, %u lines\n",
	context_->filename.c_str (), ntokens_, nlines_);

	delete context_;

	if (context_stack_.empty ())
	context_ = 0;
	else {
	context_ = context_stack_.top ();
	context_stack_.pop ();
	}
	}


	Scanner::token_id Scanner::
	process_token (const char* name)
	{
	assert (0 != name);

	if (*name == context_->escape_char) {
	switch (name [1]) {
	case '0': case '1': case '2': case '3':
	case '4': case '5': case '6': case '7':
	case 'd':
	case 'x':
	// escaped numeric character value
	return tok_char_value;
	default:
	break;
	}

	return tok_ndef;
	}

	// look for a predefined token

	static const struct {
	const char* name;
	Scanner::token_id token;
	} tok_map [] = {
	// elements must be sorted in ascending order
	{ "CHARMAP", tok_charmap },
	{ "END", tok_end },
	{ "IGNORE", tok_ignore },
	{ "LC_ADDRESS", tok_addr },
	{ "LC_COLLATE", tok_collate },
	{ "LC_CTYPE", tok_ctype },
	{ "LC_IDENTIFICATION", tok_ident },
	{ "LC_MEASUREMENT", tok_measure },
	{ "LC_MESSAGES", tok_messages },
	{ "LC_MONETARY", tok_monetary },
	{ "LC_NAME", tok_name },
	{ "LC_NUMERIC", tok_numeric },
	{ "LC_PAPER", tok_paper },
	{ "LC_TELEPHONE", tok_phone },
	{ "LC_TIME", tok_time },
	{ "UNDEFINED", tok_undefined },
	{ "WIDTH", tok_width },
	{ "abday", tok_abday },
	{ "abmon", tok_abmon },
	{ "alpha", tok_alpha },
	{ "alt_digits", tok_alt_digits },
	{ "am_pm", tok_am_pm },
	{ "backward", tok_backward },
	{ "blank", tok_blank },
	{ "cntrl", tok_cntrl },
	{ "collating-element", tok_coll_elem },
	{ "collating-symbol", tok_coll_sym },
	{ "comment_char", tok_comment_char },
	{ "copy", tok_copy },
	{ "currency_symbol", tok_currency_symbol },
	{ "d_fmt", tok_d_fmt },
	{ "d_t_fmt", tok_d_t_fmt },
	{ "day", tok_day },
	{ "decimal_point", tok_decimal_point },
	{ "digit", tok_digit },
	{ "era", tok_era },
	{ "era_d_fmt", tok_era_d_fmt },
	{ "era_d_t_fmt", tok_era_d_t_fmt },
	{ "era_t_fmt", tok_era_t_fmt },
	{ "escape_char", tok_escape_char },
	{ "falsename", tok_falsename },
	{ "forward", tok_forward },
	{ "frac_digits", tok_frac_digits },
	{ "from", tok_from },
	{ "graph", tok_graph },
	{ "grouping", tok_grouping },
	{ "include", tok_include },
	{ "int_curr_symbol", tok_int_curr_symbol },
	{ "int_frac_digits", tok_int_frac_digits },
	{ "int_n_cs_precedes", tok_int_n_cs_precedes },
	{ "int_n_sep_by_space", tok_int_n_sep_by_space },
	{ "int_n_sign_posn", tok_int_n_sign_posn },
	{ "int_p_cs_precedes", tok_int_p_cs_precedes },
	{ "int_p_sep_by_space", tok_int_p_sep_by_space },
	{ "int_p_sign_posn", tok_int_p_sign_posn },
	{ "lower", tok_lower },
	{ "mon", tok_mon },
	{ "mon_decimal_point", tok_mon_decimal_point },
	{ "mon_grouping", tok_mon_grouping },
	{ "mon_thousands_sep", tok_mon_thousands_sep },
	{ "n_cs_precedes", tok_n_cs_precedes },
	{ "n_sep_by_space", tok_n_sep_by_space },
	{ "n_sign_posn", tok_n_sign_posn },
	{ "negative_sign", tok_negative_sign },
	{ "noexpr", tok_noexpr },
	{ "order_end", tok_order_end },
	{ "order_start", tok_order_start },
	{ "p_cs_precedes", tok_p_cs_precedes },
	{ "p_sep_by_space", tok_p_sep_by_space },
	{ "p_sign_posn", tok_p_sign_posn },
	{ "position", tok_position },
	{ "positive_sign", tok_positive_sign },
	{ "print", tok_print },
	{ "punct", tok_punct },
	{ "reorder-after", tok_reorder },
	{ "reorder-end", tok_reorder_end },
	{ "reorder-section-after", tok_reorder_section },
	{ "reorder-section-end", tok_reorder_section_end },
	{ "script", tok_script },
	{ "space", tok_space },
	{ "t_fmt", tok_t_fmt },
	{ "t_fmt_ampm", tok_t_fmt_ampm },
	{ "thousands_sep", tok_thousands_sep },
	{ "tolower", tok_tolower },
	{ "toupper", tok_toupper },
	{ "translit_end", tok_xlit_end },
	{ "translit_start", tok_xlit_start },
	{ "truename", tok_truename },
	{ "upper", tok_upper },
	{ "xdigit", tok_xdigit },
	{ "yesexpr", tok_yesexpr }
	};

	int low = 0;
	int high = sizeof tok_map / sizeof *tok_map - 1;

	// this loop implements a binary search to find 'name' in the
	// tok_map list and when found returns the token value.
	while (low <= high) {

	const int cur = (low + high) / 2;

	const int cmp = std::strcmp (name, tok_map [cur].name);
	if (0 == cmp)
	return tok_map [cur].token;

	if (cmp < 0)
	high = cur - 1;
	else
	low = cur + 1;
	}

	return tok_ndef;
	}


	void Scanner::
	read_line ()
	{
	context_->line_.clear ();

	std::getline (context_->file, context_->line_);

	context_->line_ += '\n';

	context_->pos_ = context_->line_.c_str ();

	++context_->line;
	// context_->column = 0;

	++nlines_;

	assert (context_->line_.size ());
	}


	Scanner::token_t Scanner::
	next_token ()
	{
	assert (0 != context_);
	assert (context_->file.is_open ());

	// token
	token_t next_tok;

	next_tok.name = "";
	next_tok.token = tok_ndef;
	next_tok.line = 0;
	next_tok.column = 0;
	next_tok.file = 0;

	while (true) {

	// store the current file name
	next_tok.file = context_->filename.c_str ();

	// the assert above for eof checks if the caller has lost it;
	if (context_->file.eof ()) {
	next_tok.token = tok_end_tokens;
	return next_tok;
	}

	// if we exhausted the current line, advance
	if ( context_->line_.size ()
	<= std::size_t (context_->pos_ - context_->line_.c_str ())) {
	read_line ();
	}

	// line and column for the token start; they are set at each
	// iteration; the finding of a token breaks and next_tok leaves
	// this loop having the line/col info
	next_tok.line = context_->line;
	next_tok.column = context_->pos_ - context_->line_.c_str ();

	// plug in the pointer to current position
	const char*& next = context_->pos_;

	if (*next != context_->comment_char)
	escaped_newline_ = false;

	if (*next == '<') {
	// beginning of a symbolic name or keyword
	const char* tok_begin = next++;

	for (; '>' != *next; ++next) {

	// if has an escaped close angular, pass
	if (*next == context_->escape_char) {

	// append symbol name up to but not including the escape
	next_tok.name.append (tok_begin, next - tok_begin);

	// advance the next pointer to skip the escape
	tok_begin = ++next;
	}
	else if ('\n' == *next) {
	// past the end of the line
	issue_diag (E_SYNTAX, true, &next_tok,
	" unterminated symbolic name\n");
	break;
	}
	}

	next_tok.name.append (tok_begin, ++next - tok_begin);

	// check the name fetched so far
	if (next_tok.name == "<code_set_name>") {
	next_tok.token = tok_code_set_name;
	}
	else if ( next_tok.name == "<escape_char>"
	\|\| next_tok.name == "<comment_char>") {

	// eat away spaces
	while (' ' == next \|\| '\t' == next) {
	++next;
	}

	// test for end of line
	if (*next == '\n')
	issue_diag (E_SYNTAX, true, &next_tok,
	"missing value for %s\n",
	next_tok.name.c_str ());

	// store character
	if (next_tok.name == "<escape_char>")
	context_->escape_char = *next;
	else
	context_->comment_char = *next;

	// adjust positions;
	context_->pos_ =
	context_->line_.c_str () + context_->line_.size ();

	// set token to a newline
	next_tok.name = "";
	next_tok.token = tok_nl;
	}
	else if (next_tok.name == "<mb_cur_max>") {
	next_tok.token = tok_mb_cur_max;
	}
	else if (next_tok.name == "<mb_cur_min>") {
	next_tok.token = tok_mb_cur_min;
	}
	else {
	next_tok.token = tok_sym_name;
	}
	break;
	}
	else if (next == ' ' \|\| next == '\t' \|\| *next == ';') {
	// ignore whitespace and separators
	while (next == ' ' \|\| next == '\t' \|\| *next == ';') {
	++next;
	}
	}
	else if (*next == '\n') {
	++next;
	next_tok.token = tok_nl;
	break;
	}
	else if (*next == context_->comment_char) {
	// start of a comment - check as early as necessary
	// adjust to end of line
	context_->pos_ = context_->line_.c_str () + context_->line_.size ();

	if (escaped_newline_)
	continue;

	next_tok.token = tok_nl;
	next_tok.name = "\n";
	break;
	}
	else if (*next == '(') {
	// push open parenthesis
	next_tok.name.push_back (*next++);

	// start of a grouping
	while (*next != ')') {
	// contains a symbolic name
	if (*next == '<') {
	// push open angular parenthesis
	next_tok.name.push_back (*next++);

	while (*next != '\n') {
	// if has an escaped close angular, pass
	if (next [0] == context_->escape_char) {
	next_tok.name.push_back (*next++);
	next_tok.name.push_back (*next++);
	continue;
	}

	// if we have reached the end of the sym name
	if (*next == '>') {
	next_tok.name.push_back (*next);
	break;
	}

	// still inside the sym name/keyword
	next_tok.name.push_back (*next++);
	}

	// check if we have gone past the end of the line
	if (*next == '\n')
	issue_diag (E_SYNTAX, true, &next_tok,
	" unterminated symbolic name");

	++next;
	}
	else {
	// fetch the character
	next_tok.name.push_back (*next++);
	}

	if (*next == '\n')
	issue_diag (E_SYNTAX, true, &next_tok,
	" unterminated grouping ");
	}

	next_tok.name.push_back (*next++);
	next_tok.token = tok_grouping;
	break;
	}
	else if (*next == '.') {
	// ellipsis (see ISO/IEC TR 14652)
	int ellipsis_count = 0;
	// start of an interval
	while (*next == '.') {
	next_tok.name.push_back (*next++);
	++ellipsis_count;
	}

	switch (ellipsis_count) {
	case 2: {
	const char* tmp = next;
	if (tmp++ == '(' && tmp++ == '2' && *tmp++ == ')'
	&& tmp++ == '.' && tmp++ == '.') {
	// double increment hexadecimal symbolic ellipsis
	next_tok.token = tok_dbl_ellipsis;
	next = tmp;
	}
	else {
	// hexadecimal symbolic ellipsis
	next_tok.token = tok_hex_ellipsis;
	}
	break;
	}

	case 3:
	// absolute symbolic ellipsis
	next_tok.token = tok_abs_ellipsis;
	break;

	case 4:
	// decimal symbolic ellipsis
	next_tok.token = tok_dec_ellipsis;
	break;

	default:
	issue_diag (E_SYNTAX, true, &next_tok, "illegal ellipsis\n");
	}
	break;

	}
	else if (*next == '\"') {

	// start of a string
	next_tok.name.push_back (*next++);
	const char ec = context_->escape_char;

	while (next[0] != '\n') {

	// escaped newline; continue
	if (next [0] == ec && next [1] == '\n') {
	read_line ();
	continue;
	}

	// escaped quote
	if (next[0] == ec) {
	next_tok.name.push_back (*next++);
	next_tok.name.push_back (*next++);
	continue;
	}

	if (next [0] == '\"') {
	next_tok.name.push_back (*next);
	break;
	}

	// still inside the string
	next_tok.name.push_back (*next++);
	}

	// test for closure
	if (*next == '\n')
	issue_diag (E_SYNTAX, true, &next_tok, "unterminated string");

	++next;
	next_tok.token = tok_string;
	break;

	}
	else if (*next == context_->escape_char) {
	// start of an escape sequence
	// escaped new line
	if (next [1] == '\n') {
	// adjust to end of line
	context_->pos_ =
	context_->line_.c_str () + context_->line_.size ();

	escaped_newline_ = true;
	continue;
	}

	// or
	while ( next != ' ' && next != '\t'
	&& next != ';' && next != '\n') {
	next_tok.name.push_back (*next++);
	}

	// retrieve token based on value
	next_tok.token = process_token (next_tok.name.c_str ());
	break;
	}
	else {
	// the rest of it
	for (const char ec = context_->escape_char; ; ) {

	// stop at esc-newline or at first "separator"
	if ( (next [0] == ec && next [1] == '\n')
	\|\| next [0] == ' '
	\|\| next [0] == '\t'
	\|\| next [0] == '\n'
	\|\| next [0] == ';') {
	// continuation of a line, separators
	break;
	}

	// fetch characters
	next_tok.name.push_back (*next++);
	}

	// assert length of input
	assert (next_tok.name.size ());

	// it wasn't a locale definition keyword so call process_token
	// and add the result to the list
	next_tok.token = process_token (next_tok.name.c_str ());


	// an extra bit of processing since we keep comment and escape
	// characters in the scanner for a faster processing
	if ( next_tok.token == tok_escape_char
	\|\| next_tok.token == tok_comment_char) {

	// eat away spaces
	while (' ' == next \|\| '\t' == next) {
	++next;
	}

	// test for end of line
	if (*next == '\n')
	issue_diag (E_SYNTAX, true, &next_tok,
	"unterminated statement");

	// store character
	if (next_tok.token == tok_escape_char)
	context_->escape_char = next [0];
	else
	context_->comment_char = next [0];

	// adjust positions;
	context_->pos_ =
	context_->line_.c_str () + context_->line_.size ();

	// return the token
	next_tok.name = "";
	next_tok.token = tok_nl;
	}

	break;
	}
	}

	++ntokens_;

	return next_tok;
	}


	unsigned long Scanner::
	convert_escape (const char *esc,
	const char *pend / = 0 */,
	bool multi /* = false */) const
	{
	assert (0 != esc);

	const char escape = escape_char ();

	if (escape != *esc)
	issue_diag (E_SYNTAX, true, 0,
	"expected the escape character ('%c'), got \"%s\"\n",
	escape, esc);

	unsigned long value = 0;

	for (const char *s = esc; ; ) {

	// escaped characters are octal by default
	const char *basename = "octal";
	int base = 8;

	switch (*++s) {
	case 'd': ++s; base = 10; basename = "decimal"; break;
	case 'x': ++s; base = 16; basename = "hexadecimal"; break;

	case 'o': ++s;
	case '0': case '1': case '2': case '3':
	case '4': case '5': case '6': case '7':
	break;

	default:
	issue_diag (E_SYNTAX, true, 0,
	"one of { 'o', 'd', 'x' } expected following "
	"the escape character: %s\n", esc);
	}

	char *end = 0;

	const unsigned long byte = std::strtoul (s, &end, base);

	if (pend)
	*pend = end;

	// cast away constness below to work around an MSVC 7.0 bug:
	// causing error C2446: '==' : no conversion from 'char ** '
	// to 'const char ** ' Conversion loses qualifiers
	if (!multi && _RWSTD_CONST_CAST (char, pend) == &end && pend)
	issue_diag (E_SYNTAX, true, 0,
	"%s constant expected: %s\n", basename, esc);

	if (UCHAR_MAX < byte)
	issue_diag (E_INVAL, true, 0,
	"%s byte value must be in the range [0, %d]: %s\n",
	basename, int (UCHAR_MAX), esc);

	if (value >> (sizeof (unsigned long) - 1) * CHAR_BIT)
	issue_diag (E_INVAL, true, 0, "integer overflow: %s\n", esc);

	value = (value << CHAR_BIT) \| byte;

	if (**pend != escape \|\| !multi)
	break;

	s = *pend;
	}

	return value;
	}