node_modules/uncrustify/src/tokenize.cpp - cordova-osx - Git at Google

 /**
  * @file tokenize.cpp
  * This file breaks up the text stream into tokens or chunks.
  *
  * Each routine needs to set pc.len and pc.type.
  *
  * @author  Ben Gardner
  * @license GPL v2+
  */
 #include "uncrustify_types.h"
 #include "char_table.h"
 #include "prototypes.h"
 #include "chunk_list.h"
 #include <cstdio>
 #include <cstdlib>
 #include <cstring>
 #include <cerrno>
 #include "unc_ctype.h"

 /**
  * Snapshot of the tokenizer's read position in the input.
  * A freshly constructed snapshot points at the first character:
  * index 0, row 1, column 1, with no previous character.
  */
 struct tok_info
 {
    tok_info()
    {
       last_ch = 0;
       idx     = 0;
       row     = 1;
       col     = 1;
    }

    int last_ch;   /* last character handed out by tok_ctx::get() */
    int idx;       /* index of the next character to read */
    int row;       /* current row, starts at 1 */
    int col;       /* current column, starts at 1 */
 };

 /**
  * Read cursor over the input text (a deque of character codes).
  *
  * Holds the current position (c) plus one saved position (s) so a parser
  * can attempt something and roll back if it fails.  End of input is
  * reported as -1 by peek() and get().
  */
 struct tok_ctx
 {
    tok_ctx(const deque<int>& d) : data(d)
    {
    }

    /* save before trying to parse something that may fail */
    void save()
    {
       save(s);
    }

    /* copy the current position into a caller-owned snapshot */
    void save(tok_info& info)
    {
       info = c;
    }

    /* restore previous saved state */
    void restore()
    {
       restore(s);
    }

    /* rewind to a caller-owned snapshot */
    void restore(const tok_info& info)
    {
       c = info;
    }

    /* true while there is at least one unread character */
    bool more() const
    {
       return(c.idx < (int)data.size());
    }

    /* next character without consuming it, or -1 at end of input */
    int peek() const
    {
       return(more() ? data[c.idx] : -1);
    }

    /**
     * Character at offset idx from the current position, without consuming.
     * Returns -1 when the resulting index lies outside the data, on either
     * side (a negative offset must not read before the start of the deque).
     */
    int peek(int idx) const
    {
       idx += c.idx;
       if ((idx < 0) || (idx >= (int)data.size()))
       {
          return -1;
       }
       return data[idx];
    }

    /**
     * Consumes and returns the next character, or -1 at end of input.
     * Updates row/col as it goes: a tab advances to the next tab stop,
     * CR starts a new row, and LF starts a new row unless it directly
     * follows a CR (so CRLF counts as a single row).
     */
    int get()
    {
       if (more())
       {
          int ch = data[c.idx++];
          switch (ch)
          {
          case '\t':
             c.col = calc_next_tab_column(c.col, cpd.settings[UO_input_tab_size].n);
             break;

          case '\n':
             if (c.last_ch != '\r')
             {
                c.row++;
                c.col = 1;
             }
             break;

          case '\r':
             c.row++;
             c.col = 1;
             break;

          default:
             c.col++;
             break;
          }
          c.last_ch = ch;
          return ch;
       }
       return -1;
    }

    /* consume the next character only if it equals ch; reports whether it did */
    bool expect(int ch)
    {
       if (peek() == ch)
       {
          get();
          return true;
       }
       return false;
    }

    const deque<int>& data;
    tok_info          c; /* current */
    tok_info          s; /* saved */
 };

 static bool parse_string(tok_ctx& ctx, chunk_t& pc, int quote_idx, bool allow_escape);


 /**
  * Parses all legal D string constants.
  *
  * Quoted strings:
  *   r"Wysiwyg"      # WYSIWYG string
  *   x"hexstring"    # Hexadecimal array
  *   `Wysiwyg`       # WYSIWYG string
  *   'char'          # single character
  *   "reg_string"    # regular string
  *
  * Non-quoted strings:
  * \x12              # 1-byte hex constant
  * \u1234            # 2-byte hex constant
  * \U12345678        # 4-byte hex constant
  * \123              # octal constant
  * \&amp;            # named entity
  * \n                # single character
  *
  * Consecutive backslash escapes are collected into a single chunk.  The
  * digits of the hex forms are not validated; a fixed number of characters
  * is taken.  Nothing is read past the end of the input: a truncated escape
  * keeps whatever characters were available, and a lone backslash at end
  * of input is not a string.
  *
  * @param ctx  The input cursor; rewound if no string was parsed
  * @param pc   The chunk to update (str and type)
  * @return     Whether a string was parsed
  */
 static bool d_parse_string(tok_ctx& ctx, chunk_t& pc)
 {
    int ch = ctx.peek();

    if ((ch == '"') || (ch == '\'') || (ch == '`'))
    {
       return(parse_string(ctx, pc, 0, true));
    }
    else if (ch == '\\')
    {
       ctx.save();
       int cnt;
       pc.str.clear();
       while (ctx.peek() == '\\')
       {
          pc.str.append(ctx.get());
          switch (ctx.peek())
          {
          case 'x':
             /* \x HexDigit HexDigit */
             cnt = 3;
             while ((cnt-- > 0) && ctx.more())
             {
                pc.str.append(ctx.get());
             }
             break;

          case 'u':
             /* \u HexDigit HexDigit HexDigit HexDigit */
             cnt = 5;
             while ((cnt-- > 0) && ctx.more())
             {
                pc.str.append(ctx.get());
             }
             break;

          case 'U':
             /* \U HexDigit (x8) */
             cnt = 9;
             while ((cnt-- > 0) && ctx.more())
             {
                pc.str.append(ctx.get());
             }
             break;

          case '0':
          case '1':
          case '2':
          case '3':
          case '4':
          case '5':
          case '6':
          case '7':
             /* handle up to 3 octal digits */
             pc.str.append(ctx.get());
             ch = ctx.peek();
             if ((ch >= '0') && (ch <= '7'))
             {
                pc.str.append(ctx.get());
                ch = ctx.peek();
                if ((ch >= '0') && (ch <= '7'))
                {
                   pc.str.append(ctx.get());
                }
             }
             break;

          case '&':
             /* \& NamedCharacterEntity ; */
             pc.str.append(ctx.get());
             while (unc_isalpha(ctx.peek()))
             {
                pc.str.append(ctx.get());
             }
             if (ctx.peek() == ';')
             {
                pc.str.append(ctx.get());
             }
             break;

          default:
             /* Everything else is a single character; at end of input
              * there is nothing to take, so the backslash stands alone.
              */
             if (ctx.more())
             {
                pc.str.append(ctx.get());
             }
             break;
          }
       }

       /* A bare backslash (nothing after it) is not a string */
       if (pc.str.size() > 1)
       {
          pc.type = CT_STRING;
          return(true);
       }
       ctx.restore();
    }
    else if (((ch == 'r') || (ch == 'x')) && (ctx.peek(1) == '"'))
    {
       /* one-character prefix before the quote; escapes are not honoured */
       return(parse_string(ctx, pc, 1, false));
    }
    return(false);
 }


 // /**
 //  * A string-in-string search.  Like strstr() with a haystack length.
 //  */
 // static const char *str_search(const char *needle, const char *haystack, int haystack_len)
 // {
 //    int needle_len = strlen(needle);
 //
 //    while (haystack_len-- >= needle_len)
 //    {
 //       if (memcmp(needle, haystack, needle_len) == 0)
 //       {
 //          return(haystack);
 //       }
 //       haystack++;
 //    }
 //    return(NULL);
 // }


 /**
  * Figure out the length of the comment at text.
  * The next bit of text starts with a '/', so it might be a comment.
  * There are three types of comments:
  *  - C comments that start with  '/ *' and end with '* /'
  *  - C++ comments that start with //
  *  - D nestable comments '/+' '+/' (only recognized when LANG_D is set)
  *
  * On success pc.str holds the comment text and pc.type is CT_COMMENT_CPP,
  * CT_COMMENT or CT_COMMENT_MULTI (a block comment containing a newline).
  * The finished comment is then searched for the enable/disable processing
  * marker text and cpd.unc_off is toggled when the relevant one is found.
  *
  * @param ctx  The input cursor
  * @param pc   The structure to update, str is an input.
  * @return     Whether a comment was parsed
  */
 static bool parse_comment(tok_ctx& ctx, chunk_t& pc)
 {
    int  ch;
    bool is_d    = (cpd.lang_flags & LANG_D) != 0;
    bool is_cs   = (cpd.lang_flags & LANG_CS) != 0;
    int  d_level = 0;   /* nesting depth of D '/ +' comments */
    int  bs_cnt;        /* run of backslashes directly before a line end */

    /* does this start with '/ /' or '/ *' or '/ +' (d) */
    if ((ctx.peek() != '/') ||
        ((ctx.peek(1) != '*') && (ctx.peek(1) != '/') &&
         ((ctx.peek(1) != '+') || !is_d)))
    {
       return(false);
    }

    ctx.save();

    /* account for opening two chars */
    pc.str = ctx.get();   /* opening '/' */
    ch = ctx.get();
    pc.str.append(ch);    /* second char */

    if (ch == '/')
    {
       /* C++ comment: runs to the end of the line, possibly continued */
       pc.type = CT_COMMENT_CPP;
       while (true)
       {
          bs_cnt = 0;
          while (ctx.more())
          {
             ch = ctx.peek();
             if ((ch == '\r') || (ch == '\n'))
             {
                break;
             }
             if (ch == '\\' && !is_cs) /* backslashes aren't special in comments in C# */
             {
                bs_cnt++;
             }
             else
             {
                bs_cnt = 0;
             }
             pc.str.append(ctx.get());
          }

          /* If we hit an odd number of backslashes right before the newline,
           * then we keep going.
           */
          if (((bs_cnt & 1) == 0) || !ctx.more())
          {
             break;
          }
          /* swallow the escaped line ending (CR, LF or CRLF) into the comment */
          if (ctx.peek() == '\r')
          {
             pc.str.append(ctx.get());
          }
          if (ctx.peek() == '\n')
          {
             pc.str.append(ctx.get());
          }
          pc.nl_count++;
          cpd.did_newline = true;
       }
    }
    else if (!ctx.more())
    {
       /* unexpected end of file right after the two-character opener */
       ctx.restore();
       return(false);
    }
    else if (ch == '+')
    {
       /* D nestable comment: track the depth until it drops back to zero */
       pc.type = CT_COMMENT;
       d_level++;
       while ((d_level > 0) && ctx.more())
       {
          if ((ctx.peek() == '+') && (ctx.peek(1) == '/'))
          {
             pc.str.append(ctx.get());  /* store the '+' */
             pc.str.append(ctx.get());  /* store the '/' */
             d_level--;
             continue;
          }

          if ((ctx.peek() == '/') && (ctx.peek(1) == '+'))
          {
             pc.str.append(ctx.get());  /* store the '/' */
             pc.str.append(ctx.get());  /* store the '+' */
             d_level++;
             continue;
          }

          ch = ctx.get();
          pc.str.append(ch);
          if ((ch == '\n') || (ch == '\r'))
          {
             /* a newline inside the comment makes it a multi-line comment */
             pc.type = CT_COMMENT_MULTI;
             pc.nl_count++;

             /* tally the line-ending style; CRLF counts as one newline */
             if (ch == '\r')
             {
                if (ctx.peek() == '\n')
                {
                   cpd.le_counts[LE_CRLF]++;
                   pc.str.append(ctx.get());  /* store the '\n' */
                }
                else
                {
                   cpd.le_counts[LE_CR]++;
                }
             }
             else
             {
                cpd.le_counts[LE_LF]++;
             }
          }
       }
    }
    else  /* must be '/ *' */
    {
       pc.type = CT_COMMENT;
       while (ctx.more())
       {
          if ((ctx.peek() == '*') && (ctx.peek(1) == '/'))
          {
             pc.str.append(ctx.get());  /* store the '*' */
             pc.str.append(ctx.get());  /* store the '/' */

             /* remember where the comment ended in case the join fails */
             tok_info ss;
             ctx.save(ss);
             int oldsize = pc.str.size();

             /* If there is another C comment right after this one, combine them */
             while ((ctx.peek() == ' ') || (ctx.peek() == '\t'))
             {
                pc.str.append(ctx.get());
             }
             if ((ctx.peek() != '/') || (ctx.peek(1) != '*'))
             {
                /* undo the attempt to join */
                ctx.restore(ss);
                pc.str.resize(oldsize);
                break;
             }
             /* otherwise fall through and keep scanning the next comment */
          }

          ch = ctx.get();
          pc.str.append(ch);
          if ((ch == '\n') || (ch == '\r'))
          {
             /* a newline inside the comment makes it a multi-line comment */
             pc.type = CT_COMMENT_MULTI;
             pc.nl_count++;

             /* tally the line-ending style; CRLF counts as one newline */
             if (ch == '\r')
             {
                if (ctx.peek() == '\n')
                {
                   cpd.le_counts[LE_CRLF]++;
                   pc.str.append(ctx.get());  /* store the '\n' */
                }
                else
                {
                   cpd.le_counts[LE_CR]++;
                }
             }
             else
             {
                cpd.le_counts[LE_LF]++;
             }
          }
       }
    }

    if (cpd.unc_off)
    {
       /* processing is off: look for the marker that turns it back on */
       const char* ontext = cpd.settings[UO_enable_processing_cmt].str;
       if (ontext == NULL)
       {
          ontext = UNCRUSTIFY_ON_TEXT;
       }

       if (pc.str.find(ontext) >= 0)
       {
          LOG_FMT(LBCTRL, "Found '%s' on line %d\n", ontext, pc.orig_line);
          cpd.unc_off = false;
       }
    }
    else
    {
       /* processing is on: look for the marker that turns it off */
       const char* offtext = cpd.settings[UO_disable_processing_cmt].str;
       if (offtext == NULL)
       {
          offtext = UNCRUSTIFY_OFF_TEXT;
       }

       if (pc.str.find(offtext) >= 0)
       {
          LOG_FMT(LBCTRL, "Found '%s' on line %d\n", offtext, pc.orig_line);
          cpd.unc_off = true;
       }
    }
    return(true);
 }

 /**
  * Figure out the length of the code placeholder at text, if present.
  * Xcode sometimes inserts temporary placeholder chunks, which in plain
  * text <#look like this#>.
  *
  * The placeholder runs from the opening '<#' through the first following
  * '#>' and is reported as a CT_WORD.  If no '#>' is found before the end
  * of the input, the cursor is rewound and nothing is parsed.
  *
  * @param ctx  The input cursor
  * @param pc   The structure to update, str is an input.
  * @return     Whether a placeholder was parsed.
  */
 static bool parse_code_placeholder(tok_ctx& ctx, chunk_t& pc)
 {
    /* must open with the two characters '<#' */
    if (!((ctx.peek() == '<') && (ctx.peek(1) == '#')))
    {
       return(false);
    }

    ctx.save();

    /* take the '<' and the '#' */
    pc.str = ctx.get();
    pc.str.append(ctx.get());

    /* scan for the closing '#>'; the opener's '#' does not count as its start */
    int prev = 0;
    while (ctx.more())
    {
       int cur = ctx.get();
       pc.str.append(cur);

       if ((cur == '>') && (prev == '#'))
       {
          pc.type = CT_WORD;
          return(true);
       }
       prev = cur;
    }

    /* ran off the end without a terminator */
    ctx.restore();
    return(false);
 }


 /**
  * Parse any attached suffix, which may be a user-defined literal suffix.
  * The suffix is every keyword character directly following the literal
  * and is appended to pc.str.
  *
  * When called for a string (forstring), two cases leave the text alone:
  *  - what follows is the start of another literal: L" or L' or S"
  *  - the suffix is four or more characters and begins with PRI or SCN,
  *    i.e. a format or scan specifier macro such as PRIx32 or SCNx64
  *
  * @param ctx        The input cursor
  * @param pc         The chunk whose str receives the suffix
  * @param forstring  Whether the literal is a string
  */
 static void parse_suffix(tok_ctx& ctx, chunk_t& pc, bool forstring = false)
 {
    if (!CharTable::IsKw1(ctx.peek()))
    {
       return;
    }

    if (forstring)
    {
       /* don't add the suffix if we see L" or L' or S" */
       int  first     = ctx.peek();
       int  second    = ctx.peek(1);
       bool wide_lit  = (first == 'L') && ((second == '"') || (second == '\''));
       bool s_literal = (first == 'S') && (second == '"');

       if (wide_lit || s_literal)
       {
           return;
       }
    }

    /* remember where we started so the suffix can be rejected afterwards */
    int      base_size = pc.str.size();
    tok_info start;
    ctx.save(start);

    int suffix_len = 0;
    while (ctx.more() && CharTable::IsKw2(ctx.peek()))
    {
       pc.str.append(ctx.get());
       suffix_len++;
    }

    if (!forstring || (suffix_len < 4))
    {
       return;
    }

    if (pc.str.startswith("PRI", base_size) ||
        pc.str.startswith("SCN", base_size))
    {
       /* a format/scan macro, not a suffix: put it back */
       ctx.restore(start);
       pc.str.resize(base_size);
    }
 }


 /* True when ch is a binary digit ('0' or '1'). */
 static bool is_bin(int ch)
 {
    switch (ch)
    {
    case '0':
    case '1':
       return(true);

    default:
       return(false);
    }
 }

 /* True when ch is a binary digit or the '_' digit separator. */
 static bool is_bin_(int ch)
 {
    if (ch == '_')
    {
       return(true);
    }
    return((ch == '0') || (ch == '1'));
 }

 /* True when ch is an octal digit ('0' through '7'). */
 static bool is_oct(int ch)
 {
    if (ch < '0')
    {
       return(false);
    }
    return(ch <= '7');
 }

 /* True when ch is an octal digit or the '_' digit separator. */
 static bool is_oct_(int ch)
 {
    if (ch == '_')
    {
       return(true);
    }
    return((ch >= '0') && (ch <= '7'));
 }

 /* True when ch is a decimal digit ('0' through '9'). */
 static bool is_dec(int ch)
 {
    if (ch < '0')
    {
       return(false);
    }
    return(ch <= '9');
 }

 /* True when ch is a decimal digit or the '_' digit separator. */
 static bool is_dec_(int ch)
 {
    if (ch == '_')
    {
       return(true);
    }
    return((ch >= '0') && (ch <= '9'));
 }

 /* True when ch is a hexadecimal digit: 0-9, a-f or A-F. */
 static bool is_hex(int ch)
 {
    if ((ch >= '0') && (ch <= '9'))
    {
       return(true);
    }
    if ((ch >= 'a') && (ch <= 'f'))
    {
       return(true);
    }
    return((ch >= 'A') && (ch <= 'F'));
 }

 /* True when ch is a hexadecimal digit or the '_' digit separator. */
 static bool is_hex_(int ch)
 {
    if (ch == '_')
    {
       return(true);
    }
    if ((ch >= '0') && (ch <= '9'))
    {
       return(true);
    }
    if ((ch >= 'a') && (ch <= 'f'))
    {
       return(true);
    }
    return((ch >= 'A') && (ch <= 'F'));
 }


 /**
  * Count the number of characters in the number.
  * The next bit of text starts with a number (0-9 or '.'), so it is a number.
  *
  * This should cover all number formats for all languages.
  * Note that this is not a strict parser. It will happily parse numbers in
  * an invalid format.
  *
  * For example, only D allows underscores in the numbers, but they are
  * allowed in all formats.
  *
  * The scan proceeds in order: radix prefix and integer digits, optional
  * fraction, optional exponent, type suffix letters, an optional '64', and
  * finally any trailing identifier characters (via parse_suffix).  The
  * characters are appended to pc.str, which is not cleared here.
  *
  * @param ctx  The input cursor
  * @param pc   The structure to update, str is an input.
  * @return     Whether a number was parsed
  */
 static bool parse_number(tok_ctx& ctx, chunk_t& pc)
 {
    int  tmp;
    bool is_float;
    bool did_hex = false;   /* set for a 0x prefix so the fraction accepts hex digits */

    /* A number must start with a digit or a dot, followed by a digit */
    if (!is_dec(ctx.peek()) &&
        ((ctx.peek() != '.') || !is_dec(ctx.peek(1))))
    {
       return(false);
    }

    /* NOTE(review): given the guard above, a leading '.' is always followed
     * by a digit at this point, so the '..' rejection below looks unreachable.
     */
    is_float = (ctx.peek() == '.');
    if (is_float && (ctx.peek(1) == '.'))
    {
       return(false);
    }

    /* Check for Hex, Octal, or Binary
     * Note that only D and Pawn support binary, but who cares?
     */
    if (ctx.peek() == '0')
    {
       pc.str.append(ctx.get());  /* store the '0' */

       switch (unc_toupper(ctx.peek()))
       {
       case 'X':               /* hex */
          did_hex = true;
          do
          {
             pc.str.append(ctx.get());  /* store the 'x' and then the rest */
          } while (is_hex_(ctx.peek()));
          break;

       case 'B':               /* binary */
          do
          {
             pc.str.append(ctx.get());  /* store the 'b' and then the rest */
          } while (is_bin_(ctx.peek()));
          break;

       case '0':                /* octal or decimal */
       case '1':
       case '2':
       case '3':
       case '4':
       case '5':
       case '6':
       case '7':
       case '8':
       case '9':
          /* the first digit is taken as-is; only octal digits continue the run */
          do
          {
             pc.str.append(ctx.get());
          } while (is_oct_(ctx.peek()));
          break;

       default:
          /* either just 0 or 0.1 or 0UL, etc */
          break;
       }
    }
    else
    {
       /* Regular int or float */
       while (is_dec_(ctx.peek()))
       {
          pc.str.append(ctx.get());
       }
    }

    /* Check if we stopped on a decimal point & make sure it isn't '..' */
    if ((ctx.peek() == '.') && (ctx.peek(1) != '.'))
    {
       pc.str.append(ctx.get());
       is_float = true;
       if (did_hex)
       {
          while (is_hex_(ctx.peek()))
          {
             pc.str.append(ctx.get());
          }
       }
       else
       {
          while (is_dec_(ctx.peek()))
          {
             pc.str.append(ctx.get());
          }
       }
    }

    /* Check exponent
     * Valid exponents per language (not that it matters):
     * C/C++/D/Java: eEpP
     * C#/Pawn:      eE
     */
    tmp = unc_toupper(ctx.peek());
    if ((tmp == 'E') || (tmp == 'P'))
    {
       is_float = true;
       pc.str.append(ctx.get());
       /* optional sign, then the exponent digits */
       if ((ctx.peek() == '+') || (ctx.peek() == '-'))
       {
          pc.str.append(ctx.get());
       }
       while (is_dec_(ctx.peek()))
       {
          pc.str.append(ctx.get());
       }
    }

    /* Check the suffixes
     * Valid suffixes per language (not that it matters):
     *        Integer       Float
     * C/C++: uUlL64        lLfF
     * C#:    uUlL          fFdDMm
     * D:     uUL           ifFL
     * Java:  lL            fFdD
     * Pawn:  (none)        (none)
     *
     * Note that i, f, d, and m only appear in floats.
     */
    while (1)
    {
       tmp = unc_toupper(ctx.peek());
       if ((tmp == 'I') || (tmp == 'F') || (tmp == 'D') || (tmp == 'M'))
       {
          is_float = true;
       }
       else if ((tmp != 'L') && (tmp != 'U'))
       {
          /* not a suffix letter: done */
          break;
       }
       pc.str.append(ctx.get());
    }

    /* skip the Microsoft-specific '64' suffix */
    if ((ctx.peek() == '6') && (ctx.peek(1) == '4'))
    {
       pc.str.append(ctx.get());
       pc.str.append(ctx.get());
    }

    pc.type = is_float ? CT_NUMBER_FP : CT_NUMBER;

    /* If there is anything left, then we are probably dealing with garbage or
     * some sick macro junk. Eat it.
     */
    parse_suffix(ctx, pc);

    return(true);
 }


 /**
  * Count the number of characters in a quoted string.
  * The next bit of text starts with a quote char " or ' or <, optionally
  * preceded by quote_idx prefix characters.
  * Count the number of characters until the matching character.
  *
  * A line ending inside the string (LF, CR or CRLF) counts as one newline,
  * turns the chunk into CT_STRING_MULTI and cancels a pending escape.
  * If the input ends before the closing character, whatever was read is
  * still reported as a string.
  *
  * @param ctx          The input cursor
  * @param pc           The structure to update, str is an input.
  * @param quote_idx    Number of prefix characters before the opening quote
  * @param allow_escape Whether the secondary escape character
  *                     (UO_string_escape_char2) may escape the closing quote
  * @return             Whether a string was parsed (always true)
  */
 static bool parse_string(tok_ctx& ctx, chunk_t& pc, int quote_idx, bool allow_escape)
 {
    bool escaped = false;
    int  end_ch;
    char escape_char  = cpd.settings[UO_string_escape_char].n;
    char escape_char2 = cpd.settings[UO_string_escape_char2].n;
    bool should_escape_tabs = cpd.settings[UO_string_replace_tab_chars].b && (cpd.lang_flags & LANG_ALLC);

    /* copy the prefix characters that precede the opening quote */
    pc.str.clear();
    while (quote_idx-- > 0)
    {
       pc.str.append(ctx.get());
    }

    pc.type = CT_STRING;
    /* the closing character is looked up from the opening one */
    end_ch  = CharTable::Get(ctx.peek()) & 0xff;
    pc.str.append(ctx.get());  /* store the " */

    while (ctx.more())
    {
       int lastcol = ctx.c.col;
       int ch = ctx.get();

       if ((ch == '\t') && should_escape_tabs)
       {
          /* replace the raw tab with an escape sequence, which is two columns wide */
          ctx.c.col = lastcol + 2;
          pc.str.append(escape_char);
          pc.str.append('t');
          continue;
       }

       pc.str.append(ch);
       if ((ch == '\n') || (ch == '\r'))
       {
          /* Keep CRLF together as a single newline.  A lone CR must not
           * swallow the character after it: that character may be the
           * closing quote or an escape and has to be examined normally.
           */
          if ((ch == '\r') && (ctx.peek() == '\n'))
          {
             pc.str.append(ctx.get());
          }
          pc.nl_count++;
          pc.type = CT_STRING_MULTI;
          escaped = false;
          continue;
       }
       if (!escaped)
       {
          if (ch == escape_char)
          {
             /* an escape character of 0 means escapes are disabled */
             escaped = (escape_char != 0);
          }
          else if ((ch == escape_char2) && (ctx.peek() == end_ch))
          {
             escaped = allow_escape;
          }
          else if (ch == end_ch)
          {
             break;
          }
       }
       else
       {
          /* this character was escaped; the next one is ordinary again */
          escaped = false;
       }
    }

    parse_suffix(ctx, pc, true);
    return(true);
 }


 /**
  * C# literal string (@"..."), ends with a single "
  * Two "" don't end the string.
  *
  * A line ending inside the string (LF, CR or CRLF) counts as exactly one
  * newline and turns the chunk into CT_STRING_MULTI.
  *
  * @param ctx  The input cursor, positioned on the '@'
  * @param pc   The structure to update, str is an input.
  * @return     Whether a string was parsed (always true)
  */
 static bool parse_cs_string(tok_ctx& ctx, chunk_t& pc)
 {
    /* take the two opening characters */
    pc.str = ctx.get();
    pc.str.append(ctx.get());
    pc.type = CT_STRING;

    /* go until we hit the end of the input or a single " */
    while (ctx.more())
    {
       int ch = ctx.get();
       pc.str.append(ch);
       if ((ch == '\n') || (ch == '\r'))
       {
          /* keep CRLF together so it is counted as one newline, not two */
          if ((ch == '\r') && (ctx.peek() == '\n'))
          {
             pc.str.append(ctx.get());
          }
          pc.type = CT_STRING_MULTI;
          pc.nl_count++;
       }
       else if (ch == '"')
       {
          if (ctx.peek() == '"')
          {
             /* doubled quote: an embedded ", not the end */
             pc.str.append(ctx.get());
          }
          else
          {
             break;
          }
       }
    }

    return(true);
 }


 /**
  * VALA verbatim string, ends with three quotes (""")
  *
  * A line ending inside the string (LF, CR or CRLF) counts as exactly one
  * newline and turns the chunk into CT_STRING_MULTI.  If the input ends
  * before the closing quotes, whatever was read is kept.
  *
  * @param ctx  The input cursor, positioned on the opening """
  * @param pc   The structure to update, str is an input.
  */
 static void parse_verbatim_string(tok_ctx& ctx, chunk_t& pc)
 {
    pc.type = CT_STRING;

    // consume the initial """
    pc.str = ctx.get();
    pc.str.append(ctx.get());
    pc.str.append(ctx.get());

    /* go until we hit the end of the input or a """ */
    while (ctx.more())
    {
       int ch = ctx.get();
       pc.str.append(ch);
       if ((ch == '"') &&
           (ctx.peek() == '"') &&
           (ctx.peek(1) == '"'))
       {
          /* take the remaining two quotes of the terminator */
          pc.str.append(ctx.get());
          pc.str.append(ctx.get());
          break;
       }
       if ((ch == '\n') || (ch == '\r'))
       {
          /* keep CRLF together so it is counted as one newline, not two */
          if ((ch == '\r') && (ctx.peek() == '\n'))
          {
             pc.str.append(ctx.get());
          }
          pc.type = CT_STRING_MULTI;
          pc.nl_count++;
       }
    }
 }


 /**
  * Compares two runs of len elements inside the same deque.
  *
  * @param d      The data holding both runs
  * @param a_idx  Index of the first element of the first run
  * @param b_idx  Index of the first element of the second run
  * @param len    Number of elements to compare
  * @return       true if the runs are element-wise equal (trivially so when
  *               both start at the same index or len is not positive);
  *               false on the first difference or if either run would fall
  *               outside the deque
  */
 static bool tag_compare(const deque<int>& d, int a_idx, int b_idx, int len)
 {
    if ((a_idx == b_idx) || (len <= 0))
    {
       return true;
    }

    /* refuse to read outside the data */
    const int size = (int)d.size();
    if ((a_idx < 0) || (b_idx < 0) ||
        (len > size - a_idx) || (len > size - b_idx))
    {
       return false;
    }

    /* walk both runs in step; the offset must advance on every element */
    for (int off = 0; off < len; off++)
    {
       if (d[a_idx + off] != d[b_idx + off])
       {
          return false;
       }
    }
    return true;
 }


 /**
  * Parses a C++0x 'R' string. R"( xxx )" R"tag(  )tag" u8R"(x)" uR"(x)"
  * Newlines may be in the string.
  *
  * The text between the opening quote and the '(' is the tag; the string
  * ends at a ')' that is followed by the same tag and a closing quote.
  * If either the '(' or the terminator is missing, the cursor is rewound
  * and nothing is parsed.
  *
  * @param ctx    The input cursor, positioned on the first prefix character
  * @param pc     The chunk to update (str, type, nl_count)
  * @param q_idx  Offset of the opening quote from the current position
  * @return       Whether a raw string was parsed
  */
 static bool parse_cr_string(tok_ctx& ctx, chunk_t& pc, int q_idx)
 {
    /* the tag starts right after the opening quote */
    const int tag_start = ctx.c.idx + q_idx + 1;
    int       tag_len   = 0;

    ctx.save();

    /* Copy the prefix + " to the string */
    pc.str.clear();
    for (int i = 0; i <= q_idx; i++)
    {
       pc.str.append(ctx.get());
    }

    /* Copy the tag, measuring it as we go */
    while (ctx.more() && (ctx.peek() != '('))
    {
       pc.str.append(ctx.get());
       tag_len++;
    }
    if (ctx.peek() != '(')
    {
       /* hit the end of the input without finding the '(' */
       ctx.restore();
       return(false);
    }

    pc.type = CT_STRING;
    while (ctx.more())
    {
       bool at_end = (ctx.peek() == ')') &&
                     (ctx.peek(tag_len + 1) == '"') &&
                     tag_compare(ctx.data, tag_start, ctx.c.idx + 1, tag_len);
       if (at_end)
       {
          /* take the ')', the tag and the closing quote */
          for (int i = 0; i < tag_len + 2; i++)
          {
             pc.str.append(ctx.get());
          }
          parse_suffix(ctx, pc);
          return(true);
       }

       int ch = ctx.get();
       pc.str.append(ch);
       if (ch == '\n')
       {
          pc.nl_count++;
          pc.type = CT_STRING_MULTI;
       }
    }

    /* never found the terminator */
    ctx.restore();
    return(false);
 }


 /**
  * Count the number of characters in a word.
  * The first character is already valid for a keyword and is taken
  * unconditionally; the word then extends over every following keyword
  * character.
  *
  * Unless skipcheck is set (or a non-ASCII character is seen), the word is
  * then classified: as a macro name when it is the first item of a #define,
  * as a Java annotation, or by keyword lookup.
  *
  * @param ctx        The input cursor
  * @param pc         The structure to update, str is an input.
  * @param skipcheck  Leave the chunk as a plain CT_WORD, without classifying
  * @return           Whether a word was parsed (always true)
  */
 bool parse_word(tok_ctx& ctx, chunk_t& pc, bool skipcheck)
 {
    static unc_text intr_txt("@interface");

    /* The first character is already valid */
    pc.str.clear();
    pc.str.append(ctx.get());

    while (ctx.more() && CharTable::IsKw2(ctx.peek()))
    {
       int ch = ctx.get();
       pc.str.append(ch);

       /* HACK: Non-ASCII character are only allowed in identifiers */
       if (ch > 0x7f)
       {
          skipcheck = true;
       }
    }
    pc.type = CT_WORD;

    if (skipcheck)
    {
       return(true);
    }

    /* Detect pre-processor functions now */
    bool define_name = (cpd.in_preproc == CT_PP_DEFINE) &&
                       (cpd.preproc_ncnl_count == 1);
    if (define_name)
    {
       /* a '(' directly after the name makes it a function-like macro */
       pc.type = (ctx.peek() == '(') ? CT_MACRO_FUNC : CT_MACRO;
       return(true);
    }

    /* '@interface' is reserved, not an interface itself */
    if ((cpd.lang_flags & LANG_JAVA) && pc.str.startswith("@") &&
        !pc.str.equals(intr_txt))
    {
       pc.type = CT_ANNOTATION;
       return(true);
    }

    /* Turn it into a keyword now */
    pc.type = find_keyword_type(pc.str.c_str(), pc.str.size());
    return(true);
 }


 /**
  * Count the number of whitespace characters.
  *
  * Consumes the whole run of whitespace, tallying line endings by style
  * (CR, LF, CRLF) and tracking in pc.orig_prev_sp the width of the spaces
  * and tabs that follow the last line ending.  The resulting chunk has an
  * empty str and is a CT_NEWLINE if the run contained a line ending,
  * otherwise a CT_WHITESPACE.
  *
  * @param ctx  The input cursor
  * @param pc   The structure to update, str is an input.
  * @return     Whether whitespace was parsed
  */
 static bool parse_whitespace(tok_ctx& ctx, chunk_t& pc)
 {
    int newlines = 0;
    int last     = -2;   /* sentinel: stays -2 until a character is consumed */

    /* REVISIT: use a better whitespace detector? */
    while (ctx.more() && unc_isspace(ctx.peek()))
    {
       last = ctx.get();   /* throw away the whitespace char */

       if ((last == '\r') || (last == '\n'))
       {
          if (last == '\n')
          {
             /* LF ending */
             cpd.le_counts[LE_LF]++;
          }
          else if (ctx.expect('\n'))
          {
             /* CRLF ending */
             cpd.le_counts[LE_CRLF]++;
          }
          else
          {
             /* CR ending */
             cpd.le_counts[LE_CR]++;
          }
          newlines++;
          pc.orig_prev_sp = 0;
       }
       else if (last == '\t')
       {
          pc.orig_prev_sp += calc_next_tab_column(cpd.column, cpd.settings[UO_input_tab_size].n) - cpd.column;
       }
       else if (last == ' ')
       {
          pc.orig_prev_sp++;
       }
    }

    if (last == -2)
    {
       /* nothing was consumed */
       return(false);
    }

    pc.str.clear();
    pc.nl_count  = newlines;
    pc.type      = newlines ? CT_NEWLINE : CT_WHITESPACE;
    pc.after_tab = (ctx.c.last_ch == '\t');
    return(true);
 }


 /**
  * Called when we hit a backslash.
  * If there is nothing but whitespace until the newline, then this is a
  * backslash newline: the backslash, the whitespace and the line ending
  * (CR, LF or CRLF) are consumed and reported as one CT_NL_CONT chunk.
  * Otherwise the cursor is rewound to the backslash.
  *
  * @param ctx  The input cursor, positioned on the backslash
  * @param pc   The chunk to update (str, type, nl_count)
  * @return     Whether a backslash-newline was parsed
  */
 static bool parse_bs_newline(tok_ctx& ctx, chunk_t& pc)
 {
    ctx.save();
    ctx.get(); /* skip the '\' */

    while (ctx.more())
    {
       int ch = ctx.peek();
       if (!unc_isspace(ch))
       {
          /* something other than whitespace follows the backslash */
          break;
       }
       ctx.get();

       if ((ch != '\r') && (ch != '\n'))
       {
          /* plain whitespace: keep looking for the line ending */
          continue;
       }

       if (ch == '\r')
       {
          /* treat CRLF as a single line ending */
          ctx.expect('\n');
       }
       pc.str      = "\\";
       pc.type     = CT_NL_CONT;
       pc.nl_count = 1;
       return(true);
    }

    ctx.restore();
    return(false);
 }


 /**
  * Parses any number of tab or space chars followed by a newline.
  * Leaves the cursor where it was if a newline isn't found.
  * This is not the same as parse_whitespace() because it only consumes until
  * a single newline (LF, CR or CRLF) is encountered.
  *
  * @param ctx  The input cursor
  * @return     Whether a newline was consumed
  */
 static bool parse_newline(tok_ctx& ctx)
 {
    ctx.save();

    /* Eat whitespace */
    for (int ch = ctx.peek(); (ch == ' ') || (ch == '\t'); ch = ctx.peek())
    {
       ctx.get();
    }

    switch (ctx.peek())
    {
    case '\n':
       ctx.get();
       return(true);

    case '\r':
       /* CR, optionally followed by LF */
       ctx.get();
       ctx.expect('\n');
       return(true);

    default:
       break;
    }

    ctx.restore();
    return(false);
 }


 /**
  * Produces the next chunk while processing is turned off (cpd.unc_off).
  *
  * Blank lines become a CT_NEWLINE chunk.  Any other line is swallowed up
  * to its line ending as a CT_IGNORED chunk, unless it contains the text
  * that re-enables processing; in that case the line is re-read so that
  * the comment carrying that text can be parsed as a normal comment.
  *
  * @param ctx  The input cursor
  * @param pc   The chunk to update
  * @return     Whether a chunk was produced
  */
 static bool parse_ignored(tok_ctx& ctx, chunk_t& pc)
 {
    int nl_count = 0;

    /* Parse off newlines/blank lines */
    while (parse_newline(ctx))
    {
       nl_count++;
    }
    if (nl_count > 0)
    {
       pc.nl_count = nl_count;
       pc.type     = CT_NEWLINE;
       return(true);
    }

    /* See if the UO_enable_processing_cmt text is on this line */
    ctx.save();
    pc.str.clear();
    while (ctx.more() &&
           (ctx.peek() != '\r') &&
           (ctx.peek() != '\n'))
    {
       pc.str.append(ctx.get());
    }
    if (pc.str.size() == 0)
    {
       /* end of file? */
       return(false);
    }
    /* Note that we aren't actually making sure this is in a comment, yet */
    const char* ontext = cpd.settings[UO_enable_processing_cmt].str;
    if (ontext == NULL)
    {
       ontext = UNCRUSTIFY_ON_TEXT;
    }
    if (pc.str.find(ontext) < 0)
    {
       /* no marker on this line: the whole line is ignored text */
       pc.type = CT_IGNORED;
       return(true);
    }
    /* the marker is on this line: go back to its start and look closer */
    ctx.restore();

    /* parse off whitespace leading to the comment */
    if (parse_whitespace(ctx, pc))
    {
       pc.type = CT_IGNORED;
       return(true);
    }

    /* Look for the ending comment and let it pass */
    if (parse_comment(ctx, pc) && !cpd.unc_off)
    {
       return(true);
    }

    /* Reset the chunk & scan to until a newline.
     * NOTE(review): if parse_comment() consumed a comment above but
     * processing is still off, the cursor is not rewound here, so the text
     * of that comment is cleared from pc.str rather than kept -- confirm
     * that this is intended.
     */
    pc.str.clear();
    while (ctx.more() &&
           (ctx.peek() != '\r') &&
           (ctx.peek() != '\n'))
    {
       pc.str.append(ctx.get());
    }
    if (pc.str.size() > 0)
    {
       pc.type = CT_IGNORED;
       return(true);
    }
    return(false);
 }


 /**
  * Skips the next bit of whatever and returns the type of block.
  *
  * pc.str is the input text.
  * pc.len in the output length.
  * pc.type is the output type
  * pc.column is output column
  *
  * @param pc      The structure to update, str is an input.
  * @return        true/false - whether anything was parsed
  */
 static bool parse_next(tok_ctx& ctx, chunk_t& pc)
 {
    const chunk_tag_t *punc;
    int ch, ch1;

    if (!ctx.more())
    {
       //fprintf(stderr, "All done!\n");
       return(false);
    }

    /* Save off the current column */
    pc.orig_line = ctx.c.row;
    pc.column    = ctx.c.col;
    pc.orig_col  = ctx.c.col;
    pc.type      = CT_NONE;
    pc.nl_count  = 0;
    pc.flags     = 0;

    /* If it is turned off, we put everything except newlines into CT_UNKNOWN */
    if (cpd.unc_off)
    {
       if (parse_ignored(ctx, pc))
       {
          return(true);
       }
    }

    /**
     * Parse whitespace
     */
    if (parse_whitespace(ctx, pc))
    {
       return(true);
    }

    /**
     * Handle unknown/unhandled preprocessors
     */
    if ((cpd.in_preproc > CT_PP_BODYCHUNK) &&
        (cpd.in_preproc <= CT_PP_OTHER))
    {
       pc.str.clear();
       tok_info ss;
       ctx.save(ss);
       /* Chunk to a newline or comment */
       pc.type = CT_PREPROC_BODY;
       int last = 0;
       while (ctx.more())
       {
          int ch = ctx.peek();

          if ((ch == '\n') || (ch == '\r'))
          {
             /* Back off if this is an escaped newline */
             if (last == '\\')
             {
                ctx.restore(ss);
                pc.str.pop_back();
             }
             break;
          }

          /* Quit on a C++ comment start */
          if ((ch == '/') && (ctx.peek(1) == '/'))
          {
             break;
          }
          last = ch;
          ctx.save(ss);

          pc.str.append(ctx.get());
       }
       if (pc.str.size() > 0)
       {
          return(true);
       }
    }

    /**
     * Detect backslash-newline
     */
    if ((ctx.peek() == '\\') && parse_bs_newline(ctx, pc))
    {
       return(true);
    }

    /**
     * Parse comments
     */
    if (parse_comment(ctx, pc))
    {
       return(true);
    }

    /* Parse code placeholders */
    if (parse_code_placeholder(ctx, pc))
    {
       return(true);
    }

    /* Check for C# literal strings, ie @"hello" and identifiers @for*/
    if (((cpd.lang_flags & LANG_CS) != 0) && (ctx.peek() == '@'))
    {
       if (ctx.peek(1) == '"')
       {
          parse_cs_string(ctx, pc);
          return(true);
       }
       /* check for non-keyword identifiers such as @if @switch, etc */
       if (CharTable::IsKw1(ctx.peek(1)))
       {
          parse_word(ctx, pc, true);
          return(true);
       }
    }

    /* handle VALA """ strings """ */
    if (((cpd.lang_flags & LANG_VALA) != 0) &&
        (ctx.peek() == '"') &&
        (ctx.peek(1) == '"') &&
        (ctx.peek(2) == '"'))
    {
        parse_verbatim_string(ctx, pc);
        return true;
    }

    /* handle C++0x strings u8"x" u"x" U"x" R"x" u8R"XXX(I'm a "raw UTF-8" string.)XXX" */
    ch = ctx.peek();
    if (((cpd.lang_flags & LANG_CPP) != 0) &&
        ((ch == 'u') || (ch == 'U') || (ch == 'R')))
    {
       int idx = 0;
       bool is_real = false;

       if ((ch == 'u') && (ctx.peek(1) == '8'))
       {
          idx = 2;
       }
       else if (unc_tolower(ch) == 'u')
       {
          idx++;
       }

       if (ctx.peek(idx) == 'R')
       {
          idx++;
          is_real = true;
       }
       if (ctx.peek(idx) == '"')
       {
          if (is_real)
          {
             if (parse_cr_string(ctx, pc, idx))
             {
                return(true);
             }
          }
          else
          {
             if (parse_string(ctx, pc, idx, true))
             {
                parse_suffix(ctx, pc, true);
                return(true);
             }
          }
       }
    }

    /* PAWN specific stuff */
    if ((cpd.lang_flags & LANG_PAWN) != 0)
    {
       /* Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" */
       if ((ctx.peek() == '\\') || (ctx.peek() == '!'))
       {
          if (ctx.peek(1) == '"')
          {
             parse_string(ctx, pc, 1, (ctx.peek() == '!'));
             return(true);
          }
          else if (((ctx.peek(1) == '\\') || (ctx.peek(1) == '!')) &&
                   (ctx.peek(2) == '"'))
          {
             parse_string(ctx, pc, 2, false);
             return(true);
          }
       }
    }

    /**
     * Parse numbers first, then strings and character constants
     */

    if (parse_number(ctx, pc))
    {
       return(true);
    }

    if ((cpd.lang_flags & LANG_D) != 0)
    {
       /* D specific stuff */
       if (d_parse_string(ctx, pc))
       {
          return(true);
       }
    }
    else
    {
       /* Not D stuff */

       /* Check for L'a', L"abc", 'a', "abc", <abc> strings */
       ch  = ctx.peek();
       ch1 = ctx.peek(1);
       if ((((ch == 'L') || (ch == 'S')) &&
            ((ch1 == '"') || (ch1 == '\''))) ||
           (ch == '"') ||
           (ch == '\'') ||
           ((ch == '<') && (cpd.in_preproc == CT_PP_INCLUDE)))
       {
          parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true);
          return(true);
       }

       if ((ch == '<') && (cpd.in_preproc == CT_PP_DEFINE))
       {
          if (chunk_get_tail()->type == CT_MACRO)
          {
             /* We have "#define XXX <", assume '<' starts an include string */
             parse_string(ctx, pc, 0, false);
             return(true);
          }
       }
    }

    /* Check for Objective C literals and VALA identifiers ('@1', '@if')*/
    if ((cpd.lang_flags & (LANG_OC | LANG_VALA)) && (ctx.peek() == '@'))
    {
       int nc = ctx.peek(1);
       if ((nc == '"') || (nc == '\''))
       {
          /* literal string */
          parse_string(ctx, pc, 1, true);
          return true;
       }
       else if ((nc >= '0') && (nc <= '9'))
       {
          /* literal number */
          pc.str.append(ctx.get());  /* store the '@' */
          parse_number(ctx, pc);
          return true;
       }
    }

    /* Check for pawn/ObjectiveC/Java and normal identifiers */
    if (CharTable::IsKw1(ctx.peek()) ||
        ((ctx.peek() == '@') && CharTable::IsKw1(ctx.peek(1))))
    {
       parse_word(ctx, pc, false);
       return(true);
    }

    /* see if we have a punctuator */
    char punc_txt[4];
    punc_txt[0] = ctx.peek();
    punc_txt[1] = ctx.peek(1);
    punc_txt[2] = ctx.peek(2);
    punc_txt[3] = ctx.peek(3);
    if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != NULL)
    {
       int cnt = strlen(punc->tag);
       while (cnt--)
       {
          pc.str.append(ctx.get());
       }
       pc.type   = punc->type;
       pc.flags |= PCF_PUNCTUATOR;
       return(true);
    }

    /* throw away this character */
    pc.type = CT_UNKNOWN;
    pc.str.append(ctx.get());

    LOG_FMT(LWARN, "%s:%d Garbage in col %d: %x\n",
            cpd.filename, pc.orig_line, (int)ctx.c.col, pc.str[0]);
    cpd.error_count++;
    return(true);
 }


 /**
  * This function parses or tokenizes the whole buffer into a list.
  * It has to do some tricks to parse preprocessors.
  *
  * parse_next() is called repeatedly until the input is exhausted (or it
  * reports failure). Whitespace chunks are not added to the list; only their
  * after_tab / orig_prev_sp values are carried over to the next chunk.
  * Every other chunk is added with chunk_add_before(). While chunks are added,
  * cpd.in_preproc and cpd.preproc_ncnl_count are maintained so that chunks on
  * a preprocessor line are flagged with PCF_IN_PREPROC. When the loop ends,
  * cpd.newline is set from the UO_newlines setting and the cpd.le_counts
  * line-ending counters.
  *
  * If output_text() were called immediately after, two things would happen:
  *  - trailing whitespace is removed.
  *  - leading spaces & tabs are converted to the appropriate format.
  *
  * All the tokens are inserted before ref. If ref is NULL, they are inserted
  * at the end of the list.  Line numbers are relative to the start of the data.
  *
  * @param data  the input text, one int per character
  * @param ref   chunk to insert before, or NULL; when non-NULL every added
  *              chunk is also flagged with PCF_INSERTED
  */
 void tokenize(const deque<int>& data, chunk_t *ref)
 {
    tok_ctx            ctx(data);
    chunk_t            chunk;                /* scratch chunk filled by parse_next() */
    chunk_t            *pc    = NULL;        /* most recently added chunk */
    chunk_t            *rprev = NULL;        /* chunk added before the current one */
    struct parse_frame frm;
    bool               last_was_tab = false; /* after_tab value waiting for the next chunk */
    int                prev_sp = 0;          /* orig_prev_sp of the last whitespace chunk */

    /* NOTE(review): frm is zeroed here but not referenced again in this function */
    memset(&frm, 0, sizeof(frm));

    while (ctx.more())
    {
       chunk.reset();
       if (!parse_next(ctx, chunk))
       {
          LOG_FMT(LERR, "%s:%d Bailed before the end?\n",
                  cpd.filename, ctx.c.row);
          cpd.error_count++;
          break;
       }

       /* Don't create an entry for whitespace */
       if (chunk.type == CT_WHITESPACE)
       {
          /* Remember the whitespace details; they are applied to the next chunk */
          last_was_tab = chunk.after_tab;
          prev_sp = chunk.orig_prev_sp;
          continue;
       }
       /* Hand the pending whitespace value to this chunk and reset it */
       chunk.orig_prev_sp = prev_sp;
       prev_sp = 0;

       if (chunk.type == CT_NEWLINE)
       {
          /* A newline chunk carries no text; its after_tab value is passed on
           * to the chunk that follows instead of being kept on the newline */
          last_was_tab    = chunk.after_tab;
          chunk.after_tab = false;
          chunk.str.clear();
       }
       else if (chunk.type == CT_NL_CONT)
       {
          /* Same after_tab handling as a newline; the text is replaced with a
           * plain backslash + LF */
          last_was_tab    = chunk.after_tab;
          chunk.after_tab = false;
          chunk.str       = "\\\n";
       }
       else
       {
          /* Any other chunk consumes the pending after_tab value */
          chunk.after_tab = last_was_tab;
          last_was_tab    = false;
       }

       /* Strip trailing whitespace (for CPP comments and PP blocks) */
       while ((chunk.str.size() > 0) &&
              ((chunk.str[chunk.str.size() - 1] == ' ') ||
               (chunk.str[chunk.str.size() - 1] == '\t')))
       {
          // If comment contains backslash '\' followed by whitespace chars, keep last one;
          // this will prevent it from turning '\' into line continuation.
          if (chunk.str.size() > 1 && chunk.str[chunk.str.size() - 2] == '\\')
             break;
          chunk.str.pop_back();
       }

       /* Store off the end column */
       /* (ctx.c.col is the context's column after parse_next() consumed the chunk) */
       chunk.orig_col_end = ctx.c.col;

       /* Add the chunk to the list */
       /* NOTE(review): the new chunk is only added further down, so pc still
        * points at the previously added chunk here and rprev == pc. The |=
        * below therefore copies a chunk's flags onto itself, and the newline
        * check looks at the previous chunk rather than the new one. Possibly
        * this was meant to act on the new chunk - confirm before changing,
        * since it would alter which flags are propagated. */
       rprev = pc;
       if (rprev != NULL)
       {
          pc->flags |= rprev->flags & PCF_COPY_FLAGS;

          /* a newline can't be in a preprocessor */
          if (pc->type == CT_NEWLINE)
          {
             pc->flags &= ~PCF_IN_PREPROC;
          }
       }
       /* Mark the chunk as inserted only when an insertion point was given */
       if (ref != NULL)
       {
          chunk.flags |= PCF_INSERTED;
       }
       else
       {
          chunk.flags &= ~PCF_INSERTED;
       }
       /* From here on pc is the chunk returned by chunk_add_before() */
       pc = chunk_add_before(&chunk, ref);

       /* A newline marks the end of a preprocessor */
       if (pc->type == CT_NEWLINE) // || (pc->type == CT_COMMENT_MULTI))
       {
          cpd.in_preproc         = CT_NONE;
          cpd.preproc_ncnl_count = 0;
       }

       /* Special handling for preprocessor stuff */
       if (cpd.in_preproc != CT_NONE)
       {
          pc->flags |= PCF_IN_PREPROC;

          /* Count words after the preprocessor */
          if (!chunk_is_comment(pc) && !chunk_is_newline(pc))
          {
             cpd.preproc_ncnl_count++;
          }

          /* Figure out the type of preprocessor for #include parsing */
          /* in_preproc == CT_PREPROC means the previous chunk was the '#':
           * this chunk names the directive. Types outside the range
           * CT_PP_DEFINE..CT_PP_OTHER are retyped to CT_PP_OTHER, and the
           * resulting type becomes the new in_preproc state. */
          if (cpd.in_preproc == CT_PREPROC)
          {
             if ((pc->type < CT_PP_DEFINE) || (pc->type > CT_PP_OTHER))
             {
                set_chunk_type(pc, CT_PP_OTHER);
             }
             cpd.in_preproc = pc->type;
          }
       }
       else
       {
          /* Check for a preprocessor start */
          /* A CT_POUND only starts a preprocessor when it is the first chunk
           * added by this call or directly follows a newline chunk */
          if ((pc->type == CT_POUND) &&
              ((rprev == NULL) || (rprev->type == CT_NEWLINE)))
          {
             set_chunk_type(pc, CT_PREPROC);
             pc->flags     |= PCF_IN_PREPROC;
             cpd.in_preproc = CT_PREPROC;
          }
       }
    }

    /* Set the cpd.newline string for this file */
    /* An explicit LE_LF / LE_CRLF setting wins. With LE_AUTO the style with
     * the highest count in cpd.le_counts is used; because of the >= tests and
     * the order of the branches, ties go to LF first, then CRLF. Anything
     * else falls through to CR. */
    if ((cpd.settings[UO_newlines].le == LE_LF) ||
        ((cpd.settings[UO_newlines].le == LE_AUTO) &&
         (cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CRLF]) &&
         (cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CR])))
    {
       /* LF line ends */
       cpd.newline = "\n";
       LOG_FMT(LLINEENDS, "Using LF line endings\n");
    }
    else if ((cpd.settings[UO_newlines].le == LE_CRLF) ||
             ((cpd.settings[UO_newlines].le == LE_AUTO) &&
              (cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_LF]) &&
              (cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_CR])))
    {
       /* CRLF line ends */
       cpd.newline = "\r\n";
       LOG_FMT(LLINEENDS, "Using CRLF line endings\n");
    }
    else
    {
       /* CR line ends */
       cpd.newline = "\r";
       LOG_FMT(LLINEENDS, "Using CR line endings\n");
    }
 }


 // /**
 //  * A simplistic fixed-sized needle in the fixed-size haystack string search.
 //  */
 // int str_find(const char *needle, int needle_len,
 //              const char *haystack, int haystack_len)
 // {
 //    int idx;
 //
 //    for (idx = 0; idx < (haystack_len - needle_len); idx++)
 //    {
 //       if (memcmp(needle, haystack + idx, needle_len) == 0)
 //       {
 //          return(idx);
 //       }
 //    }
 //    return(-1);
 // }