blob: 00392e4dee036664fba372ad34da733825ac1bd1 [file] [log] [blame]
/**
* @file tokenize.cpp
* This file breaks up the text stream into tokens or chunks.
*
* Each routine needs to set pc.str and pc.type.
*
* @author Ben Gardner
* @license GPL v2+
*/
#include "uncrustify_types.h"
#include "char_table.h"
#include "prototypes.h"
#include "chunk_list.h"
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <cerrno>
#include "unc_ctype.h"
/**
 * Snapshot of a tokenizer position: the last character consumed, the index
 * of the next character to read, and the row/column counters (both start at 1).
 */
struct tok_info
{
   int last_ch;   /* most recently consumed character, 0 before any read */
   int idx;       /* index of the next character in the input */
   int row;       /* current row, starts at 1 */
   int col;       /* current column, starts at 1 */

   tok_info()
      : last_ch(0)
      , idx(0)
      , row(1)
      , col(1)
   {
   }
};
/**
 * Cursor over the input characters with a built-in save slot.
 *
 * 'c' is the live position and 's' is the slot used by the no-argument
 * save()/restore(). The one-argument overloads let a caller keep its own
 * tok_info so that nested attempts do not clobber the built-in slot.
 */
struct tok_ctx
{
   tok_ctx(const deque<int>& d) : data(d)
   {
   }

   /* Remember the current position in the built-in slot */
   void save()
   {
      s = c;
   }

   /* Copy the current position into a caller-owned slot */
   void save(tok_info& info)
   {
      info = c;
   }

   /* Go back to the position held in the built-in slot */
   void restore()
   {
      c = s;
   }

   /* Go back to a position held in a caller-owned slot */
   void restore(const tok_info& info)
   {
      c = info;
   }

   /* Is there at least one unread character? */
   bool more()
   {
      return((int)data.size() > c.idx);
   }

   /* Look at the next character without consuming it; -1 at end of input */
   int peek()
   {
      if (!more())
      {
         return(-1);
      }
      return(data[c.idx]);
   }

   /* Look 'idx' characters ahead without consuming; -1 past the end of input */
   int peek(int idx)
   {
      const int pos = c.idx + idx;

      if (pos < (int)data.size())
      {
         return(data[pos]);
      }
      return(-1);
   }

   /**
    * Consume and return the next character (-1 at end of input), keeping the
    * row/column counters up to date. A '\n' directly after a '\r' does not
    * start another row, so CRLF advances the row only once.
    */
   int get()
   {
      if (!more())
      {
         return(-1);
      }

      const int ch = data[c.idx++];

      if (ch == '\t')
      {
         c.col = calc_next_tab_column(c.col, cpd.settings[UO_input_tab_size].n);
      }
      else if (ch == '\r')
      {
         c.row++;
         c.col = 1;
      }
      else if (ch == '\n')
      {
         if (c.last_ch != '\r')
         {
            c.row++;
            c.col = 1;
         }
      }
      else
      {
         c.col++;
      }
      c.last_ch = ch;
      return(ch);
   }

   /* Consume the next character only if it equals 'ch' */
   bool expect(int ch)
   {
      if (peek() != ch)
      {
         return(false);
      }
      get();
      return(true);
   }

   const deque<int>& data;
   tok_info          c; /* current */
   tok_info          s; /* saved */
};
/* Forward declaration: d_parse_string() below calls parse_string() before it is defined */
static bool parse_string(tok_ctx& ctx, chunk_t& pc, int quote_idx, bool allow_escape);
/**
* Parses all legal D string constants.
*
* Quoted strings:
* r"Wysiwyg" # WYSIWYG string
* x"hexstring" # Hexadecimal array
* `Wysiwyg` # WYSIWYG string
* 'char' # single character
* "reg_string" # regular string
*
* Non-quoted strings:
* \x12 # 1-byte hex constant
* \u1234 # 2-byte hex constant
* \U12345678 # 4-byte hex constant
* \123 # octal constant
* \&amp; # named entity
* \n # single character
*
* @param pc The structure to update, str is an input.
* @return Whether a string was parsed
*/
static bool d_parse_string(tok_ctx& ctx, chunk_t& pc)
{
int ch = ctx.peek();
if ((ch == '"') || (ch == '\'') || (ch == '`'))
{
return(parse_string(ctx, pc, 0, true));
}
else if (ch == '\\')
{
ctx.save();
int cnt;
pc.str.clear();
while (ctx.peek() == '\\')
{
pc.str.append(ctx.get());
/* Check for end of file */
switch (ctx.peek())
{
case 'x':
/* \x HexDigit HexDigit */
cnt = 3;
while (cnt--)
{
pc.str.append(ctx.get());
}
break;
case 'u':
/* \u HexDigit HexDigit HexDigit HexDigit */
cnt = 5;
while (cnt--)
{
pc.str.append(ctx.get());
}
break;
case 'U':
/* \U HexDigit (x8) */
cnt = 9;
while (cnt--)
{
pc.str.append(ctx.get());
}
break;
case '0':
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
/* handle up to 3 octal digits */
pc.str.append(ctx.get());
ch = ctx.peek();
if ((ch >= '0') && (ch <= '7'))
{
pc.str.append(ctx.get());
ch = ctx.peek();
if ((ch >= '0') && (ch <= '7'))
{
pc.str.append(ctx.get());
}
}
break;
case '&':
/* \& NamedCharacterEntity ; */
pc.str.append(ctx.get());
while (unc_isalpha(ctx.peek()))
{
pc.str.append(ctx.get());
}
if (ctx.peek() == ';')
{
pc.str.append(ctx.get());
}
break;
default:
/* Everything else is a single character */
pc.str.append(ctx.get());
break;
}
}
if (pc.str.size() > 1)
{
pc.type = CT_STRING;
return(true);
}
ctx.restore();
}
else if (((ch == 'r') || (ch == 'x')) && (ctx.peek(1) == '"'))
{
return(parse_string(ctx, pc, 1, false));
}
return(false);
}
// /**
// * A string-in-string search. Like strstr() with a haystack length.
// */
// static const char *str_search(const char *needle, const char *haystack, int haystack_len)
// {
// int needle_len = strlen(needle);
//
// while (haystack_len-- >= needle_len)
// {
// if (memcmp(needle, haystack, needle_len) == 0)
// {
// return(haystack);
// }
// haystack++;
// }
// return(NULL);
// }
/**
* Parses the comment at the current position, if there is one.
* The next bit of text starts with a '/', so it might be a comment.
* There are three types of comments:
* - C comments that start with '/ *' and end with '* /'
* - C++ comments that start with //
* - D nestable comments '/+' '+/'
*
* On success pc.str holds the comment text and pc.type is CT_COMMENT_CPP,
* CT_COMMENT, or CT_COMMENT_MULTI (a C or D comment containing a line ending).
* pc.nl_count is incremented for every line ending taken into the comment.
* Afterwards cpd.unc_off is switched on or off if the comment text contains
* the disable/enable-processing marker.
*
* @param ctx The tokenizer position. It is restored only when the input ends
*            right after the opening '/ *' or '/ +'.
* @param pc The chunk to update.
* @return Whether a comment was parsed
*/
static bool parse_comment(tok_ctx& ctx, chunk_t& pc)
{
int ch;
bool is_d = (cpd.lang_flags & LANG_D) != 0;
bool is_cs = (cpd.lang_flags & LANG_CS) != 0;
int d_level = 0; /* nesting depth of D '/ +' comments */
int bs_cnt; /* number of consecutive backslashes at the end of a C++ comment line */
/* does this start with '/ /' or '/ *' or '/ +' (d) */
if ((ctx.peek() != '/') ||
((ctx.peek(1) != '*') && (ctx.peek(1) != '/') &&
((ctx.peek(1) != '+') || !is_d)))
{
return(false);
}
ctx.save();
/* account for opening two chars */
pc.str = ctx.get(); /* opening '/' */
ch = ctx.get();
pc.str.append(ch); /* second char */
if (ch == '/')
{
/* C++ comment: runs to the end of the line, or further if the line ends in a backslash */
pc.type = CT_COMMENT_CPP;
while (true)
{
bs_cnt = 0;
while (ctx.more())
{
ch = ctx.peek();
if ((ch == '\r') || (ch == '\n'))
{
break;
}
if (ch == '\\' && !is_cs) /* backslashes aren't special in comments in C# */
{
bs_cnt++;
}
else
{
bs_cnt = 0;
}
pc.str.append(ctx.get());
}
/* If we hit an odd number of backslashes right before the newline,
* then we keep going.
*/
if (((bs_cnt & 1) == 0) || !ctx.more())
{
break;
}
/* The line ending is escaped: store it (CR, LF or CRLF) and continue on the next line */
if (ctx.peek() == '\r')
{
pc.str.append(ctx.get());
}
if (ctx.peek() == '\n')
{
pc.str.append(ctx.get());
}
pc.nl_count++;
cpd.did_newline = true;
}
}
else if (!ctx.more())
{
/* unexpected end of file */
ctx.restore();
return(false);
}
else if (ch == '+')
{
/* D nestable comment: track the depth until the outermost '+/' or end of input */
pc.type = CT_COMMENT;
d_level++;
while ((d_level > 0) && ctx.more())
{
if ((ctx.peek() == '+') && (ctx.peek(1) == '/'))
{
pc.str.append(ctx.get()); /* store the '+' */
pc.str.append(ctx.get()); /* store the '/' */
d_level--;
continue;
}
if ((ctx.peek() == '/') && (ctx.peek(1) == '+'))
{
pc.str.append(ctx.get()); /* store the '/' */
pc.str.append(ctx.get()); /* store the '+' */
d_level++;
continue;
}
ch = ctx.get();
pc.str.append(ch);
if ((ch == '\n') || (ch == '\r'))
{
/* a line ending makes this a multi-line comment; also tally the line-ending style */
pc.type = CT_COMMENT_MULTI;
pc.nl_count++;
if (ch == '\r')
{
if (ctx.peek() == '\n')
{
cpd.le_counts[LE_CRLF]++;
pc.str.append(ctx.get()); /* store the '\n' */
}
else
{
cpd.le_counts[LE_CR]++;
}
}
else
{
cpd.le_counts[LE_LF]++;
}
}
}
}
else /* must be '/ *' */
{
pc.type = CT_COMMENT;
while (ctx.more())
{
if ((ctx.peek() == '*') && (ctx.peek(1) == '/'))
{
pc.str.append(ctx.get()); /* store the '*' */
pc.str.append(ctx.get()); /* store the '/' */
/* use a local save slot so the ctx.save() made at the comment start is kept */
tok_info ss;
ctx.save(ss);
int oldsize = pc.str.size();
/* If there is another C comment right after this one, combine them */
while ((ctx.peek() == ' ') || (ctx.peek() == '\t'))
{
pc.str.append(ctx.get());
}
if ((ctx.peek() != '/') || (ctx.peek(1) != '*'))
{
/* undo the attempt to join */
ctx.restore(ss);
pc.str.resize(oldsize);
break;
}
/* NOTE(review): on a successful join the get() below consumes the '/'
* that opens the following comment; its '*' is examined by the next
* iteration's end-of-comment test.
*/
}
ch = ctx.get();
pc.str.append(ch);
if ((ch == '\n') || (ch == '\r'))
{
/* a line ending makes this a multi-line comment; also tally the line-ending style */
pc.type = CT_COMMENT_MULTI;
pc.nl_count++;
if (ch == '\r')
{
if (ctx.peek() == '\n')
{
cpd.le_counts[LE_CRLF]++;
pc.str.append(ctx.get()); /* store the '\n' */
}
else
{
cpd.le_counts[LE_CR]++;
}
}
else
{
cpd.le_counts[LE_LF]++;
}
}
}
}
/* Look for the marker text that switches processing on or off */
if (cpd.unc_off)
{
/* processing is currently off: look for the text that turns it back on */
const char* ontext = cpd.settings[UO_enable_processing_cmt].str;
if (ontext == NULL)
{
ontext = UNCRUSTIFY_ON_TEXT;
}
if (pc.str.find(ontext) >= 0)
{
LOG_FMT(LBCTRL, "Found '%s' on line %d\n", ontext, pc.orig_line);
cpd.unc_off = false;
}
}
else
{
/* processing is currently on: look for the text that turns it off */
const char* offtext = cpd.settings[UO_disable_processing_cmt].str;
if (offtext == NULL)
{
offtext = UNCRUSTIFY_OFF_TEXT;
}
if (pc.str.find(offtext) >= 0)
{
LOG_FMT(LBCTRL, "Found '%s' on line %d\n", offtext, pc.orig_line);
cpd.unc_off = true;
}
}
return(true);
}
/**
 * Parses an Xcode code placeholder, if one starts at the current position.
 * Xcode sometimes inserts temporary placeholder chunks, which in plain text
 * <#look like this#>.
 *
 * @param ctx  The tokenizer position; restored if no closing '#>' is found
 * @param pc   The chunk to update; becomes a CT_WORD holding the placeholder text
 * @return     Whether a placeholder was parsed.
 */
static bool parse_code_placeholder(tok_ctx& ctx, chunk_t& pc)
{
   const bool at_opening = (ctx.peek() == '<') && (ctx.peek(1) == '#');

   if (!at_opening)
   {
      return(false);
   }

   ctx.save();

   /* take the opening '<#' */
   pc.str = ctx.get();
   pc.str.append(ctx.get());

   /* copy everything up to and including '#>'; give up at end of input */
   int prev = 0;
   while (ctx.more())
   {
      const int cur = ctx.get();

      pc.str.append(cur);
      if ((cur == '>') && (prev == '#'))
      {
         pc.type = CT_WORD;
         return(true);
      }
      prev = cur;
   }

   ctx.restore();
   return(false);
}
/**
 * Appends any attached suffix to pc.str; this may be a user-defined literal
 * suffix. Nothing is taken unless the next character can start a keyword.
 *
 * When called for a string (forstring == true):
 *  - a following L" or L' or S" is the start of another string, not a suffix
 *  - a suffix of four or more characters starting with PRI or SCN (format and
 *    scan specifiers such as PRIx32 and SCNx64) is put back
 */
static void parse_suffix(tok_ctx& ctx, chunk_t& pc, bool forstring = false)
{
   const int first = ctx.peek();

   if (!CharTable::IsKw1(first))
   {
      return;
   }

   if (forstring)
   {
      /* don't add the suffix if we see L" or L' or S" */
      const int  second   = ctx.peek(1);
      const bool l_prefix = (first == 'L') && ((second == '"') || (second == '\''));
      const bool s_prefix = (first == 'S') && (second == '"');

      if (l_prefix || s_prefix)
      {
         return;
      }
   }

   const int base_len = pc.str.size();
   tok_info  mark;
   ctx.save(mark);

   int taken = 0;
   while (ctx.more() && CharTable::IsKw2(ctx.peek()))
   {
      pc.str.append(ctx.get());
      taken++;
   }

   if (!forstring || (taken < 4))
   {
      return;
   }

   if (pc.str.startswith("PRI", base_len) ||
       pc.str.startswith("SCN", base_len))
   {
      /* looks like a printf/scanf macro: undo the suffix */
      ctx.restore(mark);
      pc.str.resize(base_len);
   }
}
/** Tests whether ch is a binary digit ('0' or '1'). */
static bool is_bin(int ch)
{
   switch (ch)
   {
   case '0':
   case '1':
      return(true);

   default:
      return(false);
   }
}


/** Tests whether ch is a binary digit or an underscore. */
static bool is_bin_(int ch)
{
   if (ch == '_')
   {
      return(true);
   }
   return(is_bin(ch));
}
/** Tests whether ch is an octal digit ('0' through '7'). */
static bool is_oct(int ch)
{
   if ((ch < '0') || (ch > '7'))
   {
      return(false);
   }
   return(true);
}


/** Tests whether ch is an octal digit or an underscore. */
static bool is_oct_(int ch)
{
   if (ch == '_')
   {
      return(true);
   }
   return(is_oct(ch));
}
/** Tests whether ch is a decimal digit ('0' through '9'). */
static bool is_dec(int ch)
{
   if ((ch < '0') || (ch > '9'))
   {
      return(false);
   }
   return(true);
}


/** Tests whether ch is a decimal digit or an underscore. */
static bool is_dec_(int ch)
{
   if (ch == '_')
   {
      return(true);
   }
   return(is_dec(ch));
}
/** Tests whether ch is a hexadecimal digit (0-9, a-f or A-F). */
static bool is_hex(int ch)
{
   if ((ch >= '0') && (ch <= '9'))
   {
      return(true);
   }
   if ((ch >= 'a') && (ch <= 'f'))
   {
      return(true);
   }
   if ((ch >= 'A') && (ch <= 'F'))
   {
      return(true);
   }
   return(false);
}


/** Tests whether ch is a hexadecimal digit or an underscore. */
static bool is_hex_(int ch)
{
   if (ch == '_')
   {
      return(true);
   }
   return(is_hex(ch));
}
/**
* Parses a number and appends its text to pc.str.
* The next bit of text must start with a digit, or with a '.' followed by a
* digit, to be treated as a number.
*
* This should cover all number formats for all languages.
* Note that this is not a strict parser. It will happily parse numbers in
* an invalid format.
*
* For example, only D allows underscores in the numbers, but they are
* allowed in all formats.
*
* pc.str is appended to, not cleared, so a caller may store a prefix first.
*
* @param ctx The tokenizer position; nothing is consumed when false is returned.
* @param pc The chunk to update; pc.type becomes CT_NUMBER or CT_NUMBER_FP.
* @return Whether a number was parsed
*/
static bool parse_number(tok_ctx& ctx, chunk_t& pc)
{
int tmp;
bool is_float;
bool did_hex = false;
/* A number must start with a digit or a dot, followed by a digit */
if (!is_dec(ctx.peek()) &&
((ctx.peek() != '.') || !is_dec(ctx.peek(1))))
{
return(false);
}
is_float = (ctx.peek() == '.');
/* NOTE(review): the test above already requires a digit after a leading '.',
* so this '..' check looks unreachable -- confirm before removing.
*/
if (is_float && (ctx.peek(1) == '.'))
{
return(false);
}
/* Check for Hex, Octal, or Binary
* Note that only D and Pawn support binary, but who cares?
*/
if (ctx.peek() == '0')
{
pc.str.append(ctx.get()); /* store the '0' */
switch (unc_toupper(ctx.peek()))
{
case 'X': /* hex */
did_hex = true;
do
{
pc.str.append(ctx.get()); /* store the 'x' and then the rest */
} while (is_hex_(ctx.peek()));
break;
case 'B': /* binary */
do
{
pc.str.append(ctx.get()); /* store the 'b' and then the rest */
} while (is_bin_(ctx.peek()));
break;
case '0': /* octal or decimal */
case '1':
case '2':
case '3':
case '4':
case '5':
case '6':
case '7':
case '8':
case '9':
/* the first digit is taken unconditionally; the rest must be octal digits or '_' */
do
{
pc.str.append(ctx.get());
} while (is_oct_(ctx.peek()));
break;
default:
/* either just 0 or 0.1 or 0UL, etc */
break;
}
}
else
{
/* Regular int or float */
while (is_dec_(ctx.peek()))
{
pc.str.append(ctx.get());
}
}
/* Check if we stopped on a decimal point & make sure it isn't '..' */
if ((ctx.peek() == '.') && (ctx.peek(1) != '.'))
{
pc.str.append(ctx.get());
is_float = true;
/* the fraction uses the same digit set as the integer part */
if (did_hex)
{
while (is_hex_(ctx.peek()))
{
pc.str.append(ctx.get());
}
}
else
{
while (is_dec_(ctx.peek()))
{
pc.str.append(ctx.get());
}
}
}
/* Check exponent
* Valid exponents per language (not that it matters):
* C/C++/D/Java: eEpP
* C#/Pawn: eE
*/
tmp = unc_toupper(ctx.peek());
if ((tmp == 'E') || (tmp == 'P'))
{
is_float = true;
pc.str.append(ctx.get());
/* optional sign, then the exponent digits */
if ((ctx.peek() == '+') || (ctx.peek() == '-'))
{
pc.str.append(ctx.get());
}
while (is_dec_(ctx.peek()))
{
pc.str.append(ctx.get());
}
}
/* Check the suffixes
* Valid suffixes per language (not that it matters):
* Integer Float
* C/C++: uUlL64 lLfF
* C#: uUlL fFdDMm
* D: uUL ifFL
* Java: lL fFdD
* Pawn: (none) (none)
*
* Note that i, f, d, and m only appear in floats.
*/
while (1)
{
tmp = unc_toupper(ctx.peek());
if ((tmp == 'I') || (tmp == 'F') || (tmp == 'D') || (tmp == 'M'))
{
is_float = true;
}
else if ((tmp != 'L') && (tmp != 'U'))
{
break;
}
pc.str.append(ctx.get());
}
/* skip the Microsoft-specific '64' suffix */
if ((ctx.peek() == '6') && (ctx.peek(1) == '4'))
{
pc.str.append(ctx.get());
pc.str.append(ctx.get());
}
pc.type = is_float ? CT_NUMBER_FP : CT_NUMBER;
/* If there is anything left, then we are probably dealing with garbage or
* some sick macro junk. Eat it.
*/
parse_suffix(ctx, pc);
return(true);
}
/**
 * Parses a quoted string or character constant.
 * The text at the current position is 'quote_idx' prefix characters followed
 * by the opening quote character. The closing character is obtained from the
 * opening one through CharTable::Get() (presumably the paired character, so
 * that '<' ends at '>' -- confirm against char_table.h).
 *
 * Characters are copied until an unescaped closing character or the end of
 * the input. A line ending inside the string (LF, CRLF or a lone CR) counts
 * as one newline and turns the chunk into CT_STRING_MULTI. When the
 * string_replace_tab_chars option applies, a literal tab is stored as the
 * escape character followed by 't'. Any attached suffix is then added via
 * parse_suffix().
 *
 * @param ctx           The tokenizer position
 * @param pc            The chunk to fill in (pc.str, pc.type, pc.nl_count)
 * @param quote_idx     Number of prefix characters before the opening quote
 * @param allow_escape  Whether the secondary escape character may escape the
 *                      closing character; the primary escape character is
 *                      honoured regardless of this flag
 * @return              Whether a string was parsed (always true)
 */
static bool parse_string(tok_ctx& ctx, chunk_t& pc, int quote_idx, bool allow_escape)
{
   bool escaped            = false;
   int  end_ch;
   char escape_char        = cpd.settings[UO_string_escape_char].n;
   char escape_char2       = cpd.settings[UO_string_escape_char2].n;
   bool should_escape_tabs = cpd.settings[UO_string_replace_tab_chars].b && (cpd.lang_flags & LANG_ALLC);

   pc.str.clear();
   while (quote_idx-- > 0)
   {
      pc.str.append(ctx.get());
   }

   pc.type = CT_STRING;
   end_ch  = CharTable::Get(ctx.peek()) & 0xff;
   pc.str.append(ctx.get());  /* store the " */

   while (ctx.more())
   {
      int lastcol = ctx.c.col;
      int ch      = ctx.get();

      if ((ch == '\t') && should_escape_tabs)
      {
         /* the tab becomes two characters, so the column advances by two */
         ctx.c.col = lastcol + 2;
         pc.str.append(escape_char);
         pc.str.append('t');
         continue;
      }

      pc.str.append(ch);

      if (ch == '\n')
      {
         pc.nl_count++;
         pc.type = CT_STRING_MULTI;
         escaped = false;
         continue;
      }
      if ((ch == '\r') && (ctx.peek() != '\n'))
      {
         /* Lone CR line ending. The CR was stored above; the character after
          * it must not be consumed here, because it may be the closing quote.
          * (For CRLF the newline is counted when the LF is read.)
          */
         pc.nl_count++;
         pc.type = CT_STRING_MULTI;
         escaped = false;
         continue;
      }

      if (!escaped)
      {
         if (ch == escape_char)
         {
            /* an escape character of 0 means escaping is disabled */
            escaped = (escape_char != 0);
         }
         else if ((ch == escape_char2) && (ctx.peek() == end_ch))
         {
            escaped = allow_escape;
         }
         else if (ch == end_ch)
         {
            break;
         }
      }
      else
      {
         escaped = false;
      }
   }

   parse_suffix(ctx, pc, true);
   return(true);
}
/**
 * Parses a C# literal (verbatim) string, which starts with @" and ends with
 * a single ". Two "" in a row stand for an embedded quote and don't end the
 * string.
 *
 * A line ending inside the string turns the chunk into CT_STRING_MULTI and
 * bumps pc.nl_count once per line ending; CRLF counts as one, matching how
 * comments and whitespace count it.
 *
 * @param ctx  The tokenizer position; must be at the '@' of an @" sequence
 * @param pc   The chunk to fill in (pc.str, pc.type, pc.nl_count)
 * @return     Whether a string was parsed (always true)
 */
static bool parse_cs_string(tok_ctx& ctx, chunk_t& pc)
{
   /* take the two opening characters */
   pc.str = ctx.get();
   pc.str.append(ctx.get());
   pc.type = CT_STRING;

   /* go until we hit the end of the input or a single " */
   while (ctx.more())
   {
      int ch = ctx.get();

      pc.str.append(ch);
      if ((ch == '\n') || (ch == '\r'))
      {
         pc.type = CT_STRING_MULTI;
         pc.nl_count++;
         if ((ch == '\r') && (ctx.peek() == '\n'))
         {
            /* CRLF is one line ending: take the LF without counting again */
            pc.str.append(ctx.get());
         }
      }
      else if (ch == '"')
      {
         if (ctx.peek() != '"')
         {
            break;
         }
         /* doubled quote: part of the string */
         pc.str.append(ctx.get());
      }
   }
   return(true);
}
/**
 * Parses a VALA verbatim string, which starts and ends with three quotes (""").
 *
 * A line ending inside the string turns the chunk into CT_STRING_MULTI and
 * bumps pc.nl_count once per line ending; CRLF counts as one, matching how
 * comments and whitespace count it.
 *
 * @param ctx  The tokenizer position; must be at the opening """
 * @param pc   The chunk to fill in (pc.str, pc.type, pc.nl_count)
 */
static void parse_verbatim_string(tok_ctx& ctx, chunk_t& pc)
{
   pc.type = CT_STRING;

   /* consume the initial """ */
   pc.str = ctx.get();
   pc.str.append(ctx.get());
   pc.str.append(ctx.get());

   /* go until we hit the end of the input or a """ */
   while (ctx.more())
   {
      int ch = ctx.get();

      pc.str.append(ch);
      if ((ch == '"') &&
          (ctx.peek() == '"') &&
          (ctx.peek(1) == '"'))
      {
         /* closing """: store the remaining two quotes */
         pc.str.append(ctx.get());
         pc.str.append(ctx.get());
         break;
      }
      if ((ch == '\n') || (ch == '\r'))
      {
         pc.type = CT_STRING_MULTI;
         pc.nl_count++;
         if ((ch == '\r') && (ctx.peek() == '\n'))
         {
            /* CRLF is one line ending: take the LF without counting again */
            pc.str.append(ctx.get());
         }
      }
   }
}
/**
 * Compares two runs of 'len' characters inside the same buffer.
 *
 * @param d      The character buffer
 * @param a_idx  Start index of the first run
 * @param b_idx  Start index of the second run
 * @param len    Number of characters to compare
 * @return       true if the runs are equal. Identical start indices or a
 *               non-positive length always compare equal; a run that does
 *               not fit inside the buffer never matches.
 */
static bool tag_compare(const deque<int>& d, int a_idx, int b_idx, int len)
{
   if ((a_idx == b_idx) || (len <= 0))
   {
      return(true);
   }

   /* refuse to read outside the buffer */
   const int size = (int)d.size();
   if ((a_idx < 0) || (b_idx < 0) ||
       (a_idx > size - len) || (b_idx > size - len))
   {
      return(false);
   }

   /* walk both runs in step; every character of the tag must match */
   for (int off = 0; off < len; off++)
   {
      if (d[a_idx + off] != d[b_idx + off])
      {
         return(false);
      }
   }
   return(true);
}
/**
 * Parses a C++0x 'R' (raw) string:  R"( xxx )"  R"tag( )tag"  u8R"(x)"  uR"(x)"
 * Newlines may be in the string.
 *
 * @param ctx    The tokenizer position; restored if the text is not a
 *               complete raw string
 * @param pc     The chunk to fill in
 * @param q_idx  Offset of the opening quote from the current position
 * @return       Whether a raw string was parsed
 */
static bool parse_cr_string(tok_ctx& ctx, chunk_t& pc, int q_idx)
{
   /* the tag starts right after the opening quote */
   const int tag_start = ctx.c.idx + q_idx + 1;
   int       tag_len   = 0;

   ctx.save();

   /* Copy the prefix + " to the string */
   pc.str.clear();
   for (int left = q_idx + 1; left != 0; left--)
   {
      pc.str.append(ctx.get());
   }

   /* Copy the tag, measuring it as we go */
   while (ctx.more() && (ctx.peek() != '('))
   {
      pc.str.append(ctx.get());
      tag_len++;
   }

   if (ctx.peek() != '(')
   {
      ctx.restore();
      return(false);
   }

   pc.type = CT_STRING;
   while (ctx.more())
   {
      const int  ch       = ctx.peek();
      const bool at_close = (ch == ')') &&
                            (ctx.peek(tag_len + 1) == '"') &&
                            tag_compare(ctx.data, tag_start, ctx.c.idx + 1, tag_len);

      if (at_close)
      {
         /* take the ')' + tag + '"' */
         for (int left = tag_len + 2; left != 0; left--)
         {
            pc.str.append(ctx.get());
         }
         parse_suffix(ctx, pc);
         return(true);
      }

      pc.str.append(ctx.get());
      if (ch == '\n')
      {
         pc.nl_count++;
         pc.type = CT_STRING_MULTI;
      }
   }

   /* ran out of input before the closing sequence */
   ctx.restore();
   return(false);
}
/**
 * Parses a word (identifier or keyword) into pc.str and classifies it.
 * The first character is taken as-is; the caller has already decided that it
 * may start a word.
 *
 * @param ctx        The tokenizer position
 * @param pc         The chunk to fill in (pc.str and pc.type)
 * @param skipcheck  If true, leave the type as CT_WORD and skip the macro,
 *                   annotation and keyword detection
 * @return           Whether a word was parsed (always true)
 */
bool parse_word(tok_ctx& ctx, chunk_t& pc, bool skipcheck)
{
   static unc_text intr_txt("@interface");

   /* The first character is already valid */
   pc.str.clear();
   pc.str.append(ctx.get());

   while (ctx.more() && CharTable::IsKw2(ctx.peek()))
   {
      const int ch = ctx.get();

      pc.str.append(ch);

      /* HACK: Non-ASCII character are only allowed in identifiers */
      if (ch > 0x7f)
      {
         skipcheck = true;
      }
   }
   pc.type = CT_WORD;

   if (skipcheck)
   {
      return(true);
   }

   if ((cpd.in_preproc == CT_PP_DEFINE) &&
       (cpd.preproc_ncnl_count == 1))
   {
      /* First word after '#define': a macro, function-like if '(' follows directly */
      pc.type = (ctx.peek() == '(') ? CT_MACRO_FUNC : CT_MACRO;
   }
   else if ((cpd.lang_flags & LANG_JAVA) && pc.str.startswith("@") &&
            !pc.str.equals(intr_txt))
   {
      /* '@interface' is reserved, not an annotation itself */
      pc.type = CT_ANNOTATION;
   }
   else
   {
      /* Turn it into a keyword now */
      pc.type = find_keyword_type(pc.str.c_str(), pc.str.size());
   }
   return(true);
}
/**
 * Consumes a run of whitespace characters.
 *
 * Line endings are tallied by style in cpd.le_counts and counted in
 * pc.nl_count. pc.orig_prev_sp accumulates the width of spaces and tabs and
 * is reset to 0 at each line ending.
 *
 * @param ctx  The tokenizer position
 * @param pc   The chunk to update; becomes CT_NEWLINE if any line ending was
 *             seen, otherwise CT_WHITESPACE
 * @return     Whether whitespace was parsed
 */
static bool parse_whitespace(tok_ctx& ctx, chunk_t& pc)
{
   int  newlines = 0;
   bool consumed = false;

   /* REVISIT: use a better whitespace detector? */
   while (ctx.more() && unc_isspace(ctx.peek()))
   {
      const int ch = ctx.get(); /* throw away the whitespace char */

      consumed = true;
      if (ch == '\r')
      {
         if (ctx.expect('\n'))
         {
            /* CRLF ending */
            cpd.le_counts[LE_CRLF]++;
         }
         else
         {
            /* CR ending */
            cpd.le_counts[LE_CR]++;
         }
         newlines++;
         pc.orig_prev_sp = 0;
      }
      else if (ch == '\n')
      {
         /* LF ending */
         cpd.le_counts[LE_LF]++;
         newlines++;
         pc.orig_prev_sp = 0;
      }
      else if (ch == '\t')
      {
         pc.orig_prev_sp += calc_next_tab_column(cpd.column, cpd.settings[UO_input_tab_size].n) - cpd.column;
      }
      else if (ch == ' ')
      {
         pc.orig_prev_sp++;
      }
   }

   if (!consumed)
   {
      return(false);
   }

   pc.str.clear();
   pc.nl_count  = newlines;
   pc.type      = newlines ? CT_NEWLINE : CT_WHITESPACE;
   pc.after_tab = (ctx.c.last_ch == '\t');
   return(true);
}
/**
 * Called when we hit a backslash.
 * If only whitespace separates the backslash from the next line ending, the
 * whole sequence is a backslash-newline (line continuation).
 *
 * @param ctx  The tokenizer position, at the backslash; restored on failure
 * @param pc   The chunk to update; becomes a CT_NL_CONT with one newline
 * @return     Whether a backslash-newline was parsed
 */
static bool parse_bs_newline(tok_ctx& ctx, chunk_t& pc)
{
   ctx.save();
   ctx.get(); /* skip the '\' */

   while (ctx.more())
   {
      const int ch = ctx.peek();

      if (!unc_isspace(ch))
      {
         break;
      }
      ctx.get();

      if ((ch != '\r') && (ch != '\n'))
      {
         /* blank before the line ending: keep looking */
         continue;
      }

      if (ch == '\r')
      {
         /* take the LF of a CRLF pair, if present */
         ctx.expect('\n');
      }
      pc.str      = "\\";
      pc.type     = CT_NL_CONT;
      pc.nl_count = 1;
      return(true);
   }

   ctx.restore();
   return(false);
}
/**
 * Consumes any number of tab or space chars followed by one line ending
 * (LF, CR or CRLF). Nothing is consumed if no line ending follows the blanks.
 * This is not the same as parse_whitespace() because it only consumes until
 * a single newline is encountered.
 *
 * @param ctx  The tokenizer position; restored on failure
 * @return     Whether a newline was consumed
 */
static bool parse_newline(tok_ctx& ctx)
{
   ctx.save();

   /* Eat whitespace */
   for (int blank = ctx.peek(); (blank == ' ') || (blank == '\t'); blank = ctx.peek())
   {
      ctx.get();
   }

   const int ch = ctx.peek();

   if (ch == '\n')
   {
      ctx.get();
      return(true);
   }
   if (ch == '\r')
   {
      ctx.get();
      ctx.expect('\n'); /* CRLF counts as one line ending */
      return(true);
   }

   ctx.restore();
   return(false);
}
/**
 * Produces the next chunk while processing is switched off.
 *
 * Blank lines become a CT_NEWLINE chunk. Otherwise the rest of the line
 * becomes a CT_IGNORED chunk, unless it contains the enable-processing text;
 * in that case the line is re-read so that the comment carrying the text can
 * be parsed and processing switched back on.
 *
 * @param ctx  The tokenizer position
 * @param pc   The chunk to fill in
 * @return     Whether a chunk was produced
 */
static bool parse_ignored(tok_ctx& ctx, chunk_t& pc)
{
   /* Parse off newlines/blank lines */
   int blank_lines = 0;

   while (parse_newline(ctx))
   {
      blank_lines++;
   }
   if (blank_lines > 0)
   {
      pc.nl_count = blank_lines;
      pc.type     = CT_NEWLINE;
      return(true);
   }

   /* See if the UO_enable_processing_cmt text is on this line */
   ctx.save();
   pc.str.clear();
   while (ctx.more())
   {
      const int ch = ctx.peek();

      if ((ch == '\r') || (ch == '\n'))
      {
         break;
      }
      pc.str.append(ctx.get());
   }
   if (pc.str.size() == 0)
   {
      /* end of file? */
      return(false);
   }

   /* Note that we aren't actually making sure this is in a comment, yet */
   const char* ontext = cpd.settings[UO_enable_processing_cmt].str;
   if (ontext == NULL)
   {
      ontext = UNCRUSTIFY_ON_TEXT;
   }
   if (pc.str.find(ontext) < 0)
   {
      pc.type = CT_IGNORED;
      return(true);
   }

   /* The marker is on this line: go back and pick the line apart */
   ctx.restore();

   /* parse off whitespace leading to the comment */
   if (parse_whitespace(ctx, pc))
   {
      pc.type = CT_IGNORED;
      return(true);
   }

   /* Look for the ending comment and let it pass */
   if (parse_comment(ctx, pc) && !cpd.unc_off)
   {
      return(true);
   }

   /* Reset the chunk & scan to until a newline */
   pc.str.clear();
   while (ctx.more())
   {
      const int ch = ctx.peek();

      if ((ch == '\r') || (ch == '\n'))
      {
         break;
      }
      pc.str.append(ctx.get());
   }
   if (pc.str.size() > 0)
   {
      pc.type = CT_IGNORED;
      return(true);
   }
   return(false);
}
/**
* Parses the next token from the input and fills in the chunk.
*
* The sub-parsers are tried in a fixed order and the first one that matches
* wins, so the order of the checks below matters.
*
* pc.str is the output text
* pc.type is the output type
* pc.orig_line, pc.column and pc.orig_col are set from the position at which
* the token starts
*
* @param ctx The tokenizer position.
* @param pc The chunk to update.
* @return true/false - whether anything was parsed
*/
static bool parse_next(tok_ctx& ctx, chunk_t& pc)
{
const chunk_tag_t *punc;
int ch, ch1;
if (!ctx.more())
{
//fprintf(stderr, "All done!\n");
return(false);
}
/* Save off the current column */
pc.orig_line = ctx.c.row;
pc.column = ctx.c.col;
pc.orig_col = ctx.c.col;
pc.type = CT_NONE;
pc.nl_count = 0;
pc.flags = 0;
/* If it is turned off, parse_ignored() turns everything except newlines into CT_IGNORED */
if (cpd.unc_off)
{
if (parse_ignored(ctx, pc))
{
return(true);
}
}
/**
* Parse whitespace
*/
if (parse_whitespace(ctx, pc))
{
return(true);
}
/**
* Handle unknown/unhandled preprocessors
*/
if ((cpd.in_preproc > CT_PP_BODYCHUNK) &&
(cpd.in_preproc <= CT_PP_OTHER))
{
pc.str.clear();
/* ss always holds the position just before the most recently consumed character */
tok_info ss;
ctx.save(ss);
/* Chunk to a newline or comment */
pc.type = CT_PREPROC_BODY;
int last = 0;
while (ctx.more())
{
/* note: this 'ch' shadows the function-level 'ch' */
int ch = ctx.peek();
if ((ch == '\n') || (ch == '\r'))
{
/* Back off if this is an escaped newline */
if (last == '\\')
{
/* un-read the backslash so it is seen again as a backslash-newline */
ctx.restore(ss);
pc.str.pop_back();
}
break;
}
/* Quit on a C++ comment start */
if ((ch == '/') && (ctx.peek(1) == '/'))
{
break;
}
last = ch;
ctx.save(ss);
pc.str.append(ctx.get());
}
/* an empty body falls through to the regular parsers below */
if (pc.str.size() > 0)
{
return(true);
}
}
/**
* Detect backslash-newline
*/
if ((ctx.peek() == '\\') && parse_bs_newline(ctx, pc))
{
return(true);
}
/**
* Parse comments
*/
if (parse_comment(ctx, pc))
{
return(true);
}
/* Parse code placeholders */
if (parse_code_placeholder(ctx, pc))
{
return(true);
}
/* Check for C# literal strings, ie @"hello" and identifiers @for*/
if (((cpd.lang_flags & LANG_CS) != 0) && (ctx.peek() == '@'))
{
if (ctx.peek(1) == '"')
{
parse_cs_string(ctx, pc);
return(true);
}
/* check for non-keyword identifiers such as @if @switch, etc */
if (CharTable::IsKw1(ctx.peek(1)))
{
/* skipcheck=true: the word stays a CT_WORD, no keyword lookup */
parse_word(ctx, pc, true);
return(true);
}
}
/* handle VALA """ strings """ */
if (((cpd.lang_flags & LANG_VALA) != 0) &&
(ctx.peek() == '"') &&
(ctx.peek(1) == '"') &&
(ctx.peek(2) == '"'))
{
parse_verbatim_string(ctx, pc);
return true;
}
/* handle C++0x strings u8"x" u"x" U"x" R"x" u8R"XXX(I'm a "raw UTF-8" string.)XXX" */
ch = ctx.peek();
if (((cpd.lang_flags & LANG_CPP) != 0) &&
((ch == 'u') || (ch == 'U') || (ch == 'R')))
{
/* idx ends up as the offset of the opening quote, i.e. the prefix length */
int idx = 0;
bool is_real = false; /* set when the prefix contains the raw-string 'R' */
if ((ch == 'u') && (ctx.peek(1) == '8'))
{
idx = 2;
}
else if (unc_tolower(ch) == 'u')
{
idx++;
}
if (ctx.peek(idx) == 'R')
{
idx++;
is_real = true;
}
if (ctx.peek(idx) == '"')
{
if (is_real)
{
if (parse_cr_string(ctx, pc, idx))
{
return(true);
}
}
else
{
if (parse_string(ctx, pc, idx, true))
{
/* note: parse_string() has already called parse_suffix() once */
parse_suffix(ctx, pc, true);
return(true);
}
}
}
}
/* PAWN specific stuff */
if ((cpd.lang_flags & LANG_PAWN) != 0)
{
/* Check for PAWN strings: \"hi" or !"hi" or !\"hi" or \!"hi" */
if ((ctx.peek() == '\\') || (ctx.peek() == '!'))
{
if (ctx.peek(1) == '"')
{
parse_string(ctx, pc, 1, (ctx.peek() == '!'));
return(true);
}
else if (((ctx.peek(1) == '\\') || (ctx.peek(1) == '!')) &&
(ctx.peek(2) == '"'))
{
parse_string(ctx, pc, 2, false);
return(true);
}
}
}
/**
* Parse numbers
*/
if (parse_number(ctx, pc))
{
return(true);
}
/**
* Parse strings and character constants
*/
if ((cpd.lang_flags & LANG_D) != 0)
{
/* D specific stuff */
if (d_parse_string(ctx, pc))
{
return(true);
}
}
else
{
/* Not D stuff */
/* Check for L'a', L"abc", 'a', "abc", <abc> strings */
ch = ctx.peek();
ch1 = ctx.peek(1);
if ((((ch == 'L') || (ch == 'S')) &&
((ch1 == '"') || (ch1 == '\''))) ||
(ch == '"') ||
(ch == '\'') ||
((ch == '<') && (cpd.in_preproc == CT_PP_INCLUDE)))
{
/* an alphabetic first character is a one-character prefix before the quote */
parse_string(ctx, pc, unc_isalpha(ch) ? 1 : 0, true);
return(true);
}
if ((ch == '<') && (cpd.in_preproc == CT_PP_DEFINE))
{
if (chunk_get_tail()->type == CT_MACRO)
{
/* We have "#define XXX <", assume '<' starts an include string */
parse_string(ctx, pc, 0, false);
return(true);
}
}
}
/* Check for Objective C literals and VALA identifiers ('@1', '@if')*/
if ((cpd.lang_flags & (LANG_OC | LANG_VALA)) && (ctx.peek() == '@'))
{
int nc = ctx.peek(1);
if ((nc == '"') || (nc == '\''))
{
/* literal string */
parse_string(ctx, pc, 1, true);
return true;
}
else if ((nc >= '0') && (nc <= '9'))
{
/* literal number */
pc.str.append(ctx.get()); /* store the '@' */
parse_number(ctx, pc);
return true;
}
}
/* Check for pawn/ObjectiveC/Java and normal identifiers */
if (CharTable::IsKw1(ctx.peek()) ||
((ctx.peek() == '@') && CharTable::IsKw1(ctx.peek(1))))
{
parse_word(ctx, pc, false);
return(true);
}
/* see if we have a punctuator */
/* hand the next four characters to the lookup; the matched tag gives the length to consume */
char punc_txt[4];
punc_txt[0] = ctx.peek();
punc_txt[1] = ctx.peek(1);
punc_txt[2] = ctx.peek(2);
punc_txt[3] = ctx.peek(3);
if ((punc = find_punctuator(punc_txt, cpd.lang_flags)) != NULL)
{
int cnt = strlen(punc->tag);
while (cnt--)
{
pc.str.append(ctx.get());
}
pc.type = punc->type;
pc.flags |= PCF_PUNCTUATOR;
return(true);
}
/* throw away this character */
/* nothing recognized it: emit a one-character CT_UNKNOWN chunk and count an error */
pc.type = CT_UNKNOWN;
pc.str.append(ctx.get());
LOG_FMT(LWARN, "%s:%d Garbage in col %d: %x\n",
cpd.filename, pc.orig_line, (int)ctx.c.col, pc.str[0]);
cpd.error_count++;
return(true);
}
/**
* This function parses or tokenizes the whole buffer into a list.
* It has to do some tricks to parse preprocessors.
*
* If output_text() were called immediately after, two things would happen:
* - trailing whitespace are removed.
* - leading space & tabs are converted to the appropriate format.
*
* All the tokens are inserted before ref. If ref is NULL, they are inserted
* at the end of the list. Line numbers are relative to the start of the data.
*
* After the loop, cpd.newline is set from the UO_newlines option or, when
* that is LE_AUTO, from the most frequent line ending that was counted.
*
* @param data The characters to tokenize.
* @param ref The chunk to insert before, or NULL to append.
*/
void tokenize(const deque<int>& data, chunk_t *ref)
{
tok_ctx ctx(data);
chunk_t chunk; /* scratch chunk, reset and refilled for every token */
chunk_t *pc = NULL; /* the chunk most recently added to the list */
chunk_t *rprev = NULL;
struct parse_frame frm;
bool last_was_tab = false;
int prev_sp = 0;
memset(&frm, 0, sizeof(frm));
while (ctx.more())
{
chunk.reset();
if (!parse_next(ctx, chunk))
{
LOG_FMT(LERR, "%s:%d Bailed before the end?\n",
cpd.filename, ctx.c.row);
cpd.error_count++;
break;
}
/* Don't create an entry for whitespace */
if (chunk.type == CT_WHITESPACE)
{
/* remember what the next real chunk needs to know about the skipped blanks */
last_was_tab = chunk.after_tab;
prev_sp = chunk.orig_prev_sp;
continue;
}
chunk.orig_prev_sp = prev_sp;
prev_sp = 0;
if (chunk.type == CT_NEWLINE)
{
last_was_tab = chunk.after_tab;
chunk.after_tab = false;
chunk.str.clear();
}
else if (chunk.type == CT_NL_CONT)
{
last_was_tab = chunk.after_tab;
chunk.after_tab = false;
chunk.str = "\\\n";
}
else
{
chunk.after_tab = last_was_tab;
last_was_tab = false;
}
/* Strip trailing whitespace (for CPP comments and PP blocks) */
while ((chunk.str.size() > 0) &&
((chunk.str[chunk.str.size() - 1] == ' ') ||
(chunk.str[chunk.str.size() - 1] == '\t')))
{
// If comment contains backslash '\' followed by whitespace chars, keep last one;
// this will prevent it from turning '\' into line continuation.
if (chunk.str.size() > 1 && chunk.str[chunk.str.size() - 2] == '\\')
break;
chunk.str.pop_back();
}
/* Store off the end column */
chunk.orig_col_end = ctx.c.col;
/* Add the chunk to the list */
rprev = pc;
if (rprev != NULL)
{
/* NOTE(review): at this point pc and rprev are the same (previously added)
* chunk, so the flag copy below is a self-assignment and the newline test
* looks at the previous chunk, not the new one -- confirm the intent.
*/
pc->flags |= rprev->flags & PCF_COPY_FLAGS;
/* a newline can't be in a preprocessor */
if (pc->type == CT_NEWLINE)
{
pc->flags &= ~PCF_IN_PREPROC;
}
}
/* chunks added in front of an existing chunk are flagged as inserted */
if (ref != NULL)
{
chunk.flags |= PCF_INSERTED;
}
else
{
chunk.flags &= ~PCF_INSERTED;
}
pc = chunk_add_before(&chunk, ref);
/* A newline marks the end of a preprocessor */
if (pc->type == CT_NEWLINE) // || (pc->type == CT_COMMENT_MULTI))
{
cpd.in_preproc = CT_NONE;
cpd.preproc_ncnl_count = 0;
}
/* Special handling for preprocessor stuff */
if (cpd.in_preproc != CT_NONE)
{
pc->flags |= PCF_IN_PREPROC;
/* Count words after the preprocessor */
if (!chunk_is_comment(pc) && !chunk_is_newline(pc))
{
cpd.preproc_ncnl_count++;
}
/* Figure out the type of preprocessor for #include parsing */
if (cpd.in_preproc == CT_PREPROC)
{
/* the chunk right after the '#' names the directive; anything unrecognized becomes CT_PP_OTHER */
if ((pc->type < CT_PP_DEFINE) || (pc->type > CT_PP_OTHER))
{
set_chunk_type(pc, CT_PP_OTHER);
}
cpd.in_preproc = pc->type;
}
}
else
{
/* Check for a preprocessor start */
/* a '#' only starts a preprocessor line when it is the first chunk on the line */
if ((pc->type == CT_POUND) &&
((rprev == NULL) || (rprev->type == CT_NEWLINE)))
{
set_chunk_type(pc, CT_PREPROC);
pc->flags |= PCF_IN_PREPROC;
cpd.in_preproc = CT_PREPROC;
}
}
}
/* Set the cpd.newline string for this file */
/* in LE_AUTO mode ties are resolved in the order LF, CRLF, CR */
if ((cpd.settings[UO_newlines].le == LE_LF) ||
((cpd.settings[UO_newlines].le == LE_AUTO) &&
(cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CRLF]) &&
(cpd.le_counts[LE_LF] >= cpd.le_counts[LE_CR])))
{
/* LF line ends */
cpd.newline = "\n";
LOG_FMT(LLINEENDS, "Using LF line endings\n");
}
else if ((cpd.settings[UO_newlines].le == LE_CRLF) ||
((cpd.settings[UO_newlines].le == LE_AUTO) &&
(cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_LF]) &&
(cpd.le_counts[LE_CRLF] >= cpd.le_counts[LE_CR])))
{
/* CRLF line ends */
cpd.newline = "\r\n";
LOG_FMT(LLINEENDS, "Using CRLF line endings\n");
}
else
{
/* CR line ends */
cpd.newline = "\r";
LOG_FMT(LLINEENDS, "Using CR line endings\n");
}
}
// /**
// * A simplistic fixed-sized needle in the fixed-size haystack string search.
// */
// int str_find(const char *needle, int needle_len,
// const char *haystack, int haystack_len)
// {
// int idx;
//
// for (idx = 0; idx < (haystack_len - needle_len); idx++)
// {
// if (memcmp(needle, haystack + idx, needle_len) == 0)
// {
// return(idx);
// }
// }
// return(-1);
// }