| #! /usr/bin/env python |
| # tokenize.py |
| # |
# Parses a C/C++/C#/D/Java/Pawn/whatever file into an array of
# (string, type) tuples
| # |
| |
# punctuator lookup table
#
# The table is a flattened trie, walked by parse_punctuator().
# Each row is one trie node: [ ch, siblings_left, next_row, token ]
#   ch            - the character this row matches
#   siblings_left - number of alternative rows that follow at this
#                   level (0 = this is the last alternative)
#   next_row      - index of the first row to try for a longer
#                   punctuator extending the match (0 = no longer match)
#   token         - punctuator string recognized when this row matches
#                   (None = prefix only, e.g. '%:%' is not itself a token)
punc_table = [
   [ '!', 25, 26, '!' ], # 0: '!'
   [ '#', 24, 35, '#' ], # 1: '#'
   [ '$', 23, 0, '$' ], # 2: '$'
   [ '%', 22, 36, '%' ], # 3: '%'
   [ '&', 21, 41, '&' ], # 4: '&'
   [ '(', 20, 0, '(' ], # 5: '('
   [ ')', 19, 0, ')' ], # 6: ')'
   [ '*', 18, 43, '*' ], # 7: '*'
   [ '+', 17, 44, '+' ], # 8: '+'
   [ ',', 16, 0, ',' ], # 9: ','
   [ '-', 15, 46, '-' ], # 10: '-'
   [ '.', 14, 50, '.' ], # 11: '.'
   [ '/', 13, 53, '/' ], # 12: '/'
   [ ':', 12, 54, ':' ], # 13: ':'
   [ ';', 11, 0, ';' ], # 14: ';'
   [ '<', 10, 56, '<' ], # 15: '<'
   [ '=', 9, 63, '=' ], # 16: '='
   [ '>', 8, 65, '>' ], # 17: '>'
   [ '?', 7, 0, '?' ], # 18: '?'
   [ '[', 6, 70, '[' ], # 19: '['
   [ ']', 5, 0, ']' ], # 20: ']'
   [ '^', 4, 71, '^' ], # 21: '^'
   [ '{', 3, 0, '{' ], # 22: '{'
   [ '|', 2, 72, '|' ], # 23: '|'
   [ '}', 1, 0, '}' ], # 24: '}'
   [ '~', 0, 74, '~' ], # 25: '~'
   [ '<', 3, 30, '!<' ], # 26: '!<'
   [ '=', 2, 33, '!=' ], # 27: '!='
   [ '>', 1, 34, '!>' ], # 28: '!>'
   [ '~', 0, 0, '!~' ], # 29: '!~'
   [ '=', 1, 0, '!<=' ], # 30: '!<='
   [ '>', 0, 32, '!<>' ], # 31: '!<>'
   [ '=', 0, 0, '!<>='], # 32: '!<>='
   [ '=', 0, 0, '!==' ], # 33: '!=='
   [ '=', 0, 0, '!>=' ], # 34: '!>='
   [ '#', 0, 0, '##' ], # 35: '##'
   [ ':', 2, 39, '%:' ], # 36: '%:'
   [ '=', 1, 0, '%=' ], # 37: '%='
   [ '>', 0, 0, '%>' ], # 38: '%>'
   [ '%', 0, 40, None ], # 39: '%:%' (prefix of '%:%:' only)
   [ ':', 0, 0, '%:%:'], # 40: '%:%:'
   [ '&', 1, 0, '&&' ], # 41: '&&'
   [ '=', 0, 0, '&=' ], # 42: '&='
   [ '=', 0, 0, '*=' ], # 43: '*='
   [ '+', 1, 0, '++' ], # 44: '++'
   [ '=', 0, 0, '+=' ], # 45: '+='
   [ '-', 2, 0, '--' ], # 46: '--'
   [ '=', 1, 0, '-=' ], # 47: '-='
   [ '>', 0, 49, '->' ], # 48: '->'
   [ '*', 0, 0, '->*' ], # 49: '->*'
   [ '*', 1, 0, '.*' ], # 50: '.*'
   [ '.', 0, 52, '..' ], # 51: '..'
   [ '.', 0, 0, '...' ], # 52: '...'
   [ '=', 0, 0, '/=' ], # 53: '/='
   [ ':', 1, 0, '::' ], # 54: '::'
   [ '>', 0, 0, ':>' ], # 55: ':>'
   [ '%', 4, 0, '<%' ], # 56: '<%'
   [ ':', 3, 0, '<:' ], # 57: '<:'
   [ '<', 2, 61, '<<' ], # 58: '<<'
   [ '=', 1, 0, '<=' ], # 59: '<='
   [ '>', 0, 62, '<>' ], # 60: '<>'
   [ '=', 0, 0, '<<=' ], # 61: '<<='
   [ '=', 0, 0, '<>=' ], # 62: '<>='
   [ '=', 0, 64, '==' ], # 63: '=='
   [ '=', 0, 0, '===' ], # 64: '==='
   [ '=', 1, 0, '>=' ], # 65: '>='
   [ '>', 0, 67, '>>' ], # 66: '>>'
   [ '=', 1, 0, '>>=' ], # 67: '>>='
   [ '>', 0, 69, '>>>' ], # 68: '>>>'
   [ '=', 0, 0, '>>>='], # 69: '>>>='
   [ ']', 0, 0, '[]' ], # 70: '[]'
   [ '=', 0, 0, '^=' ], # 71: '^='
   [ '=', 1, 0, '|=' ], # 72: '|='
   [ '|', 0, 0, '||' ], # 73: '||'
   [ '=', 1, 0, '~=' ], # 74: '~='
   [ '~', 0, 0, '~~' ], # 75: '~~'
]
| |
| # |
| # Token types: |
| # 0 = newline |
| # 1 = punctuator |
| # 2 = integer |
| # 3 = float |
| # 4 = string |
| # 5 = identifier |
| # |
| |
class tokenizer:
    """Scans C-family source text into a list of (string, type) tuples.

    Token types:
      0 = newline
      1 = punctuator
      2 = integer
      3 = float
      4 = string
      5 = identifier

    Call tokenize_text(); the result is left in self.tokens.
    """

    def __init__(self):
        self.tokens = []    # list of (token_text, token_type) tuples
        self.text = ''      # the text currently being scanned
        self.text_idx = 0   # current scan position within self.text

    def tokenize_text(self, in_text):
        """Tokenize in_text, leaving the (text, type) list in self.tokens."""
        self.tokens = []
        self.text = in_text
        self.text_idx = 0

        print(in_text)
        try:
            while self.text_idx < len(self.text):
                if self.parse_whitespace():
                    continue
                elif (self.text[self.text_idx] == '\\' and
                      self.text_idx + 1 < len(self.text) and
                      self.text[self.text_idx + 1] == '\n'):
                    # Backslash-newline line continuation: skip both chars.
                    self.text_idx += 2
                    continue
                elif self.parse_comment():
                    continue
                elif self.parse_number():
                    continue
                # Strings must be tried before identifiers so that an
                # L"..." wide-string literal is one token, not 'L' + string.
                elif self.parse_string():
                    continue
                elif self.parse_identifier():
                    continue
                elif self.parse_punctuator():
                    continue
                else:
                    print('confused:', self.text[self.text_idx:])
                    break
        except Exception:
            print('bombed')
            raise

    def parse_whitespace(self):
        """Consume a run of whitespace.

        Emits one newline token (type 0) if the run contained any newline.
        Returns True if any characters were consumed.
        """
        start_idx = self.text_idx
        hit_newline = False
        while self.text_idx < len(self.text):
            ch = self.text[self.text_idx]
            if ch in '\n\r':
                hit_newline = True
            elif ch not in ' \t':
                break
            self.text_idx += 1

        if hit_newline:
            self.tokens.append(('\n', 0))
        return start_idx != self.text_idx

    def parse_comment(self):
        """Consume (and discard) a '//' line or '/* */' block comment.

        Returns True if a comment was consumed; no token is emitted.
        """
        if (self.text[self.text_idx] != '/' or
                self.text_idx + 1 >= len(self.text) or  # lone '/' at EOF
                self.text[self.text_idx + 1] not in '/*'):
            return False
        if self.text[self.text_idx + 1] == '/':
            # Line comment: stop at, but do not consume, the newline.
            while self.text_idx < len(self.text):
                if self.text[self.text_idx] in '\n\r':
                    break
                self.text_idx += 1
        else:
            # Block comment: scan forward for the closing '*/'.
            while self.text_idx < len(self.text) - 1:
                if (self.text[self.text_idx] == '*' and
                        self.text[self.text_idx + 1] == '/'):
                    self.text_idx += 2
                    break
                self.text_idx += 1
        return True

    def parse_identifier(self):
        """Consume an identifier token (type 5); returns True on success."""
        first_chars = '@_ABCDEFGHIJKLMNOPQRSTUVWXYZ'
        if self.text[self.text_idx].upper() not in first_chars:
            return False
        start_idx = self.text_idx
        body_chars = first_chars + '1234567890'
        while (self.text_idx < len(self.text) and
               self.text[self.text_idx].upper() in body_chars):
            self.text_idx += 1
        self.tokens.append((self.text[start_idx:self.text_idx], 5))
        return True

    def parse_string(self):
        """Consume a quoted string or char literal (type 4).

        Handles an optional L prefix (wide literal) and backslash
        escapes.  Returns True on success.
        """
        prefix_len = 0
        quote = self.text[self.text_idx]
        if quote == 'L' and self.text_idx + 1 < len(self.text):
            prefix_len = 1
            quote = self.text[self.text_idx + 1]
        if quote not in '"\'':
            return False
        start_idx = self.text_idx
        self.text_idx += prefix_len + 1
        escaped = False
        while self.text_idx < len(self.text):
            if escaped:
                escaped = False
            elif self.text[self.text_idx] == '\\':
                escaped = True
            elif self.text[self.text_idx] == quote:
                self.text_idx += 1  # include the closing quote
                break
            self.text_idx += 1

        self.tokens.append((self.text[start_idx:self.text_idx], 4))
        return True

    def parse_punctuator(self):
        """Consume the longest punctuator (type 1) found in punc_table.

        Each punc_table row is [ch, siblings_left, next_row, token];
        the rows form a flattened trie.  Returns True on success.
        """
        tab_idx = 0
        saved_punc = None
        saved_end = self.text_idx  # position after the longest full match
        while True:
            pte = punc_table[tab_idx]
            if (self.text_idx < len(self.text) and
                    pte[0] == self.text[self.text_idx]):
                self.text_idx += 1
                if pte[3] is not None:
                    # Remember the longest complete punctuator seen so far.
                    saved_punc = pte[3]
                    saved_end = self.text_idx
                tab_idx = pte[2]
                if tab_idx == 0:
                    break
            elif pte[1] == 0:
                break
            else:
                tab_idx += 1
        # Back up over any chars consumed by a longer match attempt that
        # failed (e.g. '%:%' not followed by ':').
        self.text_idx = saved_end
        if saved_punc is not None:
            self.tokens.append((saved_punc, 1))
            return True
        return False

    def parse_number(self):
        """Consume an integer (type 2) or float (type 3) literal.

        A number must start with a digit or a dot followed by a digit.
        Returns True on success.
        """
        text = self.text
        tlen = len(text)
        ch = text[self.text_idx]
        if ch not in '0123456789' and not (
                ch == '.' and
                self.text_idx + 1 < tlen and
                text[self.text_idx + 1] in '0123456789'):
            return False
        token_type = 2  # integer
        if ch == '.':
            token_type = 3  # float
        did_hex = False
        start_idx = self.text_idx

        # Check for Hex, Octal, or Binary
        # Note that only D and Pawn support binary, but who cares?
        if ch == '0' and self.text_idx + 1 < tlen:
            self.text_idx += 1
            ch = text[self.text_idx].upper()
            if ch == 'X':  # hex
                did_hex = True
                self.text_idx += 1
                while (self.text_idx < tlen and
                       text[self.text_idx] in '_0123456789abcdefABCDEF'):
                    self.text_idx += 1
            elif ch == 'B':  # binary
                self.text_idx += 1
                while self.text_idx < tlen and text[self.text_idx] in '_01':
                    self.text_idx += 1
            elif '0' <= ch <= '7':  # octal (but allow decimal digits)
                self.text_idx += 1
                while (self.text_idx < tlen and
                       text[self.text_idx] in '_0123456789'):
                    self.text_idx += 1
            else:
                # either just 0 or 0.1 or 0UL, etc
                pass
        else:
            # Regular int or float
            while self.text_idx < tlen and text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check if we stopped on a decimal point
        if self.text_idx < tlen and text[self.text_idx] == '.':
            self.text_idx += 1
            token_type = 3  # float
            frac_digits = ('_0123456789abcdefABCDEF' if did_hex
                           else '_0123456789')
            while self.text_idx < tlen and text[self.text_idx] in frac_digits:
                self.text_idx += 1

        # Check exponent
        # Valid exponents per language (not that it matters):
        #   C/C++/D/Java: eEpP
        #   C#/Pawn:      eE
        if self.text_idx < tlen and text[self.text_idx] in 'eEpP':
            token_type = 3  # float
            self.text_idx += 1
            if self.text_idx < tlen and text[self.text_idx] in '+-':
                self.text_idx += 1
            while self.text_idx < tlen and text[self.text_idx] in '_0123456789':
                self.text_idx += 1

        # Check the suffixes
        # Valid suffixes per language (not that it matters):
        #         Integer  Float
        #  C/C++:  uUlL     lLfF
        #  C#:     uUlL     fFdDMm
        #  D:      uUL      ifFL
        #  Java:   lL       fFdD
        #  Pawn:   (none)   (none)
        #
        # Note that i, f, d, and m only appear in floats.
        while self.text_idx < tlen:
            suffix = text[self.text_idx]
            if suffix in 'iIfFdDmM':
                token_type = 3  # float
            elif suffix not in 'lLuU':
                break
            self.text_idx += 1

        self.tokens.append((text[start_idx:self.text_idx], token_type))
        return True
| |
# Sample input exercising numbers, punctuators, strings, escapes,
# comments, and a line continuation.
text = """
1.23+4-3*16%2 *sin(1.e-3 + .5p32) "hello" and "hello\\"there"
123 // some comment
a = b + c;
#define abc \\
5
d = 5 /* hello */ + 3;
"""

# Demo: tokenize the sample text and dump the (string, type) list.
t = tokenizer()
t.tokenize_text(text)
# print() call form works on both Python 2 and Python 3.
print(t.tokens)
| |