doc/source/_util/cql.py - cassandra - Git at Google

 # -*- coding: utf-8 -*-
 """
     CQL pygments lexer
     ~~~~~~~~~~~~~~~~~~

     Lexer for the Cassandra Query Language (CQL).

     This is heavily inspired from the pygments SQL lexer (and the Postgres one in particular) but adapted to CQL
     keywords and specificities.

     TODO: This has been hacked quickly, but once it's more tested, we could submit it upstream.
           In particular, we have alot of keywords whose meaning depends on the context and we could potentially improve
           their handling. For instance, SET is a keyword, but also a type name (that's why currently we also consider
           map and list as keywords, not types; we could disambiguate by looking if there is a '<' afterwards). Or things
           like USERS, which can is used in some documentation example as a table name but is a keyword too (we could
           only consider it a keyword if after LIST for instance). Similarly, type nanes are not reserved, so they and
           are sometime used as column identifiers (also, timestamp is both a type and a keyword). I "think" we can
           somewhat disambiguate through "states", but unclear how far it's worth going.

           We could also add the predefined functions?
 """

 import re

 from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, words
 from pygments.token import Punctuation, Whitespace, Error, \
     Text, Comment, Operator, Keyword, Name, String, Number, Generic, Literal
 from pygments.lexers import get_lexer_by_name, ClassNotFound
 from pygments.util import iteritems

 __all__ = [ 'CQLLexer' ]

 language_re = re.compile(r"\s+LANGUAGE\s+'?(\w+)'?", re.IGNORECASE)

 KEYWORDS = (
     'SELECT',
     'FROM',
     'AS',
     'WHERE',
     'AND',
     'KEY',
     'KEYS',
     'ENTRIES',
     'FULL',
     'INSERT',
     'UPDATE',
     'WITH',
     'LIMIT',
     'PER',
     'PARTITION',
     'USING',
     'USE',
     'DISTINCT',
     'COUNT',
     'SET',
     'BEGIN',
     'UNLOGGED',
     'BATCH',
     'APPLY',
     'TRUNCATE',
     'DELETE',
     'IN',
     'CREATE',
     'KEYSPACE',
     'SCHEMA',
     'KEYSPACES',
     'COLUMNFAMILY',
     'TABLE',
     'MATERIALIZED',
     'VIEW',
     'INDEX',
     'CUSTOM',
     'ON',
     'TO',
     'DROP',
     'PRIMARY',
     'INTO',
     'VALUES',
     'TIMESTAMP',
     'TTL',
     'CAST',
     'ALTER',
     'RENAME',
     'ADD',
     'TYPE',
     'COMPACT',
     'STORAGE',
     'ORDER',
     'BY',
     'ASC',
     'DESC',
     'ALLOW',
     'FILTERING',
     'IF',
     'IS',
     'CONTAINS',
     'GRANT',
     'ALL',
     'PERMISSION',
     'PERMISSIONS',
     'OF',
     'REVOKE',
     'MODIFY',
     'AUTHORIZE',
     'DESCRIBE',
     'EXECUTE',
     'NORECURSIVE',
     'MBEAN',
     'MBEANS',
     'USER',
     'USERS',
     'ROLE',
     'ROLES',
     'SUPERUSER',
     'NOSUPERUSER',
     'PASSWORD',
     'LOGIN',
     'NOLOGIN',
     'OPTIONS',
     'CLUSTERING',
     'TOKEN',
     'WRITETIME',
     'NULL',
     'NOT',
     'EXISTS',
     'MAP',
     'LIST',
     'NAN',
     'INFINITY',
     'TUPLE',
     'TRIGGER',
     'STATIC',
     'FROZEN',
     'FUNCTION',
     'FUNCTIONS',
     'AGGREGATE',
     'SFUNC',
     'STYPE',
     'FINALFUNC',
     'INITCOND',
     'RETURNS',
     'CALLED',
     'INPUT',
     'LANGUAGE',
     'OR',
     'REPLACE',
     'JSON',
     'LIKE',
 )

 DATATYPES = (
     'ASCII',
     'BIGINT',
     'BLOB',
     'BOOLEAN',
     'COUNTER',
     'DATE',
     'DECIMAL',
     'DOUBLE',
     'EMPTY',
     'FLOAT',
     'INET',
     'INT',
     'SMALLINT',
     'TEXT',
     'TIME',
     'TIMESTAMP',
     'TIMEUUID',
     'TINYINT',
     'UUID',
     'VARCHAR',
     'VARINT',
 )

 def language_callback(lexer, match):
     """Parse the content of a $-string using a lexer

     The lexer is chosen looking for a nearby LANGUAGE or assumed as
     java if no LANGUAGE has been found.
     """
     l = None
     m = language_re.match(lexer.text[max(0, match.start()-100):match.start()])
     if m is not None:
         l = lexer._get_lexer(m.group(1))
     else:
         l = lexer._get_lexer('java')

     # 1 = $, 2 = delimiter, 3 = $
     yield (match.start(1), String, match.group(1))
     yield (match.start(2), String.Delimiter, match.group(2))
     yield (match.start(3), String, match.group(3))
     # 4 = string contents
     if l:
         for x in l.get_tokens_unprocessed(match.group(4)):
             yield x
     else:
         yield (match.start(4), String, match.group(4))
     # 5 = $, 6 = delimiter, 7 = $
     yield (match.start(5), String, match.group(5))
     yield (match.start(6), String.Delimiter, match.group(6))
     yield (match.start(7), String, match.group(7))


 class CQLLexer(RegexLexer):
     """
     Lexer for the Cassandra Query Language.
     """

     name = 'Cassandra Query Language'
     aliases = ['cql']
     filenames = ['*.cql']
     mimetypes = ['text/x-cql']

     flags = re.IGNORECASE
     tokens = {
         'root': [
             (r'\s+', Text),
             (r'--.*\n?', Comment.Single),
             (r'//.*\n?', Comment.Single),
             (r'/\*', Comment.Multiline, 'multiline-comments'),
             (r'(' + '|'.join(s.replace(" ", "\s+")
                              for s in DATATYPES)
              + r')\b', Name.Builtin),
             (words(KEYWORDS, suffix=r'\b'), Keyword),
             (r'[+*/<>=~!@#%^&|`?-]+', Operator),
             (r'\$\d+', Name.Variable),

             # Using Number instead of the more accurate Literal because the latter don't seem to e highlighted in most
             # styles
             (r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', Number), # UUIDs
             (r'0x[0-9a-fA-F]+', Number), # Blobs

             (r'([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?', Number.Float),
             (r'[0-9]+', Number.Integer),
             (r"((?:E|U&)?)(')", bygroups(String.Affix, String.Single), 'string'),
             # quoted identifier
             (r'((?:U&)?)(")', bygroups(String.Affix, String.Name), 'quoted-ident'),
             (r'(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)', language_callback),
             (r'[a-z_]\w*', Name),
             (r'[;:()\[\]{},.]', Punctuation),
         ],
         'multiline-comments': [
             (r'/\*', Comment.Multiline, 'multiline-comments'),
             (r'\*/', Comment.Multiline, '#pop'),
             (r'[^/*]+', Comment.Multiline),
             (r'[/*]', Comment.Multiline)
         ],
         'string': [
             (r"[^']+", String.Single),
             (r"''", String.Single),
             (r"'", String.Single, '#pop'),
         ],
         'quoted-ident': [
             (r'[^"]+', String.Name),
             (r'""', String.Name),
             (r'"', String.Name, '#pop'),
         ],
     }

     def get_tokens_unprocessed(self, text, *args):
         # Have a copy of the entire text to be used by `language_callback`.
         self.text = text
         for x in RegexLexer.get_tokens_unprocessed(self, text, *args):
             yield x

     def _get_lexer(self, lang):
         return get_lexer_by_name(lang, **self.options)
	# -- coding: utf-8 --
	"""
	CQL pygments lexer
	~~~~~~~~~~~~~~~~~~

	Lexer for the Cassandra Query Language (CQL).

	This is heavily inspired from the pygments SQL lexer (and the Postgres one in particular) but adapted to CQL
	keywords and specificities.

	TODO: This has been hacked quickly, but once it's more tested, we could submit it upstream.
	In particular, we have alot of keywords whose meaning depends on the context and we could potentially improve
	their handling. For instance, SET is a keyword, but also a type name (that's why currently we also consider
	map and list as keywords, not types; we could disambiguate by looking if there is a '<' afterwards). Or things
	like USERS, which can is used in some documentation example as a table name but is a keyword too (we could
	only consider it a keyword if after LIST for instance). Similarly, type nanes are not reserved, so they and
	are sometime used as column identifiers (also, timestamp is both a type and a keyword). I "think" we can
	somewhat disambiguate through "states", but unclear how far it's worth going.

	We could also add the predefined functions?
	"""

	import re

	from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, words
	from pygments.token import Punctuation, Whitespace, Error, \
	Text, Comment, Operator, Keyword, Name, String, Number, Generic, Literal
	from pygments.lexers import get_lexer_by_name, ClassNotFound
	from pygments.util import iteritems

	__all__ = [ 'CQLLexer' ]

	language_re = re.compile(r"\s+LANGUAGE\s+'?(\w+)'?", re.IGNORECASE)

	KEYWORDS = (
	'SELECT',
	'FROM',
	'AS',
	'WHERE',
	'AND',
	'KEY',
	'KEYS',
	'ENTRIES',
	'FULL',
	'INSERT',
	'UPDATE',
	'WITH',
	'LIMIT',
	'PER',
	'PARTITION',
	'USING',
	'USE',
	'DISTINCT',
	'COUNT',
	'SET',
	'BEGIN',
	'UNLOGGED',
	'BATCH',
	'APPLY',
	'TRUNCATE',
	'DELETE',
	'IN',
	'CREATE',
	'KEYSPACE',
	'SCHEMA',
	'KEYSPACES',
	'COLUMNFAMILY',
	'TABLE',
	'MATERIALIZED',
	'VIEW',
	'INDEX',
	'CUSTOM',
	'ON',
	'TO',
	'DROP',
	'PRIMARY',
	'INTO',
	'VALUES',
	'TIMESTAMP',
	'TTL',
	'CAST',
	'ALTER',
	'RENAME',
	'ADD',
	'TYPE',
	'COMPACT',
	'STORAGE',
	'ORDER',
	'BY',
	'ASC',
	'DESC',
	'ALLOW',
	'FILTERING',
	'IF',
	'IS',
	'CONTAINS',
	'GRANT',
	'ALL',
	'PERMISSION',
	'PERMISSIONS',
	'OF',
	'REVOKE',
	'MODIFY',
	'AUTHORIZE',
	'DESCRIBE',
	'EXECUTE',
	'NORECURSIVE',
	'MBEAN',
	'MBEANS',
	'USER',
	'USERS',
	'ROLE',
	'ROLES',
	'SUPERUSER',
	'NOSUPERUSER',
	'PASSWORD',
	'LOGIN',
	'NOLOGIN',
	'OPTIONS',
	'CLUSTERING',
	'TOKEN',
	'WRITETIME',
	'NULL',
	'NOT',
	'EXISTS',
	'MAP',
	'LIST',
	'NAN',
	'INFINITY',
	'TUPLE',
	'TRIGGER',
	'STATIC',
	'FROZEN',
	'FUNCTION',
	'FUNCTIONS',
	'AGGREGATE',
	'SFUNC',
	'STYPE',
	'FINALFUNC',
	'INITCOND',
	'RETURNS',
	'CALLED',
	'INPUT',
	'LANGUAGE',
	'OR',
	'REPLACE',
	'JSON',
	'LIKE',
	)

	DATATYPES = (
	'ASCII',
	'BIGINT',
	'BLOB',
	'BOOLEAN',
	'COUNTER',
	'DATE',
	'DECIMAL',
	'DOUBLE',
	'EMPTY',
	'FLOAT',
	'INET',
	'INT',
	'SMALLINT',
	'TEXT',
	'TIME',
	'TIMESTAMP',
	'TIMEUUID',
	'TINYINT',
	'UUID',
	'VARCHAR',
	'VARINT',
	)

	def language_callback(lexer, match):
	"""Parse the content of a $-string using a lexer

	The lexer is chosen looking for a nearby LANGUAGE or assumed as
	java if no LANGUAGE has been found.
	"""
	l = None
	m = language_re.match(lexer.text[max(0, match.start()-100):match.start()])
	if m is not None:
	l = lexer._get_lexer(m.group(1))
	else:
	l = lexer._get_lexer('java')

	# 1 = $, 2 = delimiter, 3 = $
	yield (match.start(1), String, match.group(1))
	yield (match.start(2), String.Delimiter, match.group(2))
	yield (match.start(3), String, match.group(3))
	# 4 = string contents
	if l:
	for x in l.get_tokens_unprocessed(match.group(4)):
	yield x
	else:
	yield (match.start(4), String, match.group(4))
	# 5 = $, 6 = delimiter, 7 = $
	yield (match.start(5), String, match.group(5))
	yield (match.start(6), String.Delimiter, match.group(6))
	yield (match.start(7), String, match.group(7))


	class CQLLexer(RegexLexer):
	"""
	Lexer for the Cassandra Query Language.
	"""

	name = 'Cassandra Query Language'
	aliases = ['cql']
	filenames = ['*.cql']
	mimetypes = ['text/x-cql']

	flags = re.IGNORECASE
	tokens = {
	'root': [
	(r'\s+', Text),
	(r'--.*\n?', Comment.Single),
	(r'//.*\n?', Comment.Single),
	(r'/\*', Comment.Multiline, 'multiline-comments'),
	(r'(' + '\|'.join(s.replace(" ", "\s+")
	for s in DATATYPES)
	+ r')\b', Name.Builtin),
	(words(KEYWORDS, suffix=r'\b'), Keyword),
	(r'[+*/<>=~!@#%^&\|`?-]+', Operator),
	(r'\$\d+', Name.Variable),

	# Using Number instead of the more accurate Literal because the latter don't seem to e highlighted in most
	# styles
	(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', Number), # UUIDs
	(r'0x[0-9a-fA-F]+', Number), # Blobs

	(r'([0-9]\.[0-9]\|[0-9]+)(e[+-]?[0-9]+)?', Number.Float),
	(r'[0-9]+', Number.Integer),
	(r"((?:E\|U&)?)(')", bygroups(String.Affix, String.Single), 'string'),
	# quoted identifier
	(r'((?:U&)?)(")', bygroups(String.Affix, String.Name), 'quoted-ident'),
	(r'(?s)(\$)([^$])(\$)(.?)(\$)(\2)(\$)', language_callback),
	(r'[a-z_]\w*', Name),
	(r'[;:()\[\]{},.]', Punctuation),
	],
	'multiline-comments': [
	(r'/\*', Comment.Multiline, 'multiline-comments'),
	(r'\*/', Comment.Multiline, '#pop'),
	(r'[^/*]+', Comment.Multiline),
	(r'[/*]', Comment.Multiline)
	],
	'string': [
	(r"[^']+", String.Single),
	(r"''", String.Single),
	(r"'", String.Single, '#pop'),
	],
	'quoted-ident': [
	(r'[^"]+', String.Name),
	(r'""', String.Name),
	(r'"', String.Name, '#pop'),
	],
	}

	def get_tokens_unprocessed(self, text, *args):
	# Have a copy of the entire text to be used by `language_callback`.
	self.text = text
	for x in RegexLexer.get_tokens_unprocessed(self, text, *args):
	yield x

	def _get_lexer(self, lang):
	return get_lexer_by_name(lang, **self.options)