blob: b1c8cde2a17201a1796cdf4191093581d01f2881 [file] [log] [blame]
# -*- coding: utf-8 -*-
"""
CQL pygments lexer
~~~~~~~~~~~~~~~~~~
Lexer for the Cassandra Query Language (CQL).
This is heavily inspired from the pygments SQL lexer (and the Postgres one in particular) but adapted to CQL
keywords and specificities.
TODO: This has been hacked quickly, but once it's more tested, we could submit it upstream.
In particular, we have alot of keywords whose meaning depends on the context and we could potentially improve
their handling. For instance, SET is a keyword, but also a type name (that's why currently we also consider
map and list as keywords, not types; we could disambiguate by looking if there is a '<' afterwards). Or things
like USERS, which can is used in some documentation example as a table name but is a keyword too (we could
only consider it a keyword if after LIST for instance). Similarly, type nanes are not reserved, so they and
are sometime used as column identifiers (also, timestamp is both a type and a keyword). I "think" we can
somewhat disambiguate through "states", but unclear how far it's worth going.
We could also add the predefined functions?
"""
import re
from pygments.lexer import Lexer, RegexLexer, do_insertions, bygroups, words
from pygments.token import Punctuation, Whitespace, Error, \
Text, Comment, Operator, Keyword, Name, String, Number, Generic, Literal
from pygments.lexers import get_lexer_by_name, ClassNotFound
from pygments.util import iteritems
__all__ = [ 'CQLLexer' ]
language_re = re.compile(r"\s+LANGUAGE\s+'?(\w+)'?", re.IGNORECASE)
KEYWORDS = (
'SELECT',
'FROM',
'AS',
'WHERE',
'AND',
'KEY',
'KEYS',
'ENTRIES',
'FULL',
'INSERT',
'UPDATE',
'WITH',
'LIMIT',
'PER',
'PARTITION',
'USING',
'USE',
'DISTINCT',
'COUNT',
'SET',
'BEGIN',
'UNLOGGED',
'BATCH',
'APPLY',
'TRUNCATE',
'DELETE',
'IN',
'CREATE',
'KEYSPACE',
'SCHEMA',
'KEYSPACES',
'COLUMNFAMILY',
'TABLE',
'MATERIALIZED',
'VIEW',
'INDEX',
'CUSTOM',
'ON',
'TO',
'DROP',
'PRIMARY',
'INTO',
'VALUES',
'TIMESTAMP',
'TTL',
'CAST',
'ALTER',
'RENAME',
'ADD',
'TYPE',
'COMPACT',
'STORAGE',
'ORDER',
'BY',
'ASC',
'DESC',
'ALLOW',
'FILTERING',
'IF',
'IS',
'CONTAINS',
'GRANT',
'ALL',
'PERMISSION',
'PERMISSIONS',
'OF',
'REVOKE',
'MODIFY',
'AUTHORIZE',
'DESCRIBE',
'EXECUTE',
'NORECURSIVE',
'MBEAN',
'MBEANS',
'USER',
'USERS',
'ROLE',
'ROLES',
'SUPERUSER',
'NOSUPERUSER',
'PASSWORD',
'LOGIN',
'NOLOGIN',
'OPTIONS',
'CLUSTERING',
'TOKEN',
'WRITETIME',
'NULL',
'NOT',
'EXISTS',
'MAP',
'LIST',
'NAN',
'INFINITY',
'TUPLE',
'TRIGGER',
'STATIC',
'FROZEN',
'FUNCTION',
'FUNCTIONS',
'AGGREGATE',
'SFUNC',
'STYPE',
'FINALFUNC',
'INITCOND',
'RETURNS',
'CALLED',
'INPUT',
'LANGUAGE',
'OR',
'REPLACE',
'JSON',
'LIKE',
)
DATATYPES = (
'ASCII',
'BIGINT',
'BLOB',
'BOOLEAN',
'COUNTER',
'DATE',
'DECIMAL',
'DOUBLE',
'EMPTY',
'FLOAT',
'INET',
'INT',
'SMALLINT',
'TEXT',
'TIME',
'TIMESTAMP',
'TIMEUUID',
'TINYINT',
'UUID',
'VARCHAR',
'VARINT',
)
def language_callback(lexer, match):
"""Parse the content of a $-string using a lexer
The lexer is chosen looking for a nearby LANGUAGE or assumed as
java if no LANGUAGE has been found.
"""
l = None
m = language_re.match(lexer.text[max(0, match.start()-100):match.start()])
if m is not None:
l = lexer._get_lexer(m.group(1))
else:
l = lexer._get_lexer('java')
# 1 = $, 2 = delimiter, 3 = $
yield (match.start(1), String, match.group(1))
yield (match.start(2), String.Delimiter, match.group(2))
yield (match.start(3), String, match.group(3))
# 4 = string contents
if l:
for x in l.get_tokens_unprocessed(match.group(4)):
yield x
else:
yield (match.start(4), String, match.group(4))
# 5 = $, 6 = delimiter, 7 = $
yield (match.start(5), String, match.group(5))
yield (match.start(6), String.Delimiter, match.group(6))
yield (match.start(7), String, match.group(7))
class CQLLexer(RegexLexer):
"""
Lexer for the Cassandra Query Language.
"""
name = 'Cassandra Query Language'
aliases = ['cql']
filenames = ['*.cql']
mimetypes = ['text/x-cql']
flags = re.IGNORECASE
tokens = {
'root': [
(r'\s+', Text),
(r'--.*\n?', Comment.Single),
(r'//.*\n?', Comment.Single),
(r'/\*', Comment.Multiline, 'multiline-comments'),
(r'(' + '|'.join(s.replace(" ", "\s+")
for s in DATATYPES)
+ r')\b', Name.Builtin),
(words(KEYWORDS, suffix=r'\b'), Keyword),
(r'[+*/<>=~!@#%^&|`?-]+', Operator),
(r'\$\d+', Name.Variable),
# Using Number instead of the more accurate Literal because the latter don't seem to e highlighted in most
# styles
(r'[0-9a-fA-F]{8}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{4}-[0-9a-fA-F]{12}', Number), # UUIDs
(r'0x[0-9a-fA-F]+', Number), # Blobs
(r'([0-9]*\.[0-9]*|[0-9]+)(e[+-]?[0-9]+)?', Number.Float),
(r'[0-9]+', Number.Integer),
(r"((?:E|U&)?)(')", bygroups(String.Affix, String.Single), 'string'),
# quoted identifier
(r'((?:U&)?)(")', bygroups(String.Affix, String.Name), 'quoted-ident'),
(r'(?s)(\$)([^$]*)(\$)(.*?)(\$)(\2)(\$)', language_callback),
(r'[a-z_]\w*', Name),
(r'[;:()\[\]{},.]', Punctuation),
],
'multiline-comments': [
(r'/\*', Comment.Multiline, 'multiline-comments'),
(r'\*/', Comment.Multiline, '#pop'),
(r'[^/*]+', Comment.Multiline),
(r'[/*]', Comment.Multiline)
],
'string': [
(r"[^']+", String.Single),
(r"''", String.Single),
(r"'", String.Single, '#pop'),
],
'quoted-ident': [
(r'[^"]+', String.Name),
(r'""', String.Name),
(r'"', String.Name, '#pop'),
],
}
def get_tokens_unprocessed(self, text, *args):
# Have a copy of the entire text to be used by `language_callback`.
self.text = text
for x in RegexLexer.get_tokens_unprocessed(self, text, *args):
yield x
def _get_lexer(self, lang):
return get_lexer_by_name(lang, **self.options)