#!/usr/bin/python
# coding: utf-8

"""Lexical analysis of formal languages (i.e. code) using Pygments."""

# :Author: Georg Brandl; Felix Wiemann; Günter Milde
# :Date: $Date: 2021-10-22 18:39:59 +0200 (Fri, 22 Oct 2021) $
# :Copyright: This module has been placed in the public domain.

from docutils import ApplicationError

try:
    import pygments
    from pygments.lexers import get_lexer_by_name
    from pygments.formatters.html import _get_ttype_class
    with_pygments = True
except ImportError:
    with_pygments = False
# Filter the following token types from the list of class arguments:
unstyled_tokens = ['token',  # Token (base token type)
                   'text',   # Token.Text
                   '']       # short name for Token and Text
# (Add, e.g., Token.Punctuation with
# ``unstyled_tokens.append('punctuation')``.)

class LexerError(ApplicationError):
    """Exception raised when lexical analysis of `code` fails."""

class Lexer(object):
    """Parse `code` lines and yield "classified" tokens.

    Arguments

      code -- string of source code to parse,
      language -- formal language the code is written in,
      tokennames -- either 'long', 'short', or 'none' (see below).

    Merge subsequent tokens of the same token-type.

    Iterating over an instance yields the tokens as ``(tokentype, value)``
    tuples. The value of `tokennames` configures the naming of the tokentype:

      'long':  downcased full token type name,
      'short': short name defined by pygments.token.STANDARD_TYPES
               (= class argument used in Pygments HTML output),
      'none':  skip lexical analysis.
    """
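
    # Example (illustrative sketch; the exact classes depend on the
    # installed Pygments version, e.g. recent versions report whitespace
    # as (['w'], ' ') rather than ([], ' ')):
    #
    #   list(Lexer('x = 1', 'python', 'short'))
    #   -> [(['n'], 'x'), ([], ' '), (['o'], '='), ([], ' '), (['mi'], '1')]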

    def __init__(self, code, language, tokennames='short'):
        """Set up a lexical analyzer for `code` in `language`."""
        self.code = code
        self.language = language
        self.tokennames = tokennames
        self.lexer = None
        # get lexical analyzer for `language`:
        if language in ('', 'text') or tokennames == 'none':
            return
        if not with_pygments:
            raise LexerError('Cannot analyze code. '
                             'Pygments package not found.')
        try:
            self.lexer = get_lexer_by_name(self.language)
        except pygments.util.ClassNotFound:
            raise LexerError('Cannot analyze code. '
                             'No Pygments lexer found for "%s".' % language)
        # Since version 1.2 (released Jan 01, 2010), Pygments has a
        # "tokenmerge" filter: ``self.merge(tokens)`` in `__iter__` could be
        # replaced by ``self.lexer.add_filter('tokenmerge')`` here.
        # However, `merge` below also strips the final newline added by
        # Pygments.

    def merge(self, tokens):
        """Merge subsequent tokens of same token-type.

        Also strip the final newline (added by Pygments).
        """
        tokens = iter(tokens)
        try:
            (lasttype, lastval) = next(tokens)
        except StopIteration:
            # empty token stream: nothing to yield
            # (a bare `next` here would raise RuntimeError, cf. PEP 479)
            return
        for ttype, value in tokens:
            if ttype is lasttype:
                lastval += value
            else:
                yield (lasttype, lastval)
                (lasttype, lastval) = (ttype, value)
        if lastval.endswith('\n'):
            lastval = lastval[:-1]
        if lastval:
            yield (lasttype, lastval)
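
    # For example (illustrative), the Pygments output
    #   [(Token.Text, 'a'), (Token.Text, 'b'), (Token.Text, '\n')]
    # is merged into the single token (Token.Text, 'ab'); the trailing
    # newline added by Pygments is stripped.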

    def __iter__(self):
        """Parse self.code and yield "classified" tokens."""
        if self.lexer is None:
            # lexical analysis is disabled:
            # return the whole code as one unclassified token
            yield ([], self.code)
            return
        tokens = pygments.lex(self.code, self.lexer)
        for tokentype, value in self.merge(tokens):
            if self.tokennames == 'long':  # long CSS class args
                classes = str(tokentype).lower().split('.')
            else:  # short CSS class args
                classes = [_get_ttype_class(tokentype)]
            classes = [cls for cls in classes if cls not in unstyled_tokens]
            yield (classes, value)


class NumberLines(object):
    """Insert line-number tokens at the start of every code line.

    Arguments

       tokens -- iterable of ``(classes, value)`` tuples
       startline -- first line number
       endline -- last line number (used only to determine padding width)

    Iterating over an instance yields the tokens with a
    ``(['ln'], '<the line number>')`` token added for every code line.
    Multi-line tokens are split."""
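
    # For example, with startline=1 and endline=2 the single token
    # (['c'], 'line 1\nline 2') comes out as
    #   (['ln'], '1 '), (['c'], 'line 1\n'), (['ln'], '2 '), (['c'], 'line 2')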

    def __init__(self, tokens, startline, endline):
        self.tokens = tokens
        self.startline = startline
        # pad line numbers, e.g. endline == 100 -> fmt_str = '%3d '
        self.fmt_str = '%%%dd ' % len(str(endline))

    def __iter__(self):
        lineno = self.startline
        yield (['ln'], self.fmt_str % lineno)
        for ttype, value in self.tokens:
            # split multi-line tokens and insert a line-number token
            # after every newline:
            lines = value.split('\n')
            for line in lines[:-1]:
                yield (ttype, line + '\n')
                lineno += 1
                yield (['ln'], self.fmt_str % lineno)
            yield (ttype, lines[-1])
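

if __name__ == '__main__':
    # Minimal demonstration sketch (not part of the docutils API;
    # requires Pygments for the analysis path): lex a short Python
    # snippet and print numbered, classified tokens.
    sample = 'x = 1\ny = x + 1\n'
    tokens = Lexer(sample, 'python', 'short')
    for classes, value in NumberLines(tokens, 1, sample.count('\n')):
        print(classes, repr(value))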