| # -*- coding: utf-8 -*- # |
| |
| module Rouge |
| # @abstract |
| # A stateful lexer that uses sets of regular expressions to |
| # tokenize a string. Most lexers are instances of RegexLexer. |
| class RegexLexer < Lexer |
| # A rule is a tuple of a regular expression to test, and a callback |
| # to perform if the test succeeds. |
| # |
| # @see StateDSL#rule |
| class Rule |
| attr_reader :callback |
| attr_reader :re |
| attr_reader :beginning_of_line |
| def initialize(re, callback) |
| @re = re |
| @callback = callback |
| @beginning_of_line = re.source[0] == ?^ |
| end |
| |
| def inspect |
| "#<Rule #{@re.inspect}>" |
| end |
| end |
| |
| # a State is a named set of rules that can be tested for or |
| # mixed in. |
| # |
| # @see RegexLexer.state |
| class State |
| attr_reader :name, :rules |
| def initialize(name, rules) |
| @name = name |
| @rules = rules |
| end |
| |
| def inspect |
| "#<#{self.class.name} #{@name.inspect}>" |
| end |
| end |
| |
| class StateDSL |
| attr_reader :rules |
| def initialize(name, &defn) |
| @name = name |
| @defn = defn |
| @rules = [] |
| end |
| |
| def to_state(lexer_class) |
| load! |
| rules = @rules.map do |rule| |
| rule.is_a?(String) ? lexer_class.get_state(rule) : rule |
| end |
| State.new(@name, rules) |
| end |
| |
| def prepended(&defn) |
| parent_defn = @defn |
| StateDSL.new(@name) do |
| instance_eval(&defn) |
| instance_eval(&parent_defn) |
| end |
| end |
| |
| def appended(&defn) |
| parent_defn = @defn |
| StateDSL.new(@name) do |
| instance_eval(&parent_defn) |
| instance_eval(&defn) |
| end |
| end |
| |
| protected |
| # Define a new rule for this state. |
| # |
| # @overload rule(re, token, next_state=nil) |
| # @overload rule(re, &callback) |
| # |
| # @param [Regexp] re |
| # a regular expression for this rule to test. |
| # @param [String] tok |
| # the token type to yield if `re` matches. |
| # @param [#to_s] next_state |
| # (optional) a state to push onto the stack if `re` matches. |
| # If `next_state` is `:pop!`, the state stack will be popped |
| # instead. |
| # @param [Proc] callback |
| # a block that will be evaluated in the context of the lexer |
| # if `re` matches. This block has access to a number of lexer |
| # methods, including {RegexLexer#push}, {RegexLexer#pop!}, |
| # {RegexLexer#token}, and {RegexLexer#delegate}. The first |
| # argument can be used to access the match groups. |
| def rule(re, tok=nil, next_state=nil, &callback) |
| if tok.nil? && callback.nil? |
| raise "please pass `rule` a token to yield or a callback" |
| end |
| |
| callback ||= case next_state |
| when :pop! |
| proc do |stream| |
| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug |
| @output_stream.call(tok, stream[0]) |
| puts " popping stack: #{1}" if @debug |
| @stack.pop or raise 'empty stack!' |
| end |
| when :push |
| proc do |stream| |
| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug |
| @output_stream.call(tok, stream[0]) |
| puts " pushing #{@stack.last.name}" if @debug |
| @stack.push(@stack.last) |
| end |
| when Symbol |
| proc do |stream| |
| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug |
| @output_stream.call(tok, stream[0]) |
| state = @states[next_state] || self.class.get_state(next_state) |
| puts " pushing #{state.name}" if @debug |
| @stack.push(state) |
| end |
| when nil |
| proc do |stream| |
| puts " yielding #{tok.qualname}, #{stream[0].inspect}" if @debug |
| @output_stream.call(tok, stream[0]) |
| end |
| else |
| raise "invalid next state: #{next_state.inspect}" |
| end |
| |
| rules << Rule.new(re, callback) |
| end |
| |
| # Mix in the rules from another state into this state. The rules |
| # from the mixed-in state will be tried in order before moving on |
| # to the rest of the rules in this state. |
| def mixin(state) |
| rules << state.to_s |
| end |
| |
| private |
| def load! |
| return if @loaded |
| @loaded = true |
| instance_eval(&@defn) |
| end |
| end |
| |
| # The states hash for this lexer. |
| # @see state |
| def self.states |
| @states ||= {} |
| end |
| |
| def self.state_definitions |
| @state_definitions ||= InheritableHash.new(superclass.state_definitions) |
| end |
| @state_definitions = {} |
| |
| def self.replace_state(name, new_defn) |
| states[name] = nil |
| state_definitions[name] = new_defn |
| end |
| |
| # The routines to run at the beginning of a fresh lex. |
| # @see start |
| def self.start_procs |
| @start_procs ||= InheritableList.new(superclass.start_procs) |
| end |
| @start_procs = [] |
| |
| # Specify an action to be run every fresh lex. |
| # |
| # @example |
| # start { puts "I'm lexing a new string!" } |
| def self.start(&b) |
| start_procs << b |
| end |
| |
| # Define a new state for this lexer with the given name. |
| # The block will be evaluated in the context of a {StateDSL}. |
| def self.state(name, &b) |
| name = name.to_s |
| state_definitions[name] = StateDSL.new(name, &b) |
| end |
| |
| def self.prepend(name, &b) |
| name = name.to_s |
| dsl = state_definitions[name] or raise "no such state #{name.inspect}" |
| replace_state(name, dsl.prepended(&b)) |
| end |
| |
| def self.append(name, &b) |
| name = name.to_s |
| dsl = state_definitions[name] or raise "no such state #{name.inspect}" |
| replace_state(name, dsl.appended(&b)) |
| end |
| |
| # @private |
| def self.get_state(name) |
| return name if name.is_a? State |
| |
| states[name.to_sym] ||= begin |
| defn = state_definitions[name.to_s] or raise "unknown state: #{name.inspect}" |
| defn.to_state(self) |
| end |
| end |
| |
| # @private |
| def get_state(state_name) |
| self.class.get_state(state_name) |
| end |
| |
| # The state stack. This is initially the single state `[:root]`. |
| # It is an error for this stack to be empty. |
| # @see #state |
| def stack |
| @stack ||= [get_state(:root)] |
| end |
| |
| # The current state - i.e. one on top of the state stack. |
| # |
| # NB: if the state stack is empty, this will throw an error rather |
| # than returning nil. |
| def state |
| stack.last or raise 'empty stack!' |
| end |
| |
| # reset this lexer to its initial state. This runs all of the |
| # start_procs. |
| def reset! |
| @stack = nil |
| @current_stream = nil |
| |
| self.class.start_procs.each do |pr| |
| instance_eval(&pr) |
| end |
| end |
| |
| # This implements the lexer protocol, by yielding [token, value] pairs. |
| # |
| # The process for lexing works as follows, until the stream is empty: |
| # |
| # 1. We look at the state on top of the stack (which by default is |
| # `[:root]`). |
| # 2. Each rule in that state is tried until one is successful. If one |
| # is found, that rule's callback is evaluated - which may yield |
| # tokens and manipulate the state stack. Otherwise, one character |
| # is consumed with an `'Error'` token, and we continue at (1.) |
| # |
| # @see #step #step (where (2.) is implemented) |
| def stream_tokens(str, &b) |
| stream = StringScanner.new(str) |
| |
| @current_stream = stream |
| @output_stream = b |
| @states = self.class.states |
| @null_steps = 0 |
| |
| until stream.eos? |
| if @debug |
| puts "lexer: #{self.class.tag}" |
| puts "stack: #{stack.map(&:name).inspect}" |
| puts "stream: #{stream.peek(20).inspect}" |
| end |
| |
| success = step(state, stream) |
| |
| if !success |
| puts " no match, yielding Error" if @debug |
| b.call(Token::Tokens::Error, stream.getch) |
| end |
| end |
| end |
| |
| # The number of successive scans permitted without consuming |
| # the input stream. If this is exceeded, the match fails. |
| MAX_NULL_SCANS = 5 |
| |
| # Runs one step of the lex. Rules in the current state are tried |
| # until one matches, at which point its callback is called. |
| # |
| # @return true if a rule was tried successfully |
| # @return false otherwise. |
| def step(state, stream) |
| state.rules.each do |rule| |
| if rule.is_a?(State) |
| puts " entering mixin #{rule.name}" if @debug |
| return true if step(rule, stream) |
| puts " exiting mixin #{rule.name}" if @debug |
| else |
| puts " trying #{rule.inspect}" if @debug |
| |
| # XXX HACK XXX |
| # StringScanner's implementation of ^ is b0rken. |
| # see http://bugs.ruby-lang.org/issues/7092 |
| # TODO: this doesn't cover cases like /(a|^b)/, but it's |
| # the most common, for now... |
| next if rule.beginning_of_line && !stream.beginning_of_line? |
| |
| if size = stream.skip(rule.re) |
| puts " got #{stream[0].inspect}" if @debug |
| |
| instance_exec(stream, &rule.callback) |
| |
| if size.zero? |
| @null_steps += 1 |
| if @null_steps > MAX_NULL_SCANS |
| puts " too many scans without consuming the string!" if @debug |
| return false |
| end |
| else |
| @null_steps = 0 |
| end |
| |
| return true |
| end |
| end |
| end |
| |
| false |
| end |
| |
| # Yield a token. |
| # |
| # @param tok |
| # the token type |
| # @param val |
| # (optional) the string value to yield. If absent, this defaults |
| # to the entire last match. |
| def token(tok, val=@current_stream[0]) |
| yield_token(tok, val) |
| end |
| |
| # @deprecated |
| # |
| # Yield a token with the next matched group. Subsequent calls |
| # to this method will yield subsequent groups. |
| def group(tok) |
| raise "RegexLexer#group is deprecated: use #groups instead" |
| end |
| |
| # Yield tokens corresponding to the matched groups of the current |
| # match. |
| def groups(*tokens) |
| tokens.each_with_index do |tok, i| |
| yield_token(tok, @current_stream[i+1]) |
| end |
| end |
| |
| # Delegate the lex to another lexer. The #lex method will be called |
| # with `:continue` set to true, so that #reset! will not be called. |
| # In this way, a single lexer can be repeatedly delegated to while |
| # maintaining its own internal state stack. |
| # |
| # @param [#lex] lexer |
| # The lexer or lexer class to delegate to |
| # @param [String] text |
| # The text to delegate. This defaults to the last matched string. |
| def delegate(lexer, text=nil) |
| puts " delegating to #{lexer.inspect}" if @debug |
| text ||= @current_stream[0] |
| |
| lexer.lex(text, :continue => true) do |tok, val| |
| puts " delegated token: #{tok.inspect}, #{val.inspect}" if @debug |
| yield_token(tok, val) |
| end |
| end |
| |
| def recurse(text=nil) |
| delegate(self.class, text) |
| end |
| |
| # Push a state onto the stack. If no state name is given and you've |
| # passed a block, a state will be dynamically created using the |
| # {StateDSL}. |
| def push(state_name=nil, &b) |
| push_state = if state_name |
| get_state(state_name) |
| elsif block_given? |
| StateDSL.new(b.inspect, &b).to_state(self.class) |
| else |
| # use the top of the stack by default |
| self.state |
| end |
| |
| puts " pushing #{push_state.name}" if @debug |
| stack.push(push_state) |
| end |
| |
| # Pop the state stack. If a number is passed in, it will be popped |
| # that number of times. |
| def pop!(times=1) |
| raise 'empty stack!' if stack.empty? |
| |
| puts " popping stack: #{times}" if @debug |
| |
| stack.pop(times) |
| |
| nil |
| end |
| |
| # replace the head of the stack with the given state |
| def goto(state_name) |
| raise 'empty stack!' if stack.empty? |
| |
| puts " going to state #{state_name} " if @debug |
| stack[-1] = get_state(state_name) |
| end |
| |
| # reset the stack back to `[:root]`. |
| def reset_stack |
| puts ' resetting stack' if @debug |
| stack.clear |
| stack.push get_state(:root) |
| end |
| |
| # Check if `state_name` is in the state stack. |
| def in_state?(state_name) |
| state_name = state_name.to_s |
| stack.any? do |state| |
| state.name == state_name.to_s |
| end |
| end |
| |
| # Check if `state_name` is the state on top of the state stack. |
| def state?(state_name) |
| state_name.to_s == state.name |
| end |
| |
| private |
| def yield_token(tok, val) |
| return if val.nil? || val.empty? |
| puts " yielding #{tok.qualname}, #{val.inspect}" if @debug |
| @output_stream.yield(tok, val) |
| end |
| end |
| end |