class Regexp::Lexer

A very thin wrapper around the scanner that breaks quantified literal runs, collects emitted tokens into an array, calculates their nesting depth, normalizes tokens for the parser, and checks whether they are implemented by the given syntax flavor.

Constants

CLOSING_TOKENS
CONDITION_TOKENS
OPENING_TOKENS

Attributes

conditional_nesting[RW]
nesting[RW]
set_nesting[RW]
shift[RW]
tokens[RW]

Public Class Methods

lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block) click to toggle source
# File lib/regexp_parser/lexer.rb, line 16
# Convenience entry point: builds a throwaway lexer instance and forwards
# every argument (including the optional block) to its #lex method.
def self.lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
  lexer = new
  lexer.lex(input, syntax, options: options, &block)
end
Also aliased as: scan
scan(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
Alias for: lex

Public Instance Methods

lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block) click to toggle source
# File lib/regexp_parser/lexer.rb, line 20
# Runs the scanner over +input+ and collects one Regexp::Token per emitted
# scanner event, annotated with the current group, set and conditional
# nesting depths.
#
# input   - passed straight to Regexp::Scanner.scan.
# syntax  - syntax identifier resolved through Regexp::Syntax.for; every
#           scanned type/token pair is normalized and checked against it.
# options - forwarded to the scanner unchanged.
# block   - optional; when given, it is called once per collected token
#           after scanning has finished.
#
# Returns the token array, or the array of block results when a block is
# given.
def lex(input, syntax = "ruby/#{RUBY_VERSION}", options: nil, &block)
  syntax = Regexp::Syntax.for(syntax)

  self.tokens = []
  self.nesting = 0
  self.set_nesting = 0
  self.conditional_nesting = 0
  self.shift = 0

  last = nil
  Regexp::Scanner.scan(input, options: options) do |type, token, text, ts, te|
    type, token = *syntax.normalize(type, token)
    syntax.check! type, token

    # Closers step out *before* their token is built and openers step in
    # *after* (see descend below), so both ends of a pair carry the same depth.
    ascend(type, token)

    if type == :quantifier && last
      break_literal(last)        if last.type == :literal
      break_codepoint_list(last) if last.token == :codepoint_list

      # A successful break pops `last` from `tokens` and pushes two
      # replacement tokens. Splice those into the previous/next chain and
      # continue from the tail piece; otherwise the quantifier would be
      # linked to a token that is no longer part of the result.
      unless tokens.last.equal?(last)
        before, lead, tail = tokens[-3], tokens[-2], tokens[-1]
        lead.previous = before
        before.next = lead if before
        lead.next = tail
        tail.previous = lead
        last = tail
      end
    end

    # `shift` compensates for characters added by earlier codepoint list
    # breaks, so it must be read after the break above.
    current = Regexp::Token.new(type, token, text, ts + shift, te + shift,
                                nesting, set_nesting, conditional_nesting)

    if type == :conditional && CONDITION_TOKENS.include?(token)
      current = merge_condition(current)
      # merge_condition popped the token it merged with; link the merged
      # token to whatever precedes it now (nil when nothing does).
      last = tokens.last
    end

    last.next = current if last
    current.previous = last if last

    tokens << current
    last = current

    descend(type, token)
  end

  if block_given?
    tokens.map { |t| block.call(t) }
  else
    tokens
  end
end

Private Instance Methods

ascend(type, token) click to toggle source
# File lib/regexp_parser/lexer.rb, line 71
# Steps the matching depth counter down by one when +token+ closes a
# group/assertion, a set, or a conditional. Any other type/token pair
# leaves all counters untouched.
def ascend(type, token)
  if [:group, :assertion].include?(type)
    return unless CLOSING_TOKENS.include?(token)

    self.nesting = nesting - 1
  elsif type == :set
    return unless token == :close

    self.set_nesting = set_nesting - 1
  elsif type == :conditional
    return unless token == :close

    self.conditional_nesting = conditional_nesting - 1
  end
end
break_codepoint_list(token) click to toggle source
# File lib/regexp_parser/lexer.rb, line 108
# Splits the final codepoint off a space-separated codepoint list token so
# that it stands as its own token. Does nothing (and returns nil) when the
# text contains no space, i.e. there is nothing to split off.
#
# The given token is popped from +tokens+ and replaced by two
# :escape/:codepoint_list tokens; +shift+ grows by 3 to account for the
# characters added to the rebuilt texts.
def break_codepoint_list(token)
  head, _space, final = token.text.rpartition(' ')
  return if head.empty?

  # Close the leading part and re-open the trailing part so that each piece
  # is a complete escape on its own.
  head_token = Regexp::Token.new(:escape, :codepoint_list, head + '}',
                                 token.ts, token.te - final.length,
                                 nesting, set_nesting, conditional_nesting)
  final_token = Regexp::Token.new(:escape, :codepoint_list, '\u{' + final,
                                  token.ts + head.length + 1, token.te + 3,
                                  nesting, set_nesting, conditional_nesting)

  tokens.pop
  tokens.push(head_token, final_token)

  # one space less, but extra \, u, {, and }
  self.shift = shift + 3
end
break_literal(token) click to toggle source

Called by lex to break a literal run that is longer than one character into two separate tokens when it is followed by a quantifier.

# File lib/regexp_parser/lexer.rb, line 95
# Splits the last character off a multi-character literal token so that it
# stands as its own token. Does nothing (and returns nil) when the literal
# is a single character.
#
# The given token is popped from +tokens+ and replaced by two
# :literal tokens covering the leading run and the final character.
def break_literal(token)
  run, final_char, _rest = token.text.partition(/.\z/mu)
  return if run.empty?

  run_token = Regexp::Token.new(:literal, :literal, run,
                                token.ts, token.te - final_char.length,
                                nesting, set_nesting, conditional_nesting)
  char_token = Regexp::Token.new(:literal, :literal, final_char,
                                 token.ts + run.length, token.te,
                                 nesting, set_nesting, conditional_nesting)

  tokens.pop
  tokens.push(run_token, char_token)
end
descend(type, token) click to toggle source
# File lib/regexp_parser/lexer.rb, line 82
# Steps the matching depth counter up by one when +token+ opens a
# group/assertion, a set, or a conditional. Any other type/token pair
# leaves all counters untouched.
def descend(type, token)
  if [:group, :assertion].include?(type)
    return unless OPENING_TOKENS.include?(token)

    self.nesting = nesting + 1
  elsif type == :set
    return unless token == :open

    self.set_nesting = set_nesting + 1
  elsif type == :conditional
    return unless token == :open

    self.conditional_nesting = conditional_nesting + 1
  end
end
merge_condition(current) click to toggle source
# File lib/regexp_parser/lexer.rb, line 123
# Removes the most recently collected token and fuses it with +current+
# into a single :conditional/:condition token. The merged token spans from
# the start of the removed token to the end of +current+ and carries the
# current nesting depths. Returns the merged token without adding it to
# +tokens+.
def merge_condition(current)
  preceding = tokens.pop
  merged_text = preceding.text + current.text

  Regexp::Token.new(:conditional, :condition, merged_text,
                    preceding.ts, current.te,
                    nesting, set_nesting, conditional_nesting)
end