Ex Pattern

lua-users home
wiki

xpattern is a Lua pattern matching library based on Lua patterns and in the style of LuaPeg.

Description

Lua patterns [1] are a limited form of regular expressions. Unlike regular expressions, patterns do not support grouping and quantification [2] operations on anything except single character sets. For full regular expression (or Perl-style regular expression) support, you may instead compile a regular expression module such as lrexlib [3]. For even greater flexibility, for parsing and lexical analysis applications or even for the simpler case, one can use parsing expression grammars (PEG) via the LuaPeg library [4] implemented in C.

Sometimes, however, Lua patterns are almost sufficient, as we may want to extend the functionality of patterns just a bit more without compiling in any additional C code. That's where this module comes in. This module builds extended pattern matching on top of Lua patterns. It takes a pattern expression written in a form somewhat similar to LPeg and translates it to a string of Lua code containing string.match calls, which is then compiled to a Lua function via loadstring. This is similar to the code you might write yourself, although this module automates it with code generation. It should have similar efficiency assuming you precompile any match objects that are used repeatedly.

Here is a simple example:

local M = require "xpattern"

local P = M.P

local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()

local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first

assert(a == nil and b == 'ccc' and c == 'b' and d == 11)

which internally generates this Lua function:

local match = ...  -- string.match

return function(s,pos)

  for pos1=(pos or 1),#s do

    local pos2

    local c1,c2,c3,c4

    local pos3 = pos1

    c1,pos3 = match(s, "^(b+)()", pos1)

    if not pos3 then

      c2,pos3 = match(s, "^(c+)()", pos1)

    end

    pos2 = pos3

    if pos2 then

      do

        local pos3=pos2

        pos2 = pos2

        while 1 do

          pos3 = match(s, "^[A-Z][a-z]()", pos3)

          if pos3 then

            pos2=pos3

          else break end

        end

      end

    end

    if pos2 then

      c3,c4,pos2 = match(s, "^(.)()()", pos2)

    end

    if pos2 then return c1,c2,c3,c4 end

  end

end

See the test suite below for additional examples.

Status

Warning: this code should be considered somewhat experimental. It is not fully developed or tested. Fix up the code if you want to use it in production.

Appendix: xpattern.lua

-- xpattern.lua

-- Preliminary regular expression-like support in Lua

-- Uses Lua patterns as the core building block.

--

-- Implemented in pure Lua with code generation technique.

-- It translates an expression into a snippet of Lua code

-- having a series of `string.match` calls, which is then

-- compiled (via `loadstring`).

--

-- Like lpeg, does not support backtracking.

--

-- WARNING: This is experimental code.  The design and implementation

-- has not been thoroughly tested.

--

-- Version v20091021.

-- (c) 2008-2009 David Manura. Licensed under the same terms as Lua (MIT license).

-- Please post patches.



local M = {}



local string = string

local format = string.format

local match  = string.match

local assert = assert

local error  = error

local ipairs = ipairs

local loadstring   = loadstring

local setmetatable = setmetatable

local type   = type

local print  = print





-- Adds whitespace to string `s`.

-- Whitespace string `ws` (default to '' if omitted) is prepended to each line

-- of `s`.  Also ensures `s` is is terminated by a newline.

local function add_whitespace(s, ws)

  ws = ws or ''

  s = s:gsub('[^\r\n]+', ws .. '%1')

  if s:match('[^\r\n]$') then

    s = s .. '\n'

  end

  return s

end



-- Counts the number `count` of captures '()' in Lua pattern string `pat`.

local function count_captures(pat)

  local count = 0

  local pos = 1

  while pos <= #pat do

    local pos2 = pat:match('^[^%(%%%[]+()', pos)

    if pos2 then

      pos = pos2

    elseif pat:match('^%(', pos) then

      count = count + 1

      pos = pos + 1

    elseif pat:match('^%%b..', pos) then

      pos = pos + 3

    elseif pat:match('^%%.', pos) then

      pos = pos + 2

    else

      local pos2 = pat:match('^%[[^%]%%]*()', pos)

      if pos2 then

        pos = pos2

        while 1 do

          local pos2 = pat:match('^%%.[^%]%%]*()', pos)

          if pos2 then

            pos = pos2

          elseif pat:match('^%]', pos) then

            pos = pos + 1

            break

          else

            error('syntax', 2)

          end

        end

      else

        error('syntax', 2)

      end

    end

  end

  return count

end

M._count_captures = count_captures





-- Appends '()' to Lua pattern string `pat`.

local function pat_append_pos(pat)

  local prefix = pat:match'^(.*)%$$'

  pat = prefix and prefix .. '()$' or pat .. '()'

  return pat

end



-- Prepends '()' to Lua pattern string `pat`.

local function pat_prepend_pos(pat)

  local postfix = pat:match'^%^(.*)'

  pat = postfix and '^()' .. postfix or '()' .. pat

  return pat

end





-- Prepends '^' to Lua pattern string `pat`.

local function pat_prepend_carrot(pat)

  local postfix = pat:match'^%^(.*)'

  pat = postfix and pat or '^' .. pat

  return pat

end





-- Return a string listing pattern capture variables with indices `firstidx`

-- to `lastidx`.

-- Ex: code_vars(1,2) --> 'c1,c2'

local function code_vars(firstidx, lastidx)

  local code = ''

  for i=firstidx,lastidx do

    code = code .. (i == firstidx and '' or ',') .. 'c' .. i

  end

  return code

end





-- Metatable for expression objects

local epat_mt = {}

epat_mt.__index = epat_mt





-- Builds an extended pattern object `epat` from Lua string pattern `pat`.

local function pattern(pat)

  local epat = setmetatable({}, epat_mt)

  epat.call = function(srcidx0, destidx0, totncaptures0)

    local ncaptures = count_captures(pat)

    local lvars =

      code_vars(totncaptures0+1, totncaptures0+ncaptures)

      .. (ncaptures == 0 and '' or ',') .. 'pos' .. destidx0

    local pat = pat_append_pos(pat)



    pat = pat_prepend_carrot(pat)



    local str = format('%q', pat)

    local code = lvars .. ' = match(s, ' .. str .. ', pos' .. srcidx0 .. ')\n'

    return code, ncaptures

  end

  epat.anchored = pat:sub(1,1) == '^'

  return epat

end





-- Generates code from pattern `anypat` (either Lua pattern string or extended

-- pattern object).

--  `anypat`    - either Lua pattern string or extended pattern object

--  `srcidx0`   - index of variable holding position to start matching at

--  `destidx0`  - index of variable holding position to store subsequent

--                match position at.  stores nil if no match

--  `totncaptures0` - number of captures prior to this match

--  `code`      - Lua code string (code) and number of

--  `ncaptures` - number of captures in pattern.

local function gen(anypat, srcidx0, destidx0, totncaptures0)

  if type(anypat) == 'string' then

    anypat = pat_prepend_carrot(anypat)

    anypat = pattern(anypat)

  end

  local code, ncaptures = anypat(srcidx0, destidx0, totncaptures0)

  return code, ncaptures

end





-- Creates a new extended pattern object `epat` that is the concatenation of

-- the given list (of size >= 0) of pattern objects.

-- Specify a single string argument to convert a Lua pattern to an extended

-- pattern object.

local function seq(...) -- epats...

  -- Ensure args are extended pattern objects.

  local epats = {...}

  for i=1,#epats do

    if type(epats[i]) == 'string' then

      epats[i] = pattern(epats[i])

    end

  end



  local epat = setmetatable({}, epat_mt)

  epat.call = function(srcidx0, destidx0, totncaptures0, ws)

    if #epats == 0 then

      return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0

    elseif #epats == 1 then

      return epats[1](srcidx0, destidx0, totncaptures0, ws)

    else

      local ncaptures = 0

      local pat_code, pat_ncaptures =

          gen(epats[1], srcidx0, destidx0, totncaptures0+ncaptures, true)

      ncaptures = ncaptures + pat_ncaptures

      local code = add_whitespace(pat_code, '')



      for i=2,#epats do

        local pat_code, pat_ncaptures =

            gen(epats[i], destidx0, destidx0, totncaptures0+ncaptures, true)

        ncaptures = ncaptures + pat_ncaptures

        code =

          code ..

          'if pos' .. destidx0 .. ' then\n' ..

            add_whitespace(pat_code, '  ') ..

          'end\n'

      end

      return code, ncaptures

    end

  end

  if epats[1] and epats[1].anchored then

    epat.anchored = true

  end

  return epat

end

M.P = seq





-- Creates new extended pattern object `epat` that is the alternation of the

-- given list of pattern objects `epats...`.

local function alt(...) -- epats...

  -- Ensure args are extended pattern objects.

  local epats = {...}

  for i=1,#epats do

    if type(epats[i]) == 'string' then

      epats[i] = pattern(epats[i])

    end

  end



  local epat = setmetatable({}, epat_mt)

  epat.call = function(srcidx0, destidx0, totncaptures0)

    if #epats == 0 then

      return 'pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n', 0

    elseif #epats == 1 then

      return epats[1](srcidx0, destidx0, totncaptures0)

    else

      local ncaptures = 0

      local pat_code, pat_ncaptures =

          gen(epats[1], srcidx0, destidx0+1, totncaptures0+ncaptures, true)

      ncaptures = ncaptures + pat_ncaptures

      local code = 'local pos' .. destidx0+1 .. ' = pos' .. srcidx0 .. '\n' ..

                   add_whitespace(pat_code, '')



      for i=2,#epats do

        local pat_code, pat_ncaptures =

            gen(epats[i], srcidx0, destidx0+1, totncaptures0+ncaptures, true)

        ncaptures = ncaptures + pat_ncaptures

        code =

          code ..

          'if not pos' .. destidx0+1 .. ' then\n' ..

            add_whitespace(pat_code, '  ') ..

          'end\n'

      end

      code = code .. 'pos' .. destidx0 .. ' = pos' .. destidx0+1 .. '\n'

      return code, ncaptures

    end

  end

  return epat

end

M.alt = alt





-- Creates new extended pattern object `epat` that is zero or more repetitions

-- of the given pattern object `pat` (if one evaluates to false) or one or more

-- (if one evaluates to true).

local function star(pat, one)

  local epat = setmetatable({}, epat_mt)

  epat.call = function(srcidx0, destidx0, totncaptures0)

    local ncaptures = 0

    local destidx = destidx0 + 1

    local code = 'do\n' ..

                 '  local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'

    local pat_code, pat_ncaptures =

        gen(pat, destidx, destidx, totncaptures0+ncaptures, true)

    ncaptures = ncaptures + pat_ncaptures

    code = code ..

      (one and ('  pos' .. destidx0 .. ' = nil\n')

           or  ('  pos' .. destidx0 .. ' = pos' .. srcidx0 .. '\n')) ..

      '  while 1 do\n' ..

           add_whitespace(pat_code, '    ') ..

      '    if pos' .. destidx .. ' then\n' ..

      '      pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..

      '    else break end\n' ..

      '  end\n' ..

      'end\n'

    return code, ncaptures

  end

  return epat

end

M.star = star





-- Creates new extended pattern object `epat` that is zero or one of the

-- given pattern object `epat0`.

local function zero_or_one(epat0)

  local epat = setmetatable({}, epat_mt)

  epat.call = function(srcidx0, destidx0, totncaptures0)

    local ncaptures = 0

    local destidx = destidx0 + 1

    local code = 'do\n' ..

                 '  local pos' .. destidx .. '=pos' .. srcidx0 .. '\n'

    local epat0_code, epat0_ncaptures =

        gen(epat0, destidx, destidx, totncaptures0+ncaptures, true)

    ncaptures = ncaptures + epat0_ncaptures

    code = code ..

      add_whitespace(epat0_code) ..

      '  if pos' .. destidx .. ' then\n' ..

      '    pos' .. destidx0 .. '=pos' .. destidx .. '\n' ..

      '  else\n' ..

      '    pos' .. destidx0 .. '=pos' .. srcidx0 .. '\n' ..

      '  end\n' ..

      'end\n'

    return code, ncaptures

  end

  return epat

end

M.zero_or_one = zero_or_one





-- Gets Lua core code string `code` corresponding to pattern object `epat`

local function basic_code_of(epat)

  local pat_code, ncaptures = epat(1, 2, 0, true)

  local lvars = code_vars(1, ncaptures)



  if epat.anchored then

    local code =

      'local pos1=pos or 1\n' ..

      'local pos2\n' ..

      (lvars ~= '' and '  local ' .. lvars .. '\n' or '') ..

      add_whitespace(pat_code) .. '\n' ..

      'if pos2 then return ' .. (lvars ~= '' and lvars or 's:sub(pos1,pos2-1)') .. ' end\n'

    return code

  else

    local code =

        'for pos1=(pos or 1),#s do\n' ..

        '  local pos2\n'

    if lvars == '' then

      code =

        code ..

           add_whitespace(pat_code, '  ') ..

        '  if pos2 then return s:sub(pos1,pos2-1) end\n'

    else

      code =

        code ..

        '  local ' .. lvars .. '\n' ..

           add_whitespace(pat_code, '  ') ..

        '  if pos2 then return ' .. lvars .. ' end\n'

    end

    code =

        code ..

        'end\n'

    return code

  end

end

M.basic_code_of = basic_code_of





-- Gets Lua complete code string `code` corresponding to pattern object `epat`.

local function code_of(epat)

  local code =

    'local match = ...\n' ..

    'return function(s,pos)\n' ..

    add_whitespace(basic_code_of(epat), '  ') ..

    'end\n'

  return code

end

M.code_of = code_of





-- Compiles pattern object `epat` to Lua function `f`.

local function compile(epat)

  local code = code_of(epat)

  if M.debug then print('DEBUG:\n' .. code) end

  local f = assert(loadstring(code))(match)

  return f

end

M.compile = compile





-- operator for pattern matching

function epat_mt.__call(epat, ...)

  return epat.call(...)

end





-- operator for pattern alternation

function epat_mt.__add(a_epat, b_epat)

  return alt(a_epat, b_epat)

end





-- operator for pattern concatenation

function epat_mt.__mul(a_epat, b_epat)

  return seq(a_epat, b_epat)

end





-- operator for pattern repetition

function epat_mt.__pow(epat, n)

  if n == 0 then

    return star(epat)

  elseif n == 1 then

    return star(epat, true)

  elseif n == -1 then

    return zero_or_one(epat)

  else

    error 'FIX - unimplemented'

  end

end





-- IMPROVE design?

epat_mt.compile       = compile

epat_mt.basic_code_of = basic_code_of

epat_mt.code_of       = code_of





return M

Appendix: xpattern_test.lua

-- xpattern_test.lua - test suite for xpattern.lua



-- utility function: convert list of values to string.

local function str(...)

  local n = select('#', ...)

  local t = {...}

  for i=1,n do t[i] = tostring(t[i]) end

  return table.concat(t, ',')

end



local M = require "xpattern"

local P = M.P

M.debug = true



-- internal: _count_captures

assert(M._count_captures'' == 0)

assert(M._count_captures'a' == 0)

assert(not pcall(function() M._count_captures'%' end))

assert(M._count_captures'()' == 1)

assert(M._count_captures'%(%)' == 0)    -- %(

assert(M._count_captures'[()]' == 0)    -- () inside []

assert(M._count_captures'[%(%)]' == 0)  -- %( inside []

assert(M._count_captures'[%]()]' == 0)  -- %] inside []

assert(M._count_captures'[]()]' == 1)

assert(M._count_captures'%b()' == 0)    -- () on %b..

assert(M._count_captures'(()().())' == 4)   -- nested

-- more complex example

assert(M._count_captures'.(.%))[(]%(()' == 2)





-- simple matching

assert(str(P'':compile()('')) == '')

assert(str(P'':compile()('a')) == '')

assert(str(P'a':compile()('')) == '')

assert(str(P'a':compile()('a')) == 'a')

assert(str(P'a':compile()('ba')) == 'a')

assert(str(P'a+':compile()('baa')) == 'aa')



-- simple anchors

assert(str(P'^a+':compile()('aa')) == 'aa')

assert(str(P'^a+':compile()('baab')) == '') -- $ fail

assert(str(P'a+$':compile()('baa')) == 'aa')

assert(str(P'a+$':compile()('baab')) == '') -- $ fail



-- simple captures

assert(str(P'(a+)(b+)':compile()('baab')) == 'aa,b')

assert(str(P'^(a+)(b+)':compile()('baab')) == '')



-- simple sequences

local m = P():compile()

assert(str( m('') ) == '')

assert(str( m('a') ) == '')

local m = P(''):compile()

assert(str( m('') ) == '')

assert(str( m('a') ) == '')

local m = P('a', 'b', 'c'):compile()

assert(str( m('.a.') ) == '')

assert(str( m('.abc.') ) == 'abc')

local m = (P'a' * P'b' * P'c'):compile() -- identical

assert(str( m('.a.') ) == '')

assert(str( m('.abc.') ) == 'abc')

local m = P(P'a', 'b', P'c'):compile() -- identical

assert(str( m('.a.') ) == '')

assert(str( m('.abc.') ) == 'abc')

local m = P(P'a+', 'b+', P'c+'):compile()

assert(str( m('.abaabcc.') ) == 'aabcc')



-- simple alternation

local m = (P'aa+' + P'bb+'):compile()

assert(str( m('abbaa') ) == 'bb')

local m = (P'aa+' + P'bb+' + P'cc+'):compile()

assert(str( m('abccaa') ) == 'cc')



-- simple combinations

local m = ((P'(a+)' + P'b(b*)') * P'(c+)()'):compile()

assert(str( m("aacccdd")) == 'aa,nil,ccc,6')

assert(str( m("bbcccdd")) == 'nil,b,ccc,6')

assert(str( m("bbdd")) == '')

print('?'..str( m("aabbcc")))

assert(str( m("aabbcc")) == 'nil,b,cc,7') -- alternative



-- simple replication (*)

local m = ( P'a'^0 ):compile()

assert(str(m'') == '')

assert(str(m'a') == 'a')

assert(str(m'aab') == 'aa')



-- replication (*)

local m = ( (P'a+' + P'b+')^0 ):compile()

assert(str(m'zabaabbc') == '')

assert(str(m'abaabb') == 'abaabb')

local m = ( (P'a+' * P'b+' + P'c+' * P'd+')^0 ):compile()

assert(str(m'aabbccddaa') == 'aabbccdd')

local m = ( P'aa'^0 * P'bb' * P'aa'^0 ):compile()

assert(str(m'aaccaaaabbaa') == 'aaaabbaa')



-- simple replication (+)

local m = ( P'a'^1 ):compile()

assert(str(m'') == '')

assert(str(m'a') == 'a')

assert(str(m'aab') == 'aa')



-- replacation (+)

local m = ( P'b' * P'a'^1 ):compile()

print(m'b')

assert(str(m'b') == '')

assert(str(m'ba') == 'ba')

assert(str(m'baab') == 'baa')



-- simple replication (?)

local m = ( P'a'^-1 ):compile()

assert(str(m'') == '')

assert(str(m'a') == 'a')

assert(str(m'aab') == 'a')



-- replication (?)

local m = ( P'c' * (P'a+' + P'b+')^-1 ):compile()

assert(str(m'caabb') == 'caa')





-- Some of these examples from Mastering Regular Expressions (MRE),

-- 2nd Ed. Jeffrey .Friedl.



-- MRE p.19

local m = ( P'^' * (P'From' + P'Subject' + P'Date') * P':%s*(.*)' ):compile()

assert(str(m('Subject: test')) == 'test')



-- MRE p.13

local m = ( (P'Geo' + P'Je') * P'ff' * (P're' + P'er') * P'y' ):compile()

assert(str(m'Jeffrey') == 'Jeffrey')

assert(str(m'Jeffery') == 'Jeffery')

assert(str(m'Geoffrey') == 'Geoffrey')

assert(str(m'Geoffery') == 'Geoffery')

assert(str(m'Jefery') == '')

assert(str(m'Geofferi') == '')

assert(str(m'GeoffrezGeoffery') == 'Geoffery') -- skips

assert(str(m'JefferzGeoffery') == 'Geoffery') -- skips

assert(str(m'GeoffJeffery') == 'Jeffery') -- skips



-- MRE p.24

local m = ( P'%$[0-9]+' * P'%.[0-9][0-9]'^-1 ):compile()

assert(str(m'$20.00') == '$20.00')

assert(str(m'$20') == '$20')

assert(str(m'$20.00.00') == '$20.00')



-- example

print 'example'

local M = require "xpattern"

local P = M.P

local m = ( (P'(b+)' + P'(c+)') * P'[A-Z][a-z]'^0 * P'(.)()' ):compile()

local a,b,c,d = m('mmcccZzYybbZzYyddd') -- match c not b first

assert(a == nil and b == 'ccc' and c == 'b' and d == 11)



-- example

local m = P('foo', P'bar'+P'baz', 'qux'):compile()

assert(str(m'afoobazfoobarquxbar', 'foobarqux'))

local m = P('^foo', P'bar'+P'baz', 'qux'):compile() -- anchored

assert(str(m'afoobazfoobarquxbar', ''))

assert(str(m'foobarquxbar', ''))



-- http://lua-users.org/lists/lua-l/2009-10/msg00752.html

local m = (

  P'^' * ( ( P'ceil'+P'abs' +P'floor'+P'mod' +P'exp'+P'log'+P'pow'+

             P'sqrt'+P'acos'+P'asin' +P'atan'+P'cos'+P'sin'+P'tan'+

             P'deg' +P'rad' +P'random'

           ) * P'%('

           + P'[0-9%(%)%-%+%*%/%.%,]' + P'pi'

          )^1 * P'$'

):compile()

assert(m'cos(1+pi)' == 'cos(1+pi)')

assert(m'cos(1+p)' == nil) -- 'p'

assert(m'cos(12.3/2)+mod(2,3)' == 'cos(12.3/2)+mod(2,3)')

assert(m'cos(12.3/2)+mod(2,3) ' == nil) -- ' '

assert(m' cos(12.3/2)+mod(2,3)' == nil) -- ' '

assert(m'cos(12.3/2)+mod+2' == nil) -- no '('



print 'DONE'

Changes:

Author

DavidManura

See Also


RecentChanges · preferences
edit · history
Last edited January 7, 2010 1:27 am GMT (diff)