String Trim

lua-users home
wiki

There are many ways to implement the "trim" function [1] in Lua:

-- trim implementations



function trim1(s)

  return (s:gsub("^%s*(.-)%s*$", "%1"))

end

-- from PiL2 20.4



function trim2(s)

  return s:match "^%s*(.-)%s*$"

end

-- variant of trim1 (match)



function trim3(s)

  return s:gsub("^%s+", ""):gsub("%s+$", "")

end

-- two gsub's



function trim4(s)

  return s:match"^%s*(.*)":match"(.-)%s*$"

end

-- variant of trim3 (match)



function trim5(s)

  return s:match'^%s*(.*%S)' or ''

end

-- warning: has bad performance when s:match'^%s*$' and #s is large



function trim6(s)

  return s:match'^()%s*$' and '' or s:match'^%s*(.*%S)'

end

-- fixes performance problem in trim5.

-- note: the '()' avoids the overhead of default string capture.

-- This overhead is small, ~ 10% for successful whitespace match call

-- alone, and may not be noticeable in the overall benchmarks here,

-- but there's little harm either.  Instead replacing the first `match`

-- with a `find` has a similar effect, but that requires localizing

-- two functions in the trim7 variant below.



local match = string.match

function trim7(s)

  return match(s,'^()%s*$') and '' or match(s,'^%s*(.*%S)')

end

-- variant of trim6 (localize functions)



local find = string.find

local sub = string.sub

function trim8(s)

  local i1,i2 = find(s,'^%s*')

  if i2 >= i1 then s = sub(s,i2+1) end

  local i1,i2 = find(s,'%s*$')

  if i2 >= i1 then s = sub(s,1,i1-1) end

  return s

end

-- based on penlight 0.7.2



function trim9(s)

  local _,i1 = find(s,'^%s*')

  local i2 = find(s,'%s*$')

  return sub(s,i1+1,i2-1)

end

-- simplification of trim8



function trim10(s)

  local a = s:match('^%s*()')

  local b = s:match('()%s*$', a)

  return s:sub(a,b-1)

end

-- variant of trim9 (match)



function trim11(s)

 local n = s:find"%S"

 return n and s:match(".*%S", n) or ""

end

-- variant of trim6 (use n position)

-- http://lua-users.org/lists/lua-l/2009-12/msg00904.html



function trim12(s)

 local from = s:match"^%s*()"

 return from > #s and "" or s:match(".*%S", from)

end

-- variant of trim11 (performs better for all

-- whitespace string). See Roberto's comments

-- on ^%s*$" v.s. "%S" performance:

-- http://lua-users.org/lists/lua-l/2009-12/msg00921.html



do

 require 'lpeg'

 local space = lpeg.S' \t\n\v\f\r'

 local nospace = 1 - space

 local ptrim = space^0 * lpeg.C((space^0 * nospace^1)^0)

 local match = lpeg.match

 function trim13(s)

   return match(ptrim, s)

 end

end

-- lpeg.  based on http://lua-users.org/lists/lua-l/2009-12/msg00921.html



do

 require 'lpeg'

 require 're'

 local ptrim = re.compile"%s* {(%s* %S+)*}"

 local match = lpeg.match

 function trim14(s)

   return match(ptrim, s)

 end

end

-- variant with re module.



require 'trim'

local trim15 = trim

-- C implementation (see separate trim.c file)





-- test utilities



local function trimtest(trim)

  assert(trim'' == '')

  assert(trim' ' == '')

  assert(trim'  ' == '')

  assert(trim'a' == 'a')

  assert(trim' a' == 'a')

  assert(trim'a ' == 'a')

  assert(trim' a ' == 'a')

  assert(trim'  a  ' == 'a')

  assert(trim'  ab cd  ' == 'ab cd')

  assert(trim' \t\r\n\f\va\000b \r\t\n\f\v' == 'a\000b')

end



local function perftest(f, s)

  local time = os.clock  -- os.time or os.clock

  local t1 = time()

  for i=1,100000 do

    f(s)f(s)f(s)f(s)f(s)f(s)f(s)f(s)f(s)f(s)

  end

  local dt = time() - t1

  io.stdout:write(string.format("%4.1f",dt) .. ' ')

end



local trims = {trim1, trim2, trim3, trim4, trim5, trim6, trim7,

               trim8, trim9, trim10, trim11, trim12, trim13, trim14, trim15}



-- correctness tests

for _,trim in ipairs(trims) do

  trimtest(trim)

end



-- performance tests

for j=1,3 do

  for i,trim in ipairs(trims) do

    io.stdout:write(string.format("%2d",i) .. ": ")

    perftest(trim,  "")

    perftest(trim,  "abcdef")

    perftest(trim,  "   abcdef   ")

    perftest(trim,  "abcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdefabcdef")

    perftest(trim,  "  a b c d e f g h i j k l m n o p q r s t u v w x y z A B C ")

    perftest(trim,  "                               a                            ")

    perftest(trim,  "                                                            ")

    print()

  end

end

Optional C module for test15 based on LuaList:2009-12/msg00951.html:


/* trim.c - based on http://lua-users.org/lists/lua-l/2009-12/msg00951.html

            from Sean Conner */

#include <stddef.h>

#include <ctype.h>

#include <lua.h>

#include <lauxlib.h>



int trim(lua_State *L)

{

 const char *front;

 const char *end;

 size_t      size;



 front = luaL_checklstring(L,1,&size);

 end   = &front[size - 1];



 for ( ; size && isspace(*front) ; size-- , front++)

   ;

 for ( ; size && isspace(*end) ; size-- , end--)

   ;



 lua_pushlstring(L,front,(size_t)(end - front) + 1);

 return 1;

}



int luaopen_trim(lua_State *L)

{

 lua_register(L,"trim",trim);

 return 0;

}

Test results (lower numbers = faster):


Lua 5.1.4/Cygwin1.7

 1:  0.9  1.9  2.1  9.6 10.3  2.2  2.0

 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8

 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1

 4:  1.2  2.2  2.2 10.1 11.2  2.7  2.2

 5:  0.6  0.9  1.1  1.2  1.3  2.8 77.6

 6:  0.6  1.2  1.6  1.6  1.8  4.1  1.7

 7:  0.6  1.1  1.5  1.5  1.6  3.9  1.6

 8:  1.0  1.7  2.5  7.5  9.7  3.0  2.4

 9:  1.2  2.0  2.7  8.0  9.3 21.2  3.4

10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6

11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5

12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8

13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0

14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0

15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

 1:  0.9  1.9  2.0  9.6 10.3  2.2  1.9

 2:  0.7  1.6  1.8  8.9 10.0  2.0  1.8

 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1

 4:  1.1  2.2  2.2 10.1 11.4  2.7  2.2

 5:  0.6  0.9  1.2  1.2  1.2  2.8 78.2

 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7

 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6

 8:  1.0  1.7  2.5  7.5  9.6  3.1  2.3

 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3

10:  1.5  2.4  2.5  9.8 10.8  2.8  2.5

11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5

12:  0.7  1.3  1.6  1.7  1.8  3.0  1.8

13:  0.8  0.9  1.0  1.3  2.4  1.1  1.0

14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0

15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3

 1:  0.9  1.9  2.0  9.6 10.3  2.2  2.0

 2:  0.7  1.6  1.7  8.9 10.0  2.0  1.8

 3:  1.0  1.3  1.7  3.8  6.4  2.6  2.1

 4:  1.1  2.2  2.2 10.1 11.2  2.7  2.2

 5:  0.6  0.9  1.2  1.2  1.3  2.8 77.3

 6:  0.6  1.2  1.7  1.6  1.8  4.2  1.7

 7:  0.6  1.1  1.5  1.5  1.7  3.9  1.6

 8:  1.0  1.7  2.6  7.4  9.6  3.1  2.3

 9:  1.2  2.0  2.7  8.0  9.3 21.1  3.3

10:  1.5  2.4  2.6  9.8 10.8  2.8  2.6

11:  0.5  1.2  1.5  1.6  1.7  3.5  2.5

12:  0.7  1.3  1.6  1.6  1.8  3.0  1.8

13:  0.8  0.9  1.0  1.3  2.5  1.1  1.0

14:  0.8  0.9  1.0  1.3  2.5  1.1  1.0

15:  0.2  0.2  0.2  0.4  0.4  0.3  0.3



LuaJIT 2.0.0-beta2/Cygwin1.7

 1:  0.6  1.5  1.5  7.7  8.3  1.3  1.2

 2:  0.4  1.2  1.2  7.1  7.8  1.1  1.0

 3:  0.6  1.0  1.2  3.1  4.9  1.7  1.3

 4:  0.8  1.6  1.8  8.4  9.0  1.9  1.3

 5:  0.4  0.6  0.8  1.2  1.2  2.3 99.2

 6:  0.4  0.9  1.1  1.5  1.5  3.2  0.9

 7:  0.3  0.8  1.1  1.4  1.5  3.0  0.8

 8:  0.6  1.2  1.6  5.3  6.8  1.7  1.3

 9:  0.7  1.2  1.8  5.6  6.7 14.4  1.7

10:  0.9  1.6  1.7  7.6  8.3  1.5  1.5

11:  0.3  0.8  1.1  1.4  1.5  2.9  2.1

12:  0.4  0.9  1.1  1.5  1.5  2.7  0.9

13:  0.6  0.7  0.7  1.0  1.4  0.8  0.7

14:  0.6  0.7  0.7  1.0  1.4  0.8  0.8

15:  0.1  0.1  0.1  0.3  0.3  0.2  0.2

...

The speed of trim5 is relatively favorable or competitive over the data set except it is quite poor in the case of a long string with only whitespace. The use of ".*" (rather than a non-greedy ".-") quickly plows to the end of a long string, and the "%S" then triggers backtracking to avoid the trailing whitespace, but the juxtaposition of "%s*" and ".*" imposes substantial backtracking (O(N^2)) if no "%S" match is found. trim6 handles the worst cast condition specially and behaves well over the entire data set. trim7 is a variant that localizes functions, which gives a bit larger code size and does not give a substantial speed improvement. (On further testing, each localization gives maybe 10% in best case involving empty string data and inlining the function call, but here there are two calls to match). trim11 is also a variant of trim6 with similar performance characteristics, perhaps slightly better, except speed is about half in the case of a long string with only whitespace (this is fixed in trim12). trim13 (or its variant trim14) is an lpeg (LuaPeg) implementation, which generally meets or exceeds the performance of the best pattern matching implementations, particularly exceeding in the case of lots of whitespace, but it is a bit slower in its worst case condition of alternating space and whitespace. The C implementation (trim15) is by far the fastest over the entire data set.

Reusing the same strings across all test iterations, as done above, might not properly gauge the effect of temporary string creation due to Lua's string interning. A quick test with local ss = {}; for i=1,80000 do ss[i] = " " .. (" "..i):rep(10) .. " " end as a string cache doesn't seem to affect the basic conclusions above.

--DavidManura

See Also


RecentChanges · preferences
edit · history
Last edited January 1, 2010 11:01 pm GMT (diff)