Read Defined Chunks

lua-users home
wiki

With this function you can read defined chunks (until a given delimiter is found) from a file or from io.stdin. It's a complete rewrite of a prior version: now it's fast, doesn't concatenate strings unnecessarily, needs less memory and is flexible. The purpose is to process a) files with tons of megabytes and b) mixed-format input, e.g. MIME multipart messages, which are a mix of \r\n-terminated lines and binary data. Please note a simple non-standard Lua enhancement: I use the number variable lua.maxread to set the chunk size used for the file reads (read(<number of bytes>)) at a central point. Replace this variable with your preferred chunk size, e.g. 2^13 for 8 KB.

-- a simple example without using all the specials:
-- read 'File' piece by piece, using '\n' as the delimiter

-- fail early with io.open's own message if the file cannot be opened
local Handle = assert(io.open('File','r'))

local ReadUntil = io.readuntil(Handle)

repeat

   -- coroutine.resume returns the status first and the yielded values
   -- after it, so the line is the SECOND result, not the first
   local Ok, Line = coroutine.resume(ReadUntil,'\n',true)

   -- on failure the second result is the error message
   if not Ok then error(Line) end

   if Line then

      -- process Line here

   end

-- Line is nil once the coroutine has finished (end of file)
until Line == nil

Handle:close()

-- another example: fetch the data between two marker strings

-- fail early with io.open's own message if the file cannot be opened
local Handle = assert(io.open('File', 'r'))

local ReadUntil = io.readuntil(Handle)

local Ok, Chunk, Found

repeat

   -- Collect = false: skip through the file chunk by chunk without
   -- accumulating the data in front of the first marker
   Ok, Chunk, Found = coroutine.resume(ReadUntil,
                   'search this string in a huge file',false)

   -- on failure Chunk holds the error message; stop instead of
   -- resuming a dead coroutine over and over
   if not Ok then error(Chunk) end

   if Found then

      -- Collect = true: gather everything up to the second marker
      Ok, Chunk, Found = coroutine.resume(ReadUntil,
                      'search another string in the same file',true)

      if not Ok then error(Chunk) end

      if Found then break end

   end

until Chunk == nil

Handle:close()

-- Now if Chunk ~= nil, then Chunk is the stuff between
-- 'search this string in a huge file' and 'search another
-- string in the same file'. Yes, it's possible to do the same
-- very simply, but the advantage here is that the large file
-- isn't loaded at once into memory.

Code:

function io.readuntil(Filehandle, Delimiter, Collect, Limit)

-- Creates a coroutine that reads <Filehandle> in fixed-size chunks and
-- yields the data in front of each occurrence of <Delimiter>.
--
-- Filehandle (userdata, or any object with a :read(bytes) method)
-- Delimiter (string, optional); must not be empty, max. length is the
--   chunk size; optional here because coroutine.resume() also accepts
--   a delimiter
-- Collect (boolean, optional)
--   = true (default); read until <Delimiter> is found or end of file
--     or <Limit> is reached and return the string at once
--   = false; return pieces of the string also before <Delimiter> is
--     found or end of file or <Limit> is reached
-- Limit (number, optional); number of bytes to read from <Filehandle>;
--   default is unlimited
--
-- The optional arguments may be shifted: a boolean in the <Delimiter>
-- position is taken as <Collect>, a number in the <Delimiter> or
-- <Collect> position is taken as <Limit>.
--
-- The chunk size is lua.maxread if the global table <lua> provides it,
-- otherwise 8192 bytes.
--
-- Usage: Ok, String, Found = coroutine.resume(Thread, Delimiter, Collect)
--
-- Thread (thread); returned from io.readuntil()
-- Delimiter (string, optional); replaces the current delimiter
-- Collect (boolean, optional); replaces the current collect mode
--   (a boolean may also be passed in the <Delimiter> position)
--
-- Ok (boolean) = true; no error
--              = false; an error occurred and the second value
--                       returned is the error message
-- String (string or nil) = nil; end of file
-- Found (boolean) = true; delimiter found
--                 = false; delimiter not found
--
-- note: if the coroutine returns true,<string>,false then
--          if <Collect> = false it does not have to be the end of file
--                       = true  the end of file is reached and the next
--                               coroutine.resume returns true,nil(,nil)

   -- sort out shifted optional arguments
   if type(Delimiter) == 'boolean' then
      Collect,Delimiter = Delimiter,Collect
   end
   if type(Delimiter) == 'number' then
      Limit,Delimiter = Delimiter,nil
   end
   if type(Collect) == 'number' then
      Limit,Collect = Collect,nil
   end

   -- documented default
   if Collect == nil then
      Collect = true
   end

   return coroutine.create(function(NewDelimiter,NewCollect)

      -- Merge the arguments of a coroutine.resume() call with the
      -- current settings. Collect is tested against nil (not with
      -- "or") so that an explicit false can switch collecting off.
      local function Next(NewDelimiter,NewCollect)
         if type(NewDelimiter) == 'boolean' then
            NewCollect,NewDelimiter = NewDelimiter,nil
         end
         if NewCollect == nil then
            NewCollect = Collect
         end
         return NewDelimiter or Delimiter,NewCollect
      end

      Delimiter,Collect = Next(NewDelimiter,NewCollect)

      -- lua.maxread is a non-standard setting; fall back to 8 KB
      local Chunksize = (type(lua) == 'table' and lua.maxread) or 8192
      local Length = 0      -- number of bytes read so far
      local SearchFrom = 1  -- where to continue searching in Current
      local GetFrom = 1     -- first byte of Current not yet returned
      local FoundFrom,FoundTo

      -- Read the next chunk; never read beyond <Limit>. Returns a
      -- false value at end of file or when the limit is reached.
      local function ReadChunk()
         if Limit and Length+Chunksize > Limit then
            return Limit-Length > 0 and Filehandle:read(Limit-Length)
         end
         return Filehandle:read(Chunksize)
      end

      local Current = ReadChunk()
      if not Current then
         return
      end
      Length = Length + string.len(Current)

      while true do
         -- checked on every pass because each resume may change it
         if Delimiter == nil or Delimiter == '' then
            -- an empty delimiter would match everywhere, forever
            error('io.readuntil: delimiter missing or empty')
         end
         if string.len(Delimiter) > Chunksize then
            error('io.readuntil: delimiter to long')
         end

         FoundFrom,FoundTo = string.find(Current,Delimiter,SearchFrom,true)
         if FoundFrom then
            -- delimiter found inside the current chunk
            Delimiter,Collect = Next(coroutine.yield(
                 string.sub(Current,GetFrom,FoundFrom-1),true))
            SearchFrom,GetFrom = FoundTo+1,FoundTo+1
         else
            local Following = ReadChunk()
            if not Following then
               -- no delimiter found and no further input
               break
            end
            Length = Length + string.len(Following)

            -- Join the end of the current chunk with the start of the
            -- following chunk so that a delimiter split between the
            -- two is found. The tail never starts before GetFrom:
            -- bytes in front of it were already returned or belong to
            -- a delimiter matched earlier and must not match again.
            -- This also keeps the start index positive.
            local DelimiterLength = string.len(Delimiter)
            local CurrentLength = string.len(Current)
            local TailFrom = math.max(CurrentLength-DelimiterLength+2,GetFrom)
            local TailLength = CurrentLength-TailFrom+1

            FoundFrom = string.find(
               string.sub(Current,TailFrom) ..
                  string.sub(Following,1,DelimiterLength-1),
               Delimiter,1,true)

            if FoundFrom then
               -- delimiter is split between current and following chunk;
               -- TailFrom+FoundFrom-2 is the last byte in front of it
               Delimiter,Collect = Next(coroutine.yield(
                  string.sub(Current,GetFrom,TailFrom+FoundFrom-2),true))
               Current = Following
               -- first byte of the following chunk behind the delimiter
               SearchFrom = FoundFrom+DelimiterLength-TailLength
               GetFrom = SearchFrom
            elseif Collect then
               -- keep everything; only the appended part is unsearched
               SearchFrom = CurrentLength+1
               Current = Current..Following
            else
               -- hand out what we have and go on with the next chunk
               if CurrentLength >= GetFrom then
                  Delimiter,Collect = Next(coroutine.yield(
                     string.sub(Current,GetFrom),false))
               end
               Current = Following
               SearchFrom,GetFrom = 1,1
            end
         end
      end

      if string.len(Current) >= GetFrom then
         -- return rest of the current chunk
         coroutine.yield(string.sub(Current,GetFrom),false)
      end

   end)
   -- return (thread); a coroutine

   end

-- MarkusHuber


RecentChanges · preferences
edit · history
Last edited May 28, 2007 10:01 pm GMT (diff)