Lua Project List To Xml

lua-users home
wiki

This sample program reads the list of projects using Lua from the Lua website and outputs the list as a valid XML document. This program requires GNU wget to be installed on the system (it is available for major Unix and Windows systems, and it is installed by default in most Linux distributions).

WARNING: The program will read and interpret HTML. Future changes in the format of the list *WILL* break the parser.

#!/usr/bin/env lua



-- Download the project list with wget and load the whole page into `s`.
local fname = "uses.html"

-- os.execute returns the numeric exit status (0 on success) in Lua 5.0/5.1
-- and `true` on success in Lua 5.2+, so both forms are accepted here.
-- wget -O creates the output file even when the download fails, so the
-- io.open check below cannot detect a failed download on its own.
local status = os.execute("wget -q -O " .. fname .. " http://www.lua.org/uses.html")
if status ~= 0 and status ~= true then
  io.stderr:write("Error downloading 'http://www.lua.org/uses.html' with wget.\n")
  os.exit(1)
end

local fp = io.open(fname, "r")
if fp == nil then
  -- Report on stderr: stdout carries the generated XML document.
  io.stderr:write("Error opening file '" .. fname .. "'.\n")
  os.exit(1)
end

-- Read the entire file in one go; the parser below works on a single string.
local s = fp:read("*a")
fp:close()



-- Normalize the raw HTML so the extraction patterns further down can rely
-- on a single canonical spelling of each tag. Every entry is
-- { pattern, replacement } and the entries are applied strictly in order,
-- because later rules depend on the output of earlier ones.
local rewrites = {
  -- Collapse line breaks and strip optional spaces around tag delimiters.
  { "\n", " " },
  { " *< *", " <" },
  { " *> *", "> " },
  { "> *<", "><" },

  -- Lowercase everything from '<' up to the first space or '>'.
  { "(<[^ >]+)", string.lower },

  -- Drop images and whole script elements.
  { "<img[^>]*>", "" },
  { "<script[^>]*>.-</script>", "" },

  -- Lowercase anchors still carrying an uppercase HREF, then discard any
  -- attributes that precede href so every link starts with '<a href='.
  { "(<a[^>]*HREF *=)", string.lower },
  { "<a[^>]*href *= *", "<a href=" },
}

for _, rule in ipairs(rewrites) do
  -- gsub also returns the substitution count; the single-target assignment
  -- keeps only the rewritten string.
  s = s:gsub(rule[1], rule[2])
end





-- Walk the normalized page and print one <use> element per project entry.

-- string.gfind (Lua 5.0) was renamed to string.gmatch in 5.1 and removed
-- in 5.2; use whichever one this interpreter provides.
local gmatch = string.gmatch or string.gfind

print("<?xml version=\"1.0\" encoding=\"iso-8859-1\" ?>")
print("<luauses>")

for tmp in gmatch(s, "<h3>.-<hr>") do

  -- Current data format (without spaces and line-breaks):
  --     <h3>
  --       <a NAME="1" HREF="APPURL">APPNAM</a>
  --       <br><small><em>USER</em></small>
  --     </h3>
  --       DESCR [can have html here]
  --       <p> Contact: <a HREF="EMAIL">CONTACT</a>
  --     <hr>

  -- All per-entry fields are locals so that nothing leaks into the global
  -- environment and no value survives from the previous entry.
  local appnam, appurl

  -- Application name and URL come from the anchor inside the <h3> heading.
  local _, _, app = string.find(tmp, "<h3>(.-)</h3>")
  if app then
    app = string.gsub(app, "</?em>", "")
    app = string.gsub(app, "<br>", "")
    _, _, appurl, appnam = string.find(app, "<a href=\"([^\"> ]*)\"[^>]*>([^<]*)<")
    if appurl == nil then
      -- Anchor without an href (name-only anchor): keep just the text.
      _, _, appnam = string.find(app, "<a[^>]*>([^<]*)</a>")
    end
  end
  -- Fall back to empty strings so the concatenations below never see nil.
  appnam = appnam or ""
  appurl = appurl or ""

  -- User: text of the <small> element with any nested tags stripped.
  local _, _, user = string.find(tmp, "<small>(.-)</small>")
  if user then
    user = string.gsub(user, "</?.->", "")
    user = string.gsub(user, "&", "&amp;")
  else
    user = ""
  end

  -- Description is everything between </h3> and <hr>; an optional trailing
  -- "<p> Contact: ..." paragraph is split off into contact name and e-mail.
  local name, email = "", ""
  local _, _, desc = string.find(tmp, "</h3>(.-)<hr>")
  if desc then
    local _, _, cont = string.find(desc, "<p> *Contact: *(.*)")
    if cont then
      desc = string.gsub(desc, "<p> *Contact:(.*)", "")
      cont = string.gsub(cont, "<p> *Contact: *", "")
      local _, _, addr, who = string.find(cont, "<a href=\"([^ \"]+)\"[^>]*>([^<]+)<")
      if who == nil then
        -- No link in the contact paragraph: use its text as the name.
        name = cont
        email = ""
      else
        name = who
        email = string.gsub(addr, "mailto:/?/?", "")
      end
    end
    -- The description may contain HTML, so escape the XML special
    -- characters; '&' must go first or it would re-escape the entities
    -- produced by the other two substitutions.
    desc = string.gsub(desc, "&", "&amp;")
    desc = string.gsub(desc, "<", "&lt;")
    desc = string.gsub(desc, ">", "&gt;")
  else
    desc = ""
  end

  print(" <use>")
  print("  <app>" .. appnam .. "</app>")
  print("  <url>" .. appurl .. "</url>")
  print("  <user>" .. user .. "</user>")
  print("  <desc>" .. desc .. "</desc>")
  print("  <contact>" .. name .. "</contact>")
  print("  <email>" .. email .. "</email>")
  print(" </use>")
end

print("</luauses>")

-- AlexandreErwinIttner


RecentChanges · preferences
edit · history
Last edited May 28, 2007 9:29 pm GMT (diff)