Scite Using Unicode

lua-users home
wiki

The following are examples of writing and reading Unicode characters in a buffer, specifically dealing with UTF-8. If you can't see the Unicode characters in the first code snippet, try viewing using the UTF-8 character set.


Write Some Unicode Characters

-- -*- coding: utf-8 -*-

-- write some UTF-8 chars <khman@users.sf.net> 20061017 public domain

-- (see Markus Kuhn's UTF-8 and Unicode FAQ or RFC3629 for more info)

--- Open a new buffer, switch it to UTF-8 and append sample text to it.
-- Each sample is written twice: first from string literals that are already
-- UTF-8 encoded in this source file, then from the same bytes spelled out
-- as decimal escape sequences.
function UnicodeWriteSomething()
  -- byte values of the samples, in hex:
  --   áéïöü  C3 A1 C3 A9 C3 AF C3 B6 C3 BC
  --   中文    E4 B8 AD E6 96 87
  local samples = {
    -- literal UTF-8 text
    "áéïöü\n",
    "中文\n",
    -- the same text as escaped byte sequences
    "\195\161\195\169\195\175\195\182\195\188\n",
    "\228\184\173\230\150\135\n",
  }

  -- new buffer first, then mark it as UTF-8 before any text goes in
  scite.Open("")
  editor.CodePage = SC_CP_UTF8

  for _, text in ipairs(samples) do
    editor:AppendText(text)
  end
end


Read And Write Unicode Values

The following functions help with reading and writing UTF-8 characters in a buffer. They allow character sequences of up to 6 bytes in order to support UCS-4 ranges.

-- -*- coding: utf-8 -*-

-- return value of UTF-8 character <khman@users.sf.net> 20061017 public domain

-- (see Markus Kuhn's UTF-8 and Unicode FAQ or RFC3629 for more info)

--- Decode the UTF-8 sequence that starts at a buffer position.
-- Reads bytes through editor.CharAt and accepts sequences of 1 to 6 bytes.
-- @param pos buffer position of the first (lead) byte of the sequence
-- @return the decoded character value
-- @return the position of the last byte of the sequence
-- @return the number of bytes in the sequence
-- Raises an error if the lead byte or any following byte is out of range.
function FromUTF8(pos)
  -- math.mod is the Lua 5.0 name; it became math.fmod in Lua 5.1 and the old
  -- name is absent from later versions, so take whichever one exists
  local mod = math.fmod or math.mod

  -- fetch one byte as an unsigned value; negative results from CharAt are
  -- mapped back into 128..255
  local function charat(p)
    local v = editor.CharAt[p]
    if v < 0 then v = v + 256 end
    return v
  end

  local v, c, n = 0, charat(pos), 1

  -- the lead byte gives the sequence length and the top payload bits
  if c < 128 then
    v = c
  elseif c < 192 then
    error("Byte values between 0x80 to 0xBF cannot start a multibyte sequence")
  elseif c < 224 then v = mod(c, 32); n = 2
  elseif c < 240 then v = mod(c, 16); n = 3
  elseif c < 248 then v = mod(c,  8); n = 4
  elseif c < 252 then v = mod(c,  4); n = 5
  elseif c < 254 then v = mod(c,  2); n = 6
  else
    error("Byte values between 0xFE and 0xFF cannot start a multibyte sequence")
  end

  -- every following byte contributes its low 6 bits
  for i = 2, n do
    pos = pos + 1
    c = charat(pos)
    if c < 128 or c > 191 then
      error("Following bytes must have values between 0x80 and 0xBF")
    end
    v = v * 64 + mod(c, 64)
  end

  return v, pos, n
end



-- return UTF-8 sequence string <khman@users.sf.net> 20061017 public domain

-- (see Markus Kuhn's UTF-8 and Unicode FAQ or RFC3629 for more info)

--- Encode a character value as a UTF-8 byte sequence.
-- @param v character value; must be a non-negative integer
-- @return the UTF-8 sequence as a string
-- @return the number of bytes in the sequence
-- Raises an error for invalid arguments, for values above U+FFFF and for
-- values in the UTF-16 surrogate range (U+D800..U+DFFF).
function ToUTF8(v)
  -- math.mod is the Lua 5.0 name; it became math.fmod in Lua 5.1 and the old
  -- name is absent from later versions, so take whichever one exists
  local mod = math.fmod or math.mod
  local floor = math.floor

  -- fail with a clear message instead of an obscure string.char error
  if type(v) ~= "number" or v < 0 or v ~= floor(v) then
    error("ToUTF8 expects a non-negative integer character value")
  end

  -- delete this if your version of SciTE goes beyond UCS-2
  if v > 65535 then error("SciTE does not support codes above U+FFFF") end

  if v >= 55296 and v <= 57343 then
    error("failed to convert UTF-16 surrogate pairs to UTF-8")
  end

  -- n = sequence length, b = marker bits of the lead byte
  local n, s, b = 1, "", 0
  if     v >= 67108864 then n = 6; b = 252
  elseif v >=  2097152 then n = 5; b = 248
  elseif v >=    65536 then n = 4; b = 240
  elseif v >=     2048 then n = 3; b = 224
  elseif v >=      128 then n = 2; b = 192
  end

  -- peel off 6 bits at a time, building the following bytes right to left
  for i = 2, n do
    local c = mod(v, 64)
    v = floor(v / 64)
    s = string.char(c + 128)..s
  end

  -- whatever is left fits in the lead byte next to its marker bits
  s = string.char(v + b)..s
  return s, n
end



-- demonstrate use of FromUTF8() function: display the character code

-- value of the current character under the cursor in the output window

--- Print the character value found at the current editor position.
-- Only the first result of FromUTF8() is used; the position and byte count
-- it also returns are dropped.
function Demo_FromUTF8()
  local code = FromUTF8(editor.CurrentPos)
  print("Character code: "..code)
end



-- demonstrate use of ToUTF8() function: display two characters based

-- on the given unicode value

--- Append the characters U+4E2D and U+6587 to the current buffer.
function Demo_ToUTF8()
  -- No "0x" prefix here: with an explicit base, tonumber() rejects the prefix
  -- on newer Lua versions and returns nil.
  -- The extra parentheses keep only the string result of ToUTF8(), so its
  -- byte count is not passed on to AppendText.
  editor:AppendText((ToUTF8(tonumber("4E2D", 16))))
  editor:AppendText((ToUTF8(tonumber("6587", 16))))
end


Display Character Codes from U+0000 to U+FFFF

The following demo function displays a table of Unicode characters. It requires the ToUTF8() function from above.

-- -*- coding: utf-8 -*-

-- write out a UTF-16 table <khman@users.sf.net> 20061017 public domain

--- Write a table of the characters U+0000..U+FFFF into a new UTF-8 buffer.
-- Each row shows 32 characters, prefixed by the value of the first one in
-- decimal and hexadecimal. Values below 32 and values in the surrogate range
-- (U+D800..U+DFFF) are shown as "?". Requires the ToUTF8() function.
function UTF16Table()
  scite.Open("")
  editor.CodePage = SC_CP_UTF8

  editor:AppendText("-*- coding: utf-8 -*-\n")
  editor:AppendText("   Dec ( Hex ) : 0123456789ABCDEF0123456789ABCDEF\n")
  editor:AppendText("-------------------------------------------------\n")

  for p = 0, 65535, 32 do
    -- collect the pieces of one row and join them once; the hex value is
    -- zero-padded so that it reads 0x0020 rather than 0x  20
    local cells = { string.format("%6d (0x%04X): ", p, p) }
    for q = p, p + 31 do
      local cell
      if q < 32 or (q >= 55296 and q <= 57343) then
        cell = "?"
      else
        cell = ToUTF8(q)
      end
      cells[q - p + 2] = cell   -- slot 1 holds the row label
    end
    cells[34] = "\n"
    editor:AppendText(table.concat(cells))
  end
end

--KeinHongMan


RecentChanges · preferences
edit · history
Last edited October 17, 2006 5:26 pm GMT (diff)