Scite Unicode Input

lua-users home
wiki

This script replaces the hexadecimal unicode codepoint before the cursor with its utf-8 encoding. If the script is hooked up to the Ctrl+U keyboard combination, then typing 2200 followed by Ctrl+U replaces 2200 with ∀.

This script is also provided as a GPLv2 project on sourceforge: http://sourceforge.net/projects/emitunicodeinscite/

Enjoy!



-- DESCRIPTION:

-- 

-- This lua script adds utf8 unicode input, to the scite text editor.

-- 

-- The scite text editor should be set to use the UTF-8 encoding,

-- because this script inserts utf8 bytes into the text buffer of the

-- scite editor. Select File->Encoding->UTF-8 from the

-- menu bar of scite.

-- 

-- For example, typing 2200 followed by CTRL+U

-- replaces 2200 with ∀ (U+2200) in the scite editor.

-- 

-- ______________________________________________________________________________

-- 

-- INSTALL:

-- 

-- To have scite run this script each time you press Ctrl+U, add the following

-- lines to your ~/SciTEUser.properties file, where ~ is your home directory.

-- FILE ~/SciTEUser.properties:

--[[

ext.lua.startup.script=$(SciteUserHome)/emitUtf8UnicodeIntoTheSciteEditor.lua

command.name.12.*=Emit UTF8 Unicode

command.subsystem.12.*=3

command.12.*=emitUtf8UnicodeIntoTheSciteEditor

command.mode.12.*=savebefore:no

command.shortcut.12.*=Ctrl+U

--]]

-- ______________________________________________________________________________

-- THE LUA CODE:

-- 

-- Next is the definition of the lua function that is called by scite

-- when CTRL+U is pressed, to replace the hexadecimal unicode codepoint

-- with the utf8 encoding of that codepoint.

-- ______________________________________________________________________________





-- Encodes a codepoint from the one-byte utf8 range (0 .. 0x7f).
--
-- Only the low 7 bits of the argument are kept, so the result is always
-- a single byte below 0x80.
--
-- @param unicodeValue the unicode codepoint, expected in 0 .. 0x7f
--
-- @return a one-byte string holding the utf8 encoding of the codepoint
function case1UnicodeToUtf8(unicodeValue)
  return string.char(unicodeValue % 0x80)
end



-- ______________________________________________________________________________

-- Encodes a codepoint from the two-byte utf8 range (0x80 .. 0x7ff).
--
-- Layout: 110xxxxx 10yyyyyy, where yyyyyy are the low 6 bits of the
-- codepoint and xxxxx are the next 5 bits.
--
-- @param unicodeValue the unicode codepoint, expected in 0x80 .. 0x7ff
--
-- @return a two-byte string holding the utf8 encoding of the codepoint
function case2UnicodeToUtf8(unicodeValue)
  local lowSix = unicodeValue % 0x40
  local highFive = math.floor(unicodeValue / 0x40) % 0x20
  return string.char(0xc0 + highFive, 0x80 + lowSix)
end



-- ______________________________________________________________________________

-- Encodes a codepoint from the three-byte utf8 range (0x800 .. 0xffff).
--
-- Layout: 1110xxxx 10yyyyyy 10zzzzzz. The two continuation bytes are
-- peeled off from the least significant end, 6 bits at a time; the
-- remaining 4 bits go into the lead byte.
--
-- @param unicodeValue the unicode codepoint, expected in 0x800 .. 0xffff
--
-- @return a three-byte string holding the utf8 encoding of the codepoint
function case3UnicodeToUtf8(unicodeValue)
  local remaining = unicodeValue
  local continuation = {}
  -- Fill the continuation bytes last-to-first.
  for slot = 2, 1, -1 do
    continuation[slot] = 0x80 + (remaining % 0x40)
    remaining = math.floor(remaining / 0x40)
  end
  local lead = 0xe0 + (remaining % 0x10)
  return string.char(lead, continuation[1], continuation[2])
end



-- ______________________________________________________________________________

-- Encodes a codepoint from the four-byte utf8 range (0x10000 .. 0x10ffff).
--
-- Layout: 11110www 10xxxxxx 10yyyyyy 10zzzzzz. Each byte takes its bit
-- field straight from the codepoint, shifted right by 0, 6, 12 and 18 bits.
--
-- @param unicodeValue the unicode codepoint, expected in 0x10000 .. 0x10ffff
--
-- @return a four-byte string holding the utf8 encoding of the codepoint
function case4UnicodeToUtf8(unicodeValue)
  local floor = math.floor
  local last   = 0x80 + (unicodeValue % 0x40)
  local third  = 0x80 + (floor(unicodeValue / 0x40) % 0x40)
  local second = 0x80 + (floor(unicodeValue / 0x1000) % 0x40)
  local lead   = 0xf0 + (floor(unicodeValue / 0x40000) % 0x8)
  return string.char(lead, second, third, last)
end



-- ______________________________________________________________________________

-- Converts a unicode integer value into a utf8 string value.
--
-- The codepoint is dispatched, by range, to one of the caseNUnicodeToUtf8
-- helpers, which produce the 1, 2, 3 or 4 byte encoding:
--
--   0x0     .. 0x7f      -> case1UnicodeToUtf8 (1 byte)
--   0x80    .. 0x7ff     -> case2UnicodeToUtf8 (2 bytes)
--   0x800   .. 0xffff    -> case3UnicodeToUtf8 (3 bytes)
--   0x10000 .. 0x10ffff  -> case4UnicodeToUtf8 (4 bytes)
--
-- Values that cannot be encoded as well-formed utf8 yield nil: anything
-- that is not an integral number, anything outside 0 .. 0x10ffff, and the
-- surrogate codepoints 0xd800 .. 0xdfff.
--
-- @param unicodeValue the unicode integer value;
-- a unicode codepoint
--
-- @return the utf8 encoding of the unicode codepoint
-- provided by the unicodeValue input argument, or nil when the
-- value is not an encodable codepoint
function unicodeToUtf8(unicodeValue)
  local u = unicodeValue

  -- Reject non-numbers and fractional values (NaN fails the equality test).
  if type(u) ~= "number" or u ~= math.floor(u) then
    return nil
  end

  -- Outside the unicode codespace.
  if u < 0 or u > 0x10ffff then
    return nil
  end

  -- Surrogates are reserved for UTF-16 and have no valid utf8 encoding.
  if u >= 0xd800 and u <= 0xdfff then
    return nil
  end

  if u <= 0x7f then
    return case1UnicodeToUtf8(u)
  end
  if u <= 0x7ff then
    return case2UnicodeToUtf8(u)
  end
  if u <= 0xffff then
    return case3UnicodeToUtf8(u)
  end
  return case4UnicodeToUtf8(u)
end



-- ______________________________________________________________________________

-- Looks at the character stored at position i of the Scite editor buffer
-- and interprets it as a single hexadecimal digit.
--
-- The value read from editor.CharAt[i] is rejected when it lies outside
-- 0 .. 0xff (presumably negative values are bytes >= 0x80 reported as
-- signed chars -- confirm against the SciTE API). Otherwise the character
-- is parsed with tonumber in base 16.
--
-- @param i position in the Scite Editor
-- @return the numeric value (0 .. 15) of the hex digit at position i,
-- or nil when the character is not a hex digit
function peekHexdigit(i)
  local code = editor.CharAt[i]
  if (code < 0) or (code > 0xff) then
    return nil
  end
  -- tonumber yields nil for anything that is not a base-16 digit.
  return tonumber(string.char(code), 0x10)
end



-- ______________________________________________________________________________

-- Reads up to 5 characters immediately to the left of the cursor in the
-- Scite Editor, scanning right-to-left and stopping at the first
-- character that is not a hex digit.
-- The hex digits collected this way form a codepoint number, which is
-- converted to utf8 with unicodeToUtf8; the digits are then replaced in
-- the buffer by that utf8 sequence.
--
-- Note: because at most 5 digits are read, codepoints above 0xfffff
-- cannot be entered with this function.
--
-- @return true when a run of 1 to 5 hex digits existed to the left of
-- the cursor and was replaced with the utf8 encoding of the
-- number it represented
--
-- nil when the buffer is empty, the cursor is at the start of
-- the buffer, no hex digit precedes the cursor, or the number
-- could not be converted to utf8
function emitUtf8Unicode()
  local e = editor
  local n = e.TextLength
  local i = e.CurrentPos
  local maxlen = 5

  if (0 == n) or (1 > i) then
    return nil -- Nothing to the left of the cursor.
  end

  local u = 0         -- codepoint accumulated so far
  local thePower = 1  -- weight of the next (more significant) hex digit
  local count = 0     -- number of hex digits consumed

  while (count < maxlen) and (0 <= (i - count - 1)) do
    local hexDigit = peekHexdigit(i - count - 1)
    if nil == hexDigit then
      break -- first non-hex character ends the run
    end
    u = u + (thePower * hexDigit)
    thePower = 0x10 * thePower
    count = count + 1
  end

  if 0 == count then
    return nil -- Failure. No unicode
  end

  -- Keep the result local: a global named utf8 would overwrite the
  -- standard utf8 library on Lua 5.3 and later.
  local encoded = unicodeToUtf8(u)
  if nil == encoded then
    return nil -- Failure. Unicode to utf8 conversion failed.
  end

  e:SetSel(i - count, i)
  e:ReplaceSel(encoded)
  return true -- Success.
end



-- ______________________________________________________________________________

-- Entry point bound to the editor command: replaces the hex codepoint at
-- the left of the cursor with its utf8 encoding by calling
-- emitUtf8Unicode.
--
-- When emitUtf8Unicode does not report success, a failure message is
-- printed (in Scite, print writes to the Output pane).
--
function emitUtf8UnicodeIntoTheSciteEditor()
  if emitUtf8Unicode() then
    return
  end
  print("Failed to encode unicode into text editor.")
end



-- ______________________________________________________________________________

-- 

-- The following web pages were useful in writing the lua scite script.

-- 

-- http://lua-users.org/wiki/UsingLuaWithScite

-- http://www.scintilla.org/PaneAPI.html

-- http://www.lua.org/manual/5.1/manual.html#pdf-tonumber

-- https://en.wikipedia.org/wiki/UTF-8

-- 

-- http://lua-users.org/lists/lua-l/2007-08/msg00276.html

-- http://keplerproject.github.io/luadoc/ 




RecentChanges · preferences
edit · history
Last edited August 8, 2013 4:26 pm GMT (diff)