Validate Unicode String |
|
string.subutf8(string, start[,end])
substrings, UTF-8 aware
pos, char = string.nextutf8(string, orig_pos)
returns the char at orig_pos and the next char's position in pos.
for i, char in str:nextutf8(orig_pos)
iterates through the string, starting at orig_pos.
pos = string.seekutf8(string, orig_pos, n)
returns the byte position reached from orig_pos by moving N characters forward (or backwards, if N is negative).
char = string.utf8char(code)
returns the char the code of which is code.
code = string.utf8code(char)
returns the code of char (UTF-8 character).
len = string.utf8len(string)
returns the length of string in UTF-8 characters.
UTF-8 BOM has by convention a code of 0. Valid code ranges are: 0-0xD7FF, 0xE000-0x10FFFF.
Unicode is a universal character set, widely used in XML documents.
The point of this is that Unicode characters are 32/64 bits length. With UTF-8, ASCII characters are stored as one byte, others may take from 2 to 7 bytes in length. See RFC 2279 [1].
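Note: the above is not correct. Unicode code points range from 0 to 0x10FFFF (they fit in 21 bits), and UTF-8 as currently standardized encodes each of them in 1 to 4 bytes.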
RFC-2279 has been obsoleted by RFC-3629 [2] to bring it into alignment with the Unicode Standard [3]. A reasonably fast standards-compliant pure Lua library can be found at [4]. (link broken)
StephaneArnold 2007-11-13 - I deleted the posted code, which was not compliant with the latest UTF-8 standard. I have converted some functions of the 'pure Lua library' to C functions:
lua_utf8.c
/*==================================================================*/ /* C program by sarnold@free.fr 2007, MIT license based on the work of Rici Lake rici@ricilake.net */ /*==================================================================*/ #include <memory.h> #include "lua.h" #include "lauxlib.h" #include "lualib.h" #define INVALID_UTF8 "invalid utf-8 string" #define POINTS_ASCII(p) (*((unsigned char*)p) < 128) #define RANGE(x, min, max) ((x)>=min && (x)<=max) #define RANGE_SND(x) RANGE(x,128,191) #define UTF8_BOM(p) (p[0] == 0xEF && p[1] == 0xBB && p[2] == 0xBF) int sarn_utf8_next(const unsigned char* str) { if (*str < 128) return 1; if (UTF8_BOM(str)) return 3; if (*str < 194) return 0; if (*str > 244) return 0; if (*str < 224 && RANGE_SND(str[1])) return 2; if (RANGE(*str, 225, 239) && *str != 237 && RANGE_SND(str[1]) && RANGE_SND(str[2])) return 3; if (*str == 224 && RANGE(str[1],160,191) && RANGE_SND(str[2])) return 3; if (*str == 237 && RANGE(str[1],128,159) && RANGE_SND(str[2])) return 3; if (RANGE(*str, 241, 243) && RANGE_SND(str[1]) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 240 && RANGE(str[1],144,191) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 244 && RANGE(str[1],128,143) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; return 0; } #define BACK(str, remain) if (--remain == 0) return 0; else str-- int sarn_utf8_prev(unsigned char* str, int remain) { BACK(str,remain); if (*str < 128) return 1; BACK(str,remain); if (RANGE(*str,195,224) && RANGE_SND(str[1])) return 2; BACK(str,remain); if (UTF8_BOM(str)) return 3; if (RANGE(*str, 225, 239) && *str != 237 && RANGE_SND(str[1]) && RANGE_SND(str[2])) return 3; if (*str == 224 && RANGE(str[1],160,191) && RANGE_SND(str[2])) return 3; if (*str == 237 && RANGE(str[1],160,191) && RANGE_SND(str[2])) return 3; BACK(str,remain); if (RANGE(*str, 241, 243) && RANGE_SND(str[1]) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 240 && RANGE(str[1],144,191) 
&& RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; if (*str == 244 && RANGE(str[1],128,143) && RANGE_SND(str[2]) && RANGE_SND(str[3])) return 4; /* fail back */ return 0; } /** Realign index on an UTF-8 char boundary in str. Returns the offset (0 to 3) to be seeked backwards, or -1 if it fails. */ int sarn_utf8_realign(unsigned char* str, size_t index) { size_t size, i; for (i = 0; i<4 && index>=i;i++) { if (sarn_utf8_next(str-i)!=0) return i; } return -1; } int sarn_utf8_next_func(lua_State* L) { const char *str; size_t pos, clen; char utf8[5]; str = luaL_checkstring(L, 1); pos = luaL_checklong(L, 2); if (strlen(str)<pos) { lua_pushnil(L); return 1; } memset(utf8, '\0', sizeof(utf8)); if (pos == 0) return luaL_error(L, "bad index value : 0"); clen = sarn_utf8_next((unsigned char *)str+pos-1); if (!clen) return luaL_error(L, INVALID_UTF8); lua_pushnumber(L, pos+clen); strncpy(utf8, str+pos-1, clen); lua_pushstring(L, utf8); return 2; } int sarn_utf8_len_func(lua_State *L) { unsigned char *str; int l; size_t len = 0; str = (unsigned char*) luaL_checkstring(L, 1); while (*str) { if (POINTS_ASCII(str)) { str++; len++; continue; } l = sarn_utf8_next(str); if (!l) return luaL_error(L, INVALID_UTF8); len++; str+=l; } lua_pushnumber(L, len); return 1; } int sarn_utf8_seek_func(lua_State *L) { unsigned char* str; int pos, shift; int clen, len; str = (unsigned char*)luaL_checkstring(L, 1); pos = luaL_checklong(L, 2); shift = luaL_checklong(L, 3); len = strlen(str); if (shift == 0) { lua_pushinteger(L, pos); return 1; } if (pos > len || pos < 1) return luaL_error(L, "invalid index (arg #2)"); /* then, pos is 0-based */ pos--; if (abs(shift) > len) { /* out of range */ lua_pushnil(L); return 1; } if (shift < 0) { while ((shift++) != 0) { clen = sarn_utf8_prev(str+pos, pos+1); if (clen == 0 || pos+1 < clen) { lua_pushnil(L); return 1; } pos -= clen; } } else { while ((shift--) != 0) { if (POINTS_ASCII(str+pos)) { pos ++; continue; } clen = sarn_utf8_next(str+pos); if (clen 
== 0 || pos+clen >= len) { lua_pushnil(L); return 1; } pos += clen; } } lua_pushinteger(L, pos+1); return 1; } int sarn_utf8_char_func(lua_State *L) { unsigned char str[2]; long int i; unsigned long int code; unsigned char result[5]; i = luaL_checklong(L, 1); memset(result, '\0', sizeof(result)); code = i; if (i >= 0xD800 && i <= 0xDFFF) return luaL_error(L, "invalid utf-8 code"); if (i >= 0 && i < 0x110000UL) { if (code == 0) { /* UTF8 BOM */ lua_pushstring(L, "\xEF\xBB\xBF"); return 1; } if (code < 128) { result[0] = code; lua_pushstring(L, (char*)result); return 1; } str[0] = 0x80 + (code & 63); code = code >> 6; if (code < 32) { result[0] = 0xC0+code; result[1] = str[0]; lua_pushstring(L, (char*)result); return 1; } str[1] = code & 0x3f; code = code >> 6; if (code < 16 && (code != 13 || str[1] < 32)) { result[0] = 0xE0 + code; result[1] = str[1] + 0x80; result[2] = str[0]; lua_pushstring(L, (char*)result); return 1; } else if (code >= 16 && code < 0x110) { result[1] = 0x80 + (code & 0x3f); result[0] = 0xF0 + (code >> 6); result[2] = str[1] + 0x80; result[3] = str[0]; lua_pushstring(L, (char*) result); return 1; } } return luaL_error(L, "invalid utf-8 code"); } int sarn_utf8_code_func(lua_State *L) { unsigned char* str; size_t len, i; unsigned long int code; unsigned long int offset[] = {0, 0x3000, 0xE0000UL, 0x3C00000UL}; str = (unsigned char*)luaL_checklstring(L, 1, &len); if (len != sarn_utf8_next(str)) return luaL_error(L, INVALID_UTF8); if (UTF8_BOM(str)) { lua_pushinteger(L, 0); return 1; } code = str[0]; for (i = 1; i < len; i++) { code = (code << 6) + (str[i] & 63); } lua_pushinteger(L, code - offset[len-1]); return 1; } int luaopen_libluautf8 (lua_State *L) { lua_getglobal(L, "string"); lua_pushcfunction(L, sarn_utf8_next_func); lua_setfield(L, -2, "nextutf8"); lua_pushcfunction(L, sarn_utf8_len_func); lua_setfield(L, -2, "utf8len"); lua_pushcfunction(L, sarn_utf8_seek_func); lua_setfield(L, -2, "seekutf8"); lua_pushcfunction(L, sarn_utf8_code_func); 
lua_setfield(L, -2, "utf8code"); lua_pushcfunction(L, sarn_utf8_char_func); lua_setfield(L, -2, "utf8char"); return 0; }
Makefile
# Builds libluautf8.so, the shared library loaded by require'libluautf8'.
# CC and CFLAGS come from the environment / make defaults; add the Lua
# include path to CFLAGS if lua.h is not on the default search path.
LUA_CFLAGS=-O2 -fpic
LUA_LDFLAGS=-O -shared -fpic

.PHONY: all compile lua_utf8 clean

all: compile

compile: lua_utf8

# Kept as an alias so that "make lua_utf8" still works; the real work is
# attached to file targets so nothing is rebuilt when it is up to date.
lua_utf8: libluautf8.so

libluautf8.so: lua_utf8.o
	$(CC) $(CFLAGS) $(LUA_LDFLAGS) -o libluautf8.so lua_utf8.o

lua_utf8.o: lua_utf8.c
	$(CC) $(CFLAGS) $(LUA_CFLAGS) -c lua_utf8.c

clean:
	rm -f lua_utf8.o libluautf8.so
-- UTF-8 aware string wrapper built on the libluautf8 C library.
-- Loading this module:
--   * defines the global function u(str), which wraps a string in a
--     "unistr" object (fields: value, length; methods: sub, len, each);
--   * replaces the global print with a version that accepts unistr objects.
-- The module returns u.
module(...,package.seeall)
require'libluautf8'

local unpack = unpack or table.unpack

local mt = {}
local unistr = {}

-- Returns the plain string held by a unistr object; any other value
-- (string, number, nil, foreign table, ...) is returned untouched so that
-- wrapped functions keep working with arbitrary arguments.
local u2s = function (str)
  if getmetatable(str) == mt then
    return rawget(str, 'value')
  end
  return str
end

-- Creates a unistr object holding str (defaults to the empty string).
-- Passing an existing unistr copies its value instead of nesting it.
function unistr:new(str)
  return setmetatable({value = u2s(str or '')}, mt)
end

-- redirects methods to unistr; "length" is computed on demand
mt.__index = function(t, key)
  if key == 'length' then
    return string.utf8len(rawget(t, 'value'))
  end
  if key == 'value' then
    -- only reached when the field is missing; rawget avoids recursing
    return rawget(t, 'value')
  end
  return unistr[key]
end

-- tostring(u"...") yields the wrapped string
mt.__tostring = function(t)
  return rawget(t, 'value')
end

-- Byte position of the first byte of the k-th character of s.
-- Requires 1 <= k <= number of characters in s.
local function byteoffset(s, k)
  if k == 1 then
    return 1
  end
  return s:seekutf8(1, k - 1)
end

-- substrings, utf8 ready
-- Same contract as string.sub, but first and last count characters, not
-- bytes: negative indices count from the end, out-of-range indices are
-- clamped, and last defaults to -1. Returns a plain Lua string.
-- The end of the slice is the last byte of the last character, so
-- multi-byte characters are never cut in half.
-- The string is walked a few times (cost linear in its length), and an
-- error is raised if the value is not valid UTF-8.
function unistr:sub (first, last)
  local s = self.value
  local n = s:utf8len()
  last = last or -1
  if first < 0 then first = n + first + 1 end
  if first < 1 then first = 1 end
  if last < 0 then last = n + last + 1 end
  if last > n then last = n end
  if first > last then
    return ''
  end
  local i = byteoffset(s, first)
  local j
  if last == n then
    j = #s
  else
    -- one byte before the start of the following character
    j = byteoffset(s, last + 1) - 1
  end
  return s:sub(i, j)
end

-- unicode strings concat; either operand may be a plain string or number
function mt.__concat(a, b)
  return unistr:new(u2s(a)..u2s(b))
end

-- encoded string length with a metatable is not possible
-- so let's stick with a len() method
function unistr:len()
  return self.value:utf8len()
end

-- iterator: for nextpos, char in str:each() do ... end
function unistr:each(pos)
  return string.nextutf8, self.value, pos or 1
end

-- creates a global "u" function to be used like that:
-- str = u"Hello" (it feels Python-like but is really a Lua function)
-- then, thanks to the metatable mechanism, concatenation and other funcs
-- can be invoked as if it was a simple scalar of type string
_G.u = function(str)
  return unistr:new(str)
end

-- Wraps f so that every unistr argument is replaced by its plain string.
-- All arguments (including nils) are forwarded and all results returned.
function unicodize(f)
  return function(...)
    local n = select('#', ...)
    local args = {...}
    for i = 1, n do
      args[i] = u2s(args[i])
    end
    return f(unpack(args, 1, n))
  end
end

_G.print = unicodize(print)

-- return this function
return _G.u
Test code
-- Smoke test for the utf8 module: compares unistr:sub with string.sub on an
-- ASCII string, checks the "length" field, and prints a concatenation.
-- Only mismatches are printed.
require 'utf8'

a = u'hello'
b = "hello"

-- Prints "<label>[<got>|<want>]" when the two values differ.
function assertEqual(label, got, want)
  if got ~= want then
    print(label.."["..got..'|'..want..']')
  end
end

-- single-index substrings
for i = 0,10 do
  assertEqual("sub1."..i, a:sub(i), b:sub(i))
end

-- two-index substrings
for i = 0,5 do
  for j = i,10 do
    assertEqual("sub2."..i.."-"..j, a:sub(i,j), b:sub(i,j))
  end
end

-- character counts, including a non-ASCII string
lentest = {
  {"h",1},
  {"",0},
  {"hel",3},
  {"hi Stéphane",11},
}
for idx, val in ipairs(lentest) do
  str = u(val[1])
  assertEqual("len1."..idx, str.length, val[2])
end

firstName = u"Stéphane"
lastName = u"Arnold"
print("hello "..firstName.." "..lastName)