Validate Unicode String

lua-users home
wiki

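This code handles UTF-8 encoded Unicode strings. It provides the following API, registered in the standard `string` table: `nextutf8`, `utf8len`, `seekutf8`, `utf8code` and `utf8char`.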

By convention, the UTF-8 BOM is given the code 0. Valid code ranges are: 0-0xD7FF, 0xE000-0x10FFFF.

The UTF-8 encoding

Unicode is a universal character set, widely used in XML documents.

The point of this is that Unicode characters are 32/64 bits long. With UTF-8, ASCII characters are stored as one byte; others may take from 2 to 7 bytes. See RFC 2279 [1].

Note: the above is not correct.

RFC-2279 has been obsoleted by RFC-3629 [2] to bring it into alignment with the Unicode Standard [3]. A reasonably fast standards-compliant pure Lua library can be found at [4]. (link broken)

StephaneArnold 2007-11-13 - I deleted the posted code that was not compliant with the latest UTF-8 standard. I have converted some functions of the 'pure Lua library' to C functions:

lua_utf8.c

/*==================================================================*/

/*			C program by sarnold@free.fr 2007, MIT license

			based on the work of Rici Lake rici@ricilake.net		*/

/*==================================================================*/



#include <memory.h>
#include <stdlib.h>
#include <string.h>

#include "lua.h"
#include "lauxlib.h"
#include "lualib.h"



/* Error message raised when a byte sequence is not well-formed UTF-8. */
#define INVALID_UTF8 "invalid utf-8 string"

/* True when the byte p points at is below 128 (7-bit ASCII, NUL included).
   The argument is parenthesised so that pointer expressions such as
   `s + n` are cast as a whole. */
#define POINTS_ASCII(p) (*((unsigned char*)(p)) < 128)

/* True when min <= x <= max.  x is evaluated twice, so it must not have
   side effects.  Every argument is parenthesised so that an expression
   argument cannot change how the comparison parses. */
#define RANGE(x, min, max) ((x)>=(min) && (x)<=(max))

/* True when x lies in 128..191, the range of UTF-8 continuation bytes. */
#define RANGE_SND(x) RANGE(x,128,191)

/* True when the three bytes starting at p are EF BB BF (the UTF-8 BOM).
   p must address unsigned bytes: a plain (signed) char can never compare
   equal to 0xEF. */
#define UTF8_BOM(p) ((p)[0] == 0xEF && (p)[1] == 0xBB && (p)[2] == 0xBF)

/* Returns the length in bytes (1 to 4) of the well-formed UTF-8 sequence
   that starts at str, or 0 when the bytes there do not form one.

   Bytes after the first are only inspected while the preceding ones are
   acceptable, so a NUL terminator stops the scan. */
int sarn_utf8_next(const unsigned char* str)
{
	const unsigned char lead = str[0];
	unsigned char lo = 128, hi = 191;	/* allowed range of the 2nd byte */

	/* single byte below 128 (a NUL byte counts too) */
	if (lead < 128)
		return 1;

	/* the byte order mark EF BB BF is recognised up front */
	if (UTF8_BOM(str))
		return 3;

	/* 128..193 and 245..255 can never start a sequence */
	if (lead < 194 || lead > 244)
		return 0;

	/* 194..223: two bytes */
	if (lead < 224)
		return RANGE_SND(str[1]) ? 2 : 0;

	/* 224..239: three bytes; 224 and 237 restrict the second byte */
	if (lead < 240) {
		if (lead == 224)
			lo = 160;
		else if (lead == 237)
			hi = 159;
		if (RANGE(str[1], lo, hi) && RANGE_SND(str[2]))
			return 3;
		return 0;
	}

	/* 240..244: four bytes; 240 and 244 restrict the second byte */
	if (lead == 240)
		lo = 144;
	else if (lead == 244)
		hi = 143;
	if (RANGE(str[1], lo, hi) && RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;

	return 0;
}



/* Moves str one byte backwards, or makes the enclosing function return 0
   once remain has been used up.  Expands to an if/else statement, so it
   has to be written as a statement of its own followed by ';'. */
#define BACK(str, remain) if (--remain == 0) return 0; else str--

/* Returns the length in bytes (1 to 4) of the well-formed UTF-8 sequence
   that ends immediately before str, or 0 when there is none.

   remain bounds how far the scan may go: at most remain-1 bytes in front
   of str are read.  The byte ranges accepted here are the same ones
   sarn_utf8_next accepts, so walking forwards and walking backwards agree
   on what a character is. */
int sarn_utf8_prev(unsigned char* str, int remain)
{
	/* one byte back: ASCII */
	BACK(str,remain);
	if (*str < 128)
		return 1;

	/* two bytes back: lead 194..223 followed by a continuation byte
	   (192/193 would be overlong forms, 224 starts a 3-byte sequence) */
	BACK(str,remain);
	if (RANGE(*str,194,223) && RANGE_SND(str[1]))
		return 2;

	/* three bytes back */
	BACK(str,remain);
	if (UTF8_BOM(str))
		return 3;
	if (RANGE(*str, 225, 239) && *str != 237
		&& RANGE_SND(str[1]) && RANGE_SND(str[2]))
		return 3;
	/* lead 224: second byte 160..191, anything lower is an overlong form */
	if (*str == 224 && RANGE(str[1],160,191) && RANGE_SND(str[2]))
		return 3;
	/* lead 237: second byte 128..159, anything higher encodes a surrogate
	   (U+D800..U+DFFF) */
	if (*str == 237 && RANGE(str[1],128,159) && RANGE_SND(str[2]))
		return 3;

	/* four bytes back */
	BACK(str,remain);
	if (RANGE(*str, 241, 243) && RANGE_SND(str[1])
		&& RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;
	if (*str == 240 && RANGE(str[1],144,191)
		&& RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;
	if (*str == 244 && RANGE(str[1],128,143)
		&& RANGE_SND(str[2]) && RANGE_SND(str[3]))
		return 4;

	/* nothing well-formed ends right before str */
	return 0;
}





/** Realign a byte position on a UTF-8 character boundary.

	Tries str, str-1, str-2 and str-3 in turn and returns the first
	offset (0 to 3) at which a well-formed sequence starts that also
	covers the byte at str.  Returns -1 if it fails.

	The function reads at str-i for every i <= index, so str has to
	point at least `index` bytes into its buffer.
	NOTE(review): this reading of `index` is inferred from the code;
	confirm against the intended callers.
 */
int sarn_utf8_realign(unsigned char* str, size_t index)
{
	size_t i;
	int clen;

	for (i = 0; i<4 && index>=i; i++) {
		clen = sarn_utf8_next(str-i);
		/* a sequence that ends before str (for example an ASCII byte in
		   front of a stray continuation byte) does not contain str */
		if (clen != 0 && (size_t)clen > i)
			return (int)i;
	}
	return -1;
}

		



/* Lua: string.nextutf8(s, pos) -> next_pos, char

   pos is a 1-based byte index.  Returns the index just past the character
   that starts at pos together with that character as a string, or a
   single nil when pos lies beyond the end of s (measured with strlen).
   Raises an error when pos is 0 or when no well-formed sequence starts
   at pos. */
int sarn_utf8_next_func(lua_State* L)
{
	const char *text = luaL_checkstring(L, 1);
	size_t index = luaL_checklong(L, 2);
	size_t width;

	/* past the end: nothing left to return */
	if (index > strlen(text)) {
		lua_pushnil(L);
		return 1;
	}

	if (index == 0)
		return luaL_error(L, "bad index value : 0");

	width = sarn_utf8_next((unsigned char *)text + index - 1);
	if (width == 0)
		return luaL_error(L, INVALID_UTF8);

	lua_pushnumber(L, index + width);
	/* the `width` bytes of a validated character inside the string are
	   never NUL, so pushing them by length yields the same string as
	   copying them into a NUL-terminated buffer first */
	lua_pushlstring(L, text + index - 1, width);
	return 2;
}



/* Lua: string.utf8len(s) -> number of characters

   Walks s up to its first NUL byte, counting one per UTF-8 sequence (a
   BOM counts as one).  Raises an error as soon as a byte sequence is not
   well-formed. */
int sarn_utf8_len_func(lua_State *L)
{
	unsigned char *cursor = (unsigned char*) luaL_checkstring(L, 1);
	size_t count;

	for (count = 0; *cursor != 0; count++) {
		/* ASCII bytes skip the full validation */
		int step = POINTS_ASCII(cursor) ? 1 : sarn_utf8_next(cursor);

		if (step == 0)
			return luaL_error(L, INVALID_UTF8);
		cursor += step;
	}

	lua_pushnumber(L, count);
	return 1;
}



/* Lua: string.seekutf8(s, pos, shift) -> byte index, or nil

   Starting at the 1-based byte index pos, moves `shift` characters
   forwards (shift > 0) or backwards (shift < 0) and returns the 1-based
   byte index reached.

   - shift == 0 returns pos unchanged, without validating it.
   - pos outside 1..strlen(s) raises "invalid index (arg #2)".
   - nil is returned when |shift| exceeds the byte length, when a
     malformed sequence is met, or when the move would leave the string.
   - Stepping forwards over a trailing ASCII byte may land exactly one
     past the end (strlen(s)+1); stepping over a trailing multi-byte
     character yields nil.  This asymmetry is kept from the original
     behaviour. */
int sarn_utf8_seek_func(lua_State *L)
{
	unsigned char* str;
	int pos, shift;
	int clen, len;

	str = (unsigned char*)luaL_checkstring(L, 1);
	pos = luaL_checklong(L, 2);
	shift = luaL_checklong(L, 3);
	len = (int)strlen((const char*)str);

	if (shift == 0) {
		lua_pushinteger(L, pos);
		return 1;
	}

	if (pos > len || pos < 1)
		return luaL_error(L, "invalid index (arg #2)");

	/* from here on pos is 0-based */
	pos--;

	/* more characters than bytes can never be satisfied */
	if (shift > len || shift < -len) {
		lua_pushnil(L);
		return 1;
	}

	if (shift < 0) {
		while ((shift++) != 0) {
			clen = sarn_utf8_prev(str+pos, pos+1);
			if (clen == 0 || pos+1 < clen) {
				lua_pushnil(L);
				return 1;
			}
			pos -= clen;
		}
	} else {
		while ((shift--) != 0) {
			/* The terminating NUL is below 128 and would otherwise be
			   stepped over like an ASCII character, walking pos past
			   the end of the buffer.  Stop before reading it. */
			if (pos >= len) {
				lua_pushnil(L);
				return 1;
			}
			if (POINTS_ASCII(str+pos)) {
				pos ++;
				continue;
			}
			clen = sarn_utf8_next(str+pos);
			if (clen == 0 || pos+clen >= len) {
				lua_pushnil(L);
				return 1;
			}
			pos += clen;
		}
	}

	/* back to a 1-based index */
	lua_pushinteger(L, pos+1);
	return 1;
}



/* Lua: string.utf8char(code) -> string

   Encodes one code point as UTF-8.  Code 0 yields the byte order mark
   EF BB BF.  Negative values, surrogates (0xD800..0xDFFF) and values
   above 0x10FFFF raise "invalid utf-8 code". */
int sarn_utf8_char_func(lua_State *L)
{
	unsigned char out[5] = {0, 0, 0, 0, 0};	/* up to 4 bytes + NUL */
	long int arg = luaL_checklong(L, 1);
	unsigned long int cp;

	if (arg < 0 || arg > 0x10FFFFL || (arg >= 0xD800 && arg <= 0xDFFF))
		return luaL_error(L, "invalid utf-8 code");

	cp = (unsigned long int)arg;

	if (cp == 0) {
		/* UTF8 BOM */
		lua_pushstring(L, "\xEF\xBB\xBF");
		return 1;
	}

	if (cp < 0x80) {
		/* 7 bits: one byte */
		out[0] = (unsigned char)cp;
	} else if (cp < 0x800) {
		/* 11 bits: two bytes */
		out[0] = (unsigned char)(0xC0 | (cp >> 6));
		out[1] = (unsigned char)(0x80 | (cp & 0x3F));
	} else if (cp < 0x10000UL) {
		/* 16 bits: three bytes */
		out[0] = (unsigned char)(0xE0 | (cp >> 12));
		out[1] = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
		out[2] = (unsigned char)(0x80 | (cp & 0x3F));
	} else {
		/* 21 bits: four bytes */
		out[0] = (unsigned char)(0xF0 | (cp >> 18));
		out[1] = (unsigned char)(0x80 | ((cp >> 12) & 0x3F));
		out[2] = (unsigned char)(0x80 | ((cp >> 6) & 0x3F));
		out[3] = (unsigned char)(0x80 | (cp & 0x3F));
	}

	lua_pushstring(L, (char*)out);
	return 1;
}

	

/* Lua: string.utf8code(char) -> integer

   Decodes a string holding exactly one UTF-8 character into its code
   point.  The byte order mark EF BB BF decodes to 0.  Raises an error
   when the string is not exactly one well-formed sequence. */
int sarn_utf8_code_func(lua_State *L)
{
	/* payload bits of the lead byte, indexed by sequence length - 1 */
	static const unsigned char lead_mask[] = {0x7F, 0x1F, 0x0F, 0x07};
	size_t nbytes, k;
	unsigned long int cp;
	unsigned char* seq = (unsigned char*)luaL_checklstring(L, 1, &nbytes);

	/* the whole argument has to be consumed by a single sequence */
	if (nbytes != sarn_utf8_next(seq))
		return luaL_error(L, INVALID_UTF8);

	if (UTF8_BOM(seq)) {
		lua_pushinteger(L, 0);
		return 1;
	}

	/* strip the length marker from the lead byte, then append 6 payload
	   bits from every following byte */
	cp = seq[0] & lead_mask[nbytes - 1];
	for (k = 1; k < nbytes; k++)
		cp = (cp << 6) | (seq[k] & 0x3F);

	lua_pushinteger(L, cp);
	return 1;
}

	

	

/* Entry point called by require 'libluautf8'.  Adds the UTF-8 functions
   to the global `string` table and returns no values to Lua. */
int luaopen_libluautf8 (lua_State *L)
{
	/* field name in `string` -> C implementation */
	static const struct {
		const char *name;
		lua_CFunction fn;
	} exports[] = {
		{"nextutf8", sarn_utf8_next_func},
		{"utf8len",  sarn_utf8_len_func},
		{"seekutf8", sarn_utf8_seek_func},
		{"utf8code", sarn_utf8_code_func},
		{"utf8char", sarn_utf8_char_func}
	};
	size_t k;

	lua_getglobal(L, "string");
	for (k = 0; k < sizeof(exports) / sizeof(exports[0]); k++) {
		lua_pushcfunction(L, exports[k].fn);
		lua_setfield(L, -2, exports[k].name);
	}
	return 0;
}



Makefile

# Builds libluautf8.so from lua_utf8.c.
#
# all, compile and lua_utf8 are command names, not files; declaring them
# phony keeps a stray file of the same name from masking them.
.PHONY: all compile lua_utf8 clean

all: compile

LUA_CFLAGS=-O2 -fpic
LUA_LDFLAGS=-O -shared -fpic

compile: lua_utf8

# Alias kept so that `make lua_utf8` still works.  The real work hangs
# off file targets, so nothing is rebuilt while the outputs are newer
# than lua_utf8.c.
lua_utf8: libluautf8.so

lua_utf8.o: lua_utf8.c
	$(CC) $(CFLAGS) $(LUA_CFLAGS) -c lua_utf8.c

libluautf8.so: lua_utf8.o
	$(CC) $(CFLAGS) $(LUA_LDFLAGS) -o libluautf8.so lua_utf8.o

clean:
	rm -f lua_utf8.o libluautf8.so

utf8.lua

-- Declare this chunk as a module named after the argument given to
-- require(); package.seeall lets the module body read existing globals.
module(...,package.seeall)

-- Load the C library; its luaopen_libluautf8 adds nextutf8, utf8len,
-- seekutf8, utf8code and utf8char to the standard `string` table.
require'libluautf8'



-- metatable attached to every unicode string object
local mt = {}
-- table holding the methods of unicode string objects
local unistr = {}

-- Wraps str in a new unicode string object; the raw value is kept in the
-- `value` field and defaults to '' when str is nil or false.
unistr.new = function(self, str)
	local obj = { value = str or '' }
	return setmetatable(obj, mt)
end



-- Field lookup for unicode string objects:
--   obj.length   -> number of characters, via string.utf8len
--   anything else -> the method of that name in unistr
-- __index only runs when the raw field is absent, so the wrapped value is
-- read with rawget: going through t.value here would call this function
-- again and never return if the field were ever missing.
mt.__index = function(t,key)
	if key == 'length' then return string.utf8len(rawget(t, 'value')) end
	if key == 'value' then return rawget(t, 'value') end
	return unistr[key]
end



-- Substring by character index, mirroring string.sub:
--   first, last  1-based character positions; negative values count from
--                the end; last defaults to -1 (the last character).
-- Returns a plain Lua string holding the characters first..last, or ''
-- when the range is empty.
--
-- The string is walked from its start with string.nextutf8, so the cost
-- grows with `last`; negative indices additionally need one pass of
-- string.utf8len.  Both functions raise an error on malformed UTF-8, and
-- so does this method.
function unistr:sub (first, last)
	local s = self.value
	if last == nil then last = -1 end

	-- negative indices can only be resolved once the character count is
	-- known, so it is computed only when one is present
	if first < 0 or last < 0 then
		local n = s:utf8len()
		if first < 0 then first = n + first + 1 end
		if last < 0 then last = n + last + 1 end
	end
	if first < 1 then first = 1 end
	if last < first then return '' end

	local pos = 1	-- byte index where character number idx starts
	local from	-- byte index where character number `first` starts
	for idx = 1, last do
		local nextpos = s:nextutf8(pos)
		-- nil: the string ended before `last` characters were seen
		if nextpos == nil then break end
		if idx == first then from = pos end
		pos = nextpos
	end

	-- `first` lies beyond the last character
	if from == nil then return '' end

	-- pos is the start of the character after the range, so pos-1 is the
	-- final byte of the last character kept
	return s:sub(from, pos - 1)
end

-- Returns the raw value behind a unicode string object.
-- Tables yield their `value` field; every other kind of value (string,
-- number, nil, boolean, ...) is handed back untouched instead of being
-- indexed, which would raise an error.
local u2s=function (str)
	if type(str) == 'table' then return str.value end
	return str
end

	

-- `..` on unicode string objects: both operands are reduced to their raw
-- values with u2s, joined, and the result is wrapped again with u().
mt.__concat = function(lhs, rhs)
	local joined = u2s(lhs) .. u2s(rhs)
	return u(joined)
end

-- The length of the encoded string cannot be provided through the
-- metatable, so the character count is offered as a len() method.
function unistr:len()
	local raw = self.value
	return raw:utf8len()
end

-- Iterator triple for a generic `for`: string.nextutf8 as the stepping
-- function, the raw value as its state, and the starting byte index
-- (pos, or 1 when pos is not given).
unistr.each = function(self, pos)
	local start = pos or 1
	return string.nextutf8, self.value, start
end

-- Global constructor `u`, meant to be written like a string prefix:
--     str = u"Hello"
-- It looks like Python's u"..." but is an ordinary Lua function call.
-- The object it returns carries the metatable, so concatenation and the
-- unistr methods can be used on it much like on a plain string.
function _G.u(str)
	return unistr:new(str)
end



-- Returns a wrapper around f that accepts unicode string objects.
-- Every argument is forwarded, in order and including nils; arguments
-- carrying the unicode string metatable are replaced by their raw value
-- first, everything else is passed through unchanged.  The wrapper
-- returns whatever f returns.
function unicodize(f)
	-- `unpack` moved to table.unpack in later Lua versions
	local unpack_args = unpack or table.unpack
	return function(...)
		-- select('#') keeps the true argument count even with nils
		local n = select('#', ...)
		local args = {...}
		for i = 1, n do
			local v = args[i]
			if getmetatable(v) == mt then
				args[i] = u2s(v)
			end
		end
		return f(unpack_args(args, 1, n))
	end
end

-- Replace the global print with its unicodize()d wrapper so that unicode
-- string objects can be printed directly.  This affects every caller of
-- print in the program, not only this module.
_G.print = unicodize(print)

-- hand the global constructor u back as the chunk's result

return _G.u



Test code

-- Smoke test for utf8.lua: compares unicode-aware results against the
-- plain string library on ASCII input and checks character counts.
-- Mismatches are printed; a silent run means every check passed.
require 'utf8'

local a = u'hello'
local b = "hello"

-- Prints "name[got|expected]" when the two values differ.  tostring() is
-- applied so that a nil or table result is reported instead of raising
-- an error while the message is being built.
local function assertEqual(name, got, expected)
	if got ~= expected then
		print(name.."["..tostring(got)..'|'..tostring(expected)..']')
	end
end

-- one-argument sub: must match string.sub for ASCII text
for i = 0,10 do
	assertEqual("sub1."..i, a:sub(i), b:sub(i))
end

-- two-argument sub, including end indices beyond the string
for i = 0,5 do
	for j = i,10 do
		assertEqual("sub2."..i.."-"..j, a:sub(i,j), b:sub(i,j))
	end
end

-- { text, expected number of characters }
local lentest = {{"h",1},{"",0},{"hel",3},{"hi Stéphane",11}}

for idx, val in ipairs(lentest) do
	local str = u(val[1])
	assertEqual("len1."..idx, str.length, val[2])
end

-- concatenation mixing plain strings and unicode strings
local firstName = u"Stéphane"
local lastName = u"Arnold"
print("hello "..firstName.." "..lastName)


RecentChanges · preferences
edit · history
Last edited May 22, 2009 7:24 pm GMT (diff)