判斷字串是否為utf8編碼

判斷字串是否為utf8編碼

本文參考「自动辨别文本是不是utf-8的c#程序」一文，將其中的 C# 程式改寫為 C 程式碼。請注意：若字串全部為 ASCII 字元（沒有任何多位元組序列），此函式會回傳 FALSE。


/*
 * IsUTF8 - heuristically decide whether a NUL-terminated byte string is
 * UTF-8 encoded text.
 *
 * Well-formed multi-byte sequences (RFC 3629):
 *   0000 0080-0000 07FF - 110xxxxx 10xxxxxx                    (2 octets)
 *   0000 0800-0000 FFFF - 1110xxxx 10xxxxxx 10xxxxxx           (3 octets)
 *   0001 0000-0010 FFFF - 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx  (4 octets)
 *
 * Parameters:
 *   str - NUL-terminated byte string to examine; NULL is tolerated.
 *
 * Returns:
 *   TRUE  if the string contains at least one multi-byte sequence and every
 *         multi-byte sequence in it is well-formed UTF-8.
 *   FALSE if str is NULL, if any sequence is malformed or truncated, or if
 *         the string is empty or consists only of ASCII bytes (such text
 *         gives no evidence of being UTF-8, so the detector says "no").
 *
 * Rejected as malformed: stray continuation bytes, lead bytes 0xC0/0xC1
 * (overlong 2-octet forms), lead bytes 0xF5-0xFF (beyond U+10FFFF or not
 * UTF-8 at all), overlong 3/4-octet forms, and UTF-16 surrogates
 * (U+D800-U+DFFF).
 */
BOOL IsUTF8(const char *str)
{
	const BYTE *p = (const BYTE *)str;
	BOOL bAllAscii = TRUE;

	if (!str)
		return FALSE;

	while (*p != 0)
	{
		BYTE lead = *p++;
		BYTE lo = 0x80;   /* allowed range of the first continuation byte */
		BYTE hi = 0xBF;
		int  cOctets;     /* continuation bytes still expected */

		if (lead < 0x80)
			continue;     /* plain ASCII byte */

		bAllAscii = FALSE;

		if (lead >= 0xC2 && lead <= 0xDF)
		{
			cOctets = 1;
		}
		else if (lead >= 0xE0 && lead <= 0xEF)
		{
			cOctets = 2;
			if (lead == 0xE0)
				lo = 0xA0;    /* below this would be an overlong form */
			else if (lead == 0xED)
				hi = 0x9F;    /* above this would encode a surrogate */
		}
		else if (lead >= 0xF0 && lead <= 0xF4)
		{
			cOctets = 3;
			if (lead == 0xF0)
				lo = 0x90;    /* below this would be an overlong form */
			else if (lead == 0xF4)
				hi = 0x8F;    /* above this would exceed U+10FFFF */
		}
		else
		{
			/* 0x80-0xBF: continuation byte without a lead;
			 * 0xC0, 0xC1, 0xF5-0xFF: never valid in UTF-8. */
			return FALSE;
		}

		/* The terminating NUL (0x00) fails every range check below, so a
		 * truncated sequence is rejected without reading past the end. */
		if (*p < lo || *p > hi)
			return FALSE;
		p++;

		while (--cOctets > 0)
		{
			if ((*p & 0xC0) != 0x80)
				return FALSE;
			p++;
		}
	}

	if (bAllAscii)
		return FALSE;

	return TRUE;
}
Dotblogs 的標籤: ,