UTF-8 Encoding and Decoding
參考
CodeProject - UTF-8 Encoding and Decoding
改成 C 版本
#include <stdio.h>
#include <string.h>
// Encodes the NUL-terminated string szSource and writes a NUL-terminated
// result to szFinal: ASCII characters other than '=' are copied unchanged,
// while '=' and every UTF-8 byte produced for a non-ASCII character are
// written as "=XX" (two uppercase hex digits).
// No buffer size is passed, so the caller must make szFinal large enough.
void EncodeToUTF8(char * szSource, char *szFinal);
// Decodes text in the "=XX" escape format back into characters and writes a
// NUL-terminated result to szFinal; characters other than '=' are copied
// unchanged.
// No buffer size is passed, so the caller must make szFinal large enough.
void DecodeFromUTF8(char * szSource, char *szFinal);
// Demo entry point: encodes a fixed sample string with EncodeToUTF8 and
// prints the escaped text, then decodes that text with DecodeFromUTF8 and
// prints the result. The command-line arguments are not used.
// Always returns 0.
int main(int argc, char* argv[])
{
    char escapedText[256];   // receives the "=XX"-escaped form
    char restoredText[256];  // receives the decoded form

    EncodeToUTF8("123abc測試", escapedText);
    printf("Encode:%s\n", escapedText);

    DecodeFromUTF8(escapedText, restoredText);
    printf("Decode:%s\n", restoredText);

    return 0;
}
// Writes the three-character escape "=XX" (XX = uppercase hex value of
// `byte`) at `out` and returns the position just past the escape.
static char *AppendHexEscape(char *out, unsigned char byte)
{
    static const char hexDigits[] = "0123456789ABCDEF";

    *out++ = '=';
    *out++ = hexDigits[byte >> 4];
    *out++ = hexDigits[byte & 0x0F];
    return out;
}

// Encodes szSource into szFinal as "=XX"-escaped UTF-8.
//
// Each element of szSource is treated as one code point in the range
// 0..255 (its value as an unsigned char):
//   - '='            -> "=3D", so the escape marker itself stays unambiguous
//   - other 0..127   -> copied unchanged
//   - 128..255       -> the two-byte UTF-8 sequence, each byte written "=XX"
//
// szSource: NUL-terminated input string.
// szFinal:  receives the NUL-terminated result. No size is passed, so the
//           caller must provide room for up to 6 output characters per
//           input character plus the terminator.
void EncodeToUTF8(char * szSource, char *szFinal)
{
    const char *in;
    char *out = szFinal;

    for (in = szSource; *in != '\0'; ++in)
    {
        // Convert through unsigned char: where plain char is signed, a byte
        // >= 0x80 would otherwise sign-extend to 0xFF80..0xFFFF and be
        // encoded as a bogus three-byte sequence.
        unsigned int ch = (unsigned char)*in;

        if (ch == '=')
        {
            out = AppendHexEscape(out, (unsigned char)ch);
        }
        else if (ch < 128)
        {
            *out++ = (char)ch;
        }
        else
        {
            // 128..255 always fits the two-byte form 110xxxxx 10xxxxxx.
            // Longer forms cannot occur for a single char, so they are
            // not handled here.
            out = AppendHexEscape(out, (unsigned char)(192 + (ch / 64)));
            out = AppendHexEscape(out, (unsigned char)(128 + (ch % 64)));
        }
    }
    *out = '\0';
}
unsigned char MakeByte(char ch1, char ch2);

// Decodes "=XX"-escaped UTF-8 text from szSource into szFinal.
//
// Characters other than '=' are copied unchanged. A '=' starts an escape
// "=XX" whose two hex digits give one byte; that byte is read as a UTF-8
// lead byte and, for multi-byte sequences, the following "=XX" escapes
// supply the continuation bytes. Because the output is a char string, only
// the low 8 bits of each decoded code point are stored, and a decoded value
// of 0 is not written (it would terminate the string early).
//
// Malformed input:
//   - an escape or sequence that runs past the end of szSource stops the
//     decoding; what was decoded so far is kept
//   - an escape whose byte cannot start a sequence (0x80..0xBF, 0xFE, 0xFF)
//     is skipped as a whole
//
// szSource: NUL-terminated input string.
// szFinal:  receives the NUL-terminated result. No size is passed; the
//           output is never longer than the input, so strlen(szSource) + 1
//           bytes are sufficient.
void DecodeFromUTF8(char * szSource, char *szFinal)
{
    int n, nMax = (int)strlen(szSource);
    char *out = szFinal;

    for (n = 0; n < nMax; ++n)
    {
        unsigned char lead;
        unsigned long code;   // unsigned so large sequences cannot overflow
        int extra;            // number of continuation bytes after the lead
        int span;             // characters consumed after the leading '='
        int k;
        char decoded;

        if (szSource[n] != '=')
        {
            *out++ = szSource[n];
            continue;
        }
        if (n >= nMax - 2)
        {
            break; // escape is cut off by the end of the input
        }

        lead = MakeByte(szSource[n + 1], szSource[n + 2]);
        if (lead < 128)
        {
            extra = 0;
            code = lead;
        }
        else if (lead >= 192 && lead <= 223)
        {
            extra = 1;
            code = lead - 192u;
        }
        else if (lead >= 224 && lead <= 239)
        {
            extra = 2;
            code = lead - 224u;
        }
        else if (lead >= 240 && lead <= 247)
        {
            extra = 3;
            code = lead - 240u;
        }
        else if (lead >= 248 && lead <= 251)
        {
            extra = 4;
            code = lead - 248u;
        }
        else if (lead >= 252 && lead <= 253)
        {
            extra = 5;
            code = lead - 252u;
        }
        else
        {
            // Not a valid lead byte: drop the whole escape instead of
            // letting its hex digits leak into the output as text.
            n += 2;
            continue;
        }

        // Each continuation byte is another "=XX", i.e. 3 more characters.
        span = 2 + 3 * extra;
        if (n >= nMax - span)
        {
            break; // sequence is cut off by the end of the input
        }
        for (k = 1; k <= extra; ++k)
        {
            unsigned char cont = MakeByte(szSource[n + 3 * k + 1],
                                          szSource[n + 3 * k + 2]);
            code = code * 64u + cont - 128u;
        }

        decoded = (char)(unsigned char)(code & 0xFFu);
        if (decoded != '\0')
        {
            *out++ = decoded;
        }
        n += span; // the loop's ++n then moves past the last hex digit
    }
    *out = '\0';
}
// Returns the value 0..15 of one hexadecimal digit character. Both
// uppercase and lowercase letters are accepted; any other character
// yields 0.
static unsigned char HexNibble(char digit)
{
    if (digit >= '0' && digit <= '9')
    {
        return (unsigned char)(digit - '0');
    }
    if (digit >= 'A' && digit <= 'F')
    {
        return (unsigned char)(digit - 'A' + 10);
    }
    if (digit >= 'a' && digit <= 'f')
    {
        return (unsigned char)(digit - 'a' + 10);
    }
    return 0;
}

// Helper for decoding: combines two hexadecimal digit characters into the
// byte they spell.
//
// ch1: digit for the high four bits.
// ch2: digit for the low four bits.
// Returns the combined byte, e.g. MakeByte('C', '3') == 0xC3. A character
// that is not a hex digit contributes 0 for its half of the byte.
unsigned char MakeByte(char ch1, char ch2)
{
    return (unsigned char)((HexNibble(ch1) << 4) | HexNibble(ch2));
}