UTF-8 Encoding and Decoding

UTF-8 Encoding and Decoding

參考

CodeProject - UTF-8 Encoding and Decoding

改成 C 版本


#include <stdio.h>
#include <string.h>

/* Escapes szSource into szFinal: '=' and every character whose value is not
 * below 128 are written as "=XX" upper-case hex groups; all other characters
 * are copied unchanged. No buffer size is passed, so the caller must supply
 * an szFinal that is large enough for the escaped text and its terminator. */
void EncodeToUTF8(char * szSource, char *szFinal);
/* Reverses the escaping: characters other than '=' are copied to szFinal and
 * "=XX" groups are turned back into single characters. As with the encoder,
 * szFinal must be large enough; no size is passed. */
void DecodeFromUTF8(char * szSource, char *szFinal);

/*
 * Demo driver: escapes a sample string, prints the escaped form, decodes it
 * again and prints the round-tripped result.
 */
int main(int argc, char* argv[])
{
    char szEscaped[256];   /* receives the "=XX"-escaped text */
    char szRestored[256];  /* receives the decoded text */

    EncodeToUTF8("123abc測試", szEscaped);
    printf("Encode:%s\n", szEscaped);

    DecodeFromUTF8(szEscaped, szRestored);
    printf("Decode:%s\n", szRestored);

    return 0;
}

/* Writes '=' followed by the two upper-case hex digits of bt at pOut and
 * returns the position just past the three characters written. */
static char *AppendEscapedByte(char *pOut, unsigned char bt)
{
    static const char szHexDigits[] = "0123456789ABCDEF";

    *pOut++ = '=';
    *pOut++ = szHexDigits[bt >> 4];
    *pOut++ = szHexDigits[bt & 0x0F];
    return pOut;
}

/*
 * EncodeToUTF8 - escape a NUL-terminated byte string as UTF-8 "=XX" groups.
 *
 * Each input byte is treated as a code point in the range 0..255:
 *   - '='            is written as its own "=XX" escape so it stays unambiguous;
 *   - values < 128   are copied unchanged;
 *   - values >= 128  are written as the two-byte UTF-8 sequence for that
 *                    code point, each byte as an "=XX" group.
 *
 * szSource  text to escape; NULL is treated as an empty string.
 * szFinal   receives the NUL-terminated result. No size is passed, so the
 *           caller must provide up to 6 * strlen(szSource) + 1 bytes.
 *           Nothing is written when szFinal is NULL.
 */
void EncodeToUTF8(char * szSource, char *szFinal)
{
    const unsigned char *pIn;
    char *pOut = szFinal;

    if (szFinal == NULL)
    {
        return;
    }

    if (szSource != NULL)
    {
        /* Read through unsigned char: plain char may be signed, and a
         * sign-extended byte would be mistaken for a code point >= 0xFF80. */
        for (pIn = (const unsigned char *)szSource; *pIn != '\0'; ++pIn)
        {
            unsigned char ch = *pIn;

            if (ch == '=')
            {
                pOut = AppendEscapedByte(pOut, ch);
            }
            else if (ch < 128)
            {
                *pOut++ = (char)ch;
            }
            else
            {
                /* 110xxxxx 10xxxxxx; a single byte never needs a longer form. */
                pOut = AppendEscapedByte(pOut, (unsigned char)(0xC0 | (ch >> 6)));
                pOut = AppendEscapedByte(pOut, (unsigned char)(0x80 | (ch & 0x3F)));
            }
        }
    }

    /* Terminate once at the end instead of re-scanning with strcat per char. */
    *pOut = '\0';
}

unsigned char MakeByte(char ch1, char ch2);

/*
 * DecodeFromUTF8 - turn "=XX"-escaped UTF-8 text back into single characters.
 *
 * Characters other than '=' are copied unchanged. An '=' introduces an
 * escaped UTF-8 sequence of one to six "=XX" groups; the length is taken
 * from the first (lead) byte. The decoded code point is truncated to one
 * char, because the output is a plain char string.
 *
 * Lenient handling, kept from the original implementation:
 *   - a sequence cut short by the end of the input stops decoding; what has
 *     been produced so far is returned;
 *   - a lead byte that cannot start a sequence (0x80..0xBF, 0xFE, 0xFF)
 *     drops the '=' and lets its two hex digits through as literal text;
 *   - the '=' separators of continuation groups are not checked;
 *   - a sequence that decodes to the NUL character produces no output.
 *
 * szSource  escaped text; NULL is treated as an empty string.
 * szFinal   receives the NUL-terminated result; it never needs more than
 *           strlen(szSource) + 1 bytes. Nothing is written when it is NULL.
 */
void DecodeFromUTF8(char * szSource, char *szFinal)
{
    int n, nMax;
    char *pOut = szFinal;

    if (szFinal == NULL)
    {
        return;
    }
    if (szSource == NULL)
    {
        szFinal[0] = '\0';
        return;
    }

    nMax = (int)strlen(szSource);

    for (n = 0; n < nMax; ++n)
    {
        unsigned char z;            /* lead byte of the escaped sequence */
        unsigned char btLeadBase;   /* marker bits to strip from the lead */
        unsigned long ulCode;       /* unsigned: no overflow UB on long forms */
        int nSeqLen;                /* number of "=XX" groups in the sequence */
        int nLast;                  /* offset of the last hex digit from n */
        int i;
        char chOut;

        if (szSource[n] != '=')
        {
            *pOut++ = szSource[n];
            continue;
        }

        if (n >= nMax - 2) break; /* truncated "=XX" group */
        z = MakeByte(szSource[n+1], szSource[n+2]);

        if (z < 128)
        {
            /* Single byte; 0x7F included (it used to be skipped by "< 127"). */
            nSeqLen = 1;
            btLeadBase = 0;
        }
        else if (z >= 192 && z <= 223)
        {
            nSeqLen = 2;
            btLeadBase = 192;
        }
        else if (z >= 224 && z <= 239)
        {
            nSeqLen = 3;
            btLeadBase = 224;
        }
        else if (z >= 240 && z <= 247)
        {
            nSeqLen = 4;
            btLeadBase = 240;
        }
        else if (z >= 248 && z <= 251)
        {
            nSeqLen = 5;
            btLeadBase = 248;
        }
        else if (z >= 252 && z <= 253)
        {
            nSeqLen = 6;
            btLeadBase = 252;
        }
        else
        {
            /* Not a valid lead byte: skip only the '='. */
            continue;
        }

        /* Groups are three characters each: "=XX". */
        nLast = 3 * nSeqLen - 1;
        if (n >= nMax - nLast) break; /* sequence runs past the end */

        ulCode = (unsigned long)(z - btLeadBase);
        for (i = 1; i < nSeqLen; ++i)
        {
            unsigned char btNext = MakeByte(szSource[n + 3*i + 1],
                                            szSource[n + 3*i + 2]);
            ulCode = ulCode * 64UL + btNext - 128UL;
        }

        /* Only the low 8 bits fit into the char output. */
        chOut = (char)(unsigned char)(ulCode & 0xFFUL);
        if (chOut != '\0')
        {
            *pOut++ = chOut;
        }

        n += nLast;
    }

    *pOut = '\0';
}

/* Returns the value (0..15) of one hexadecimal digit character. Only '0'-'9'
 * and upper-case 'A'-'F' are recognised; anything else counts as 0. */
static unsigned char HexDigitValue(char chDigit)
{
    if (chDigit >= '0' && chDigit <= '9')
    {
        return (unsigned char)(chDigit - '0');
    }
    if (chDigit >= 'A' && chDigit <= 'F')
    {
        return (unsigned char)(chDigit - 'A' + 10);
    }
    return 0;
}

/*
 * MakeByte - helper for decoding: combine two hex digit characters into a byte.
 *
 * ch1  high-order digit ('0'-'9', 'A'-'F')
 * ch2  low-order digit  ('0'-'9', 'A'-'F')
 *
 * Returns ch1's value in the upper four bits and ch2's value in the lower
 * four bits. A character that is not an upper-case hex digit contributes 0.
 */
unsigned char MakeByte(char ch1, char ch2)
{
    unsigned char btHigh = (unsigned char)(HexDigitValue(ch1) << 4);
    unsigned char btLow = HexDigitValue(ch2);

    return (unsigned char)(btHigh | btLow);
}

 

Dotblogs 的標籤: ,