UTF-8

UTF-8 is a variable-length character encoding for Unicode. It uses 1 to 6 bytes to encode one Unicode character. (If the Unicode character is represented on 2 bytes, at most 3 bytes are needed; if it is represented on 4 bytes, up to 6 bytes may be needed.) Using 4 or 6 bytes to encode a single character may seem like too much, but the Unicode characters that need that many bytes are rarely used.

The transformation table for UTF-8 is presented below:

UNICODE UTF-8
00000000 - 0000007F 0xxxxxxx
00000080 - 000007FF 110xxxxx 10xxxxxx
00000800 - 0000FFFF 1110xxxx 10xxxxxx 10xxxxxx
00010000 - 001FFFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
00200000 - 03FFFFFF 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
04000000 - 7FFFFFFF 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx

The Unicode characters that represent ASCII characters are encoded in a single byte, and their UTF-8 representation is identical to their ASCII representation. All other Unicode characters require at least 2 bytes, and each of these bytes starts with a marker sequence. The first byte has a unique marker composed of N bits set to 1 followed by one bit set to 0, where N is the number of bytes on which the character is encoded. Every following byte starts with the bits 10.

Byte Order Mark

The BOM is a character that indicates the endianness of Unicode text encoded in UTF-16 or UTF-32. At the same time, it serves as a marker indicating that the text is encoded in UTF-8, UTF-16 or UTF-32 (UTF-8 itself is byte-order independent).

Encoding Representation
UTF-8 EF BB BF
UTF-16 Big Endian FE FF
UTF-16 Little Endian FF FE
UTF-32 Big Endian 00 00 FE FF
UTF-32 Little Endian FF FE 00 00

UTF-8 C++ Encoding Sample

Here are four functions written in C++ that encode 2-byte and 4-byte Unicode text to UTF-8 and decode it back from UTF-8.

// Bit patterns used by the UTF-8 encoder and decoder functions.
// MASKBITS selects the six payload bits of a continuation byte, MASKBYTE is
// the 10xxxxxx continuation marker, and MASKnBYTES is the lead-byte marker
// of an n-byte sequence.
#define         MASKBITS                0x3F
#define         MASKBYTE                0x80
#define         MASK2BYTES              0xC0
#define         MASK3BYTES              0xE0
#define         MASK4BYTES              0xF0
#define         MASK5BYTES              0xF8
#define         MASK6BYTES              0xFC

// Element types of the decoded text handled by the 2-byte and 4-byte
// variants of the functions.
typedef unsigned short   Unicode2Bytes;
typedef unsigned int     Unicode4Bytes;

void UTF8Encode2BytesUnicode(std::vector< Unicode2Bytes > input,
                             std::vector< byte >& output)
{
   for(int i=0; i < input.size(); i++)
   {
      // 0xxxxxxx
      if(input[i] < 0×80)
      {
         output.push_back((byte)input[i]);
      }
      // 110xxxxx 10xxxxxx
      else if(input[i] < 0×800)
      {
         output.push_back((byte)(MASK2BYTES | input[i] >> 6));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
      // 1110xxxx 10xxxxxx 10xxxxxx
      else if(input[i] < 0×10000)
      {
         output.push_back((byte)(MASK3BYTES | input[i] >> 12));
         output.push_back((byte)(MASKBYTE | input[i] >> 6 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
   }
}

void UTF8Decode2BytesUnicode(std::vector< byte > input,
                             std::vector< Unicode2Bytes >& output)
{
   for(int i=0; i < input.size();)
   {
      Unicode2Bytes ch;

      // 1110xxxx 10xxxxxx 10xxxxxx
      if((input[i] & MASK3BYTES) == MASK3BYTES)
      {
         ch = ((input[i] & 0×0F) << 12) | (
               (input[i+1] & MASKBITS) << 6)
              | (input[i+2] & MASKBITS);
         i += 3;
      }
      // 110xxxxx 10xxxxxx
      else if((input[i] & MASK2BYTES) == MASK2BYTES)
      {
         ch = ((input[i] & 0×1F) << 6) | (input[i+1] & MASKBITS);
         i += 2;
      }
      // 0xxxxxxx
      else if(input[i] < MASKBYTE)
      {
         ch = input[i];
         i += 1;
      }

      output.push_back(ch);
   }
}

void UTF8Encode4BytesUnicode(std::vector< Unicode4Bytes > input,
                             std::vector< byte >& output)
{
   for(int i=0; i < input.size(); i++)
   {
      // 0xxxxxxx
      if(input[i] < 0×80)
      {
         output.push_back((byte)input[i]);
      }
      // 110xxxxx 10xxxxxx
      else if(input[i] < 0×800)
      {
         output.push_back((byte)(MASK2BYTES | input[i] > 6));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
      // 1110xxxx 10xxxxxx 10xxxxxx
      else if(input[i] < 0×10000)
      {
         output.push_back((byte)(MASK3BYTES | input[i] >> 12));
         output.push_back((byte)(MASKBYTE | input[i] >> 6 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if(input[i] < 0×200000)
      {
         output.push_back((byte)(MASK4BYTES | input[i] >> 18));
         output.push_back((byte)(MASKBYTE | input[i] >> 12 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] >> 6 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
      // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if(input[i] < 0×4000000)
      {
         output.push_back((byte)(MASK5BYTES | input[i] >> 24));
         output.push_back((byte)(MASKBYTE | input[i] >> 18 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] >> 12 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] >> 6 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
      // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if(input[i] < 0×8000000)
      {
         output.push_back((byte)(MASK6BYTES | input[i] >> 30));
         output.push_back((byte)(MASKBYTE | input[i] >> 18 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] >> 12 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] >> 6 & MASKBITS));
         output.push_back((byte)(MASKBYTE | input[i] & MASKBITS));
      }
   }
}

void UTF8Decode4BytesUnicode(std::vector< byte > input,
                             std::vector< Unicode4Bytes >& output)
{
   for(int i=0; i < input.size();)
   {
      Unicode4Bytes ch;

      // 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      if((input[i] & MASK6BYTES) == MASK6BYTES)
      {
         ch = ((input[i] & 0×01) << 30) | ((input[i+1] & MASKBITS) << 24)
              | ((input[i+2] & MASKBITS) << 18) | ((input[i+3]
                        & MASKBITS) << 12)
              | ((input[i+4] & MASKBITS) << 6) | (input[i+5] & MASKBITS);
         i += 6;
      }
      // 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if((input[i] & MASK5BYTES) == MASK5BYTES)
      {
         ch = ((input[i] & 0×03) << 24) | ((input[i+1]
                & MASKBITS) << 18)
              | ((input[i+2] & MASKBITS) << 12) | ((input[i+3]
                  & MASKBITS) << 6)
              | (input[i+4] & MASKBITS);
         i += 5;
      }
      // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
      else if((input[i] & MASK4BYTES) == MASK4BYTES)
      {
         ch = ((input[i] & 0×07) << 18) | ((input[i+1]
                & MASKBITS) << 12)
              | ((input[i+2] & MASKBITS) << 6) | (input[i+3] & MASKBITS);
         i += 4;
      }
      // 1110xxxx 10xxxxxx 10xxxxxx
      else if((input[i] & MASK3BYTES) == MASK3BYTES)
      {
         ch = ((input[i] & 0×0F) << 12) | ((input[i+1] & MASKBITS) << 6)
              | (input[i+2] & MASKBITS);
         i += 3;
      }
      // 110xxxxx 10xxxxxx
      else if((input[i] & MASK2BYTES) == MASK2BYTES)
      {
         ch = ((input[i] & 0×1F) << 6) | (input[i+1] & MASKBITS);
         i += 2;
      }
      // 0xxxxxxx
      else if(input[i] < MASKBYTE)
      {
         ch = input[i];
         i += 1;
      }
      output.push_back(ch);
   }
}

contributed by

http://www.codeguru.com/cpp/misc/misc/multi-lingualsupport/article.php/c10451

1

cd /media

sudo mkdir data

2

command :    blkid

to find UUID

3

sudo nano /etc/fstab

add

#/dev/sda7
/dev/sdc7 /media/t1 ext3  rw,nosuid,nodev,uhelper=hal
#/dev/sdb1
UUID=56EC61E5EC61BFBB /media/t2 ntfs rw,nosuid,nodev,allow_other,blksize=4096

    
坐看云起时 based on WordPress, RSS and comments design by Gx3.