UTF-8
UTF-8 is a variable-length character encoding for Unicode. UTF-8 uses 1 to 6 bytes to encode one UNICODE character. (If the UNICODE char is represented on 2 bytes, at most 3 bytes are needed; if the UNICODE char is represented on 4 bytes, up to 6 bytes may be needed.) 4 or 6 bytes to encode a single char may seem too much, but the UNICODE chars that need that many bytes are rarely used.
The transformation table for UTF-8 is presented below:
| UNICODE | UTF-8 |
| 00000000 - 0000007F | 0xxxxxxx |
| 00000080 - 000007FF | 110xxxxx 10xxxxxx |
| 00000800 - 0000FFFF | 1110xxxx 10xxxxxx 10xxxxxx |
| 00010000 - 001FFFFF | 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 00200000 - 03FFFFFF | 111110xx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
| 04000000 - 7FFFFFFF | 1111110x 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx |
The UNICODE characters that actually represent ASCII chars are encoded in a single byte, and the UTF-8 representation is actually the ASCII representation. All other UNICODE characters require at least 2 bytes. Each of these bytes starts with an escape sequence. The first byte has a unique sequence, composed of N bits set to 1 followed by one bit set to 0. The number N of leading 1 bits indicates the number of bytes on which the character is encoded. Every following (continuation) byte starts with the bits 10, as shown in the table above.
Byte Order Mark
The BOM is a character that indicates the endianness of a UNICODE text encoded in UTF-16 or UTF-32, and at the same time serves as a marker indicating that the text is encoded in UTF-8, UTF-16 or UTF-32 (UTF-8 itself is byte-order independent).
| Encoding | Representation |
| UTF-8 | EF BB BF |
| UTF-16 Big Endian | FE FF |
| UTF-16 Little Endian | FF FE |
| UTF-32 Big Endian | 00 00 FE FF |
| UTF-32 Little Endian | FF FE 00 00 |
UTF-8 C++ Encoding Sample
Here are four functions written in C++ that encode 2-byte and 4-byte UNICODE text to UTF-8 and decode it back from UTF-8.
#include <cstddef>
#include <vector>

// Bit patterns used by the UTF-8 transformation table.
//   MASKBITS   - the six payload bits carried by a continuation byte
//   MASKBYTE   - marker of a continuation byte (10xxxxxx)
//   MASKnBYTES - marker of the lead byte of an n-byte sequence
#define MASKBITS   0x3F
#define MASKBYTE   0x80
#define MASK2BYTES 0xC0
#define MASK3BYTES 0xE0
#define MASK4BYTES 0xF0
#define MASK5BYTES 0xF8
#define MASK6BYTES 0xFC

// NOTE(review): the original sample used `byte` without declaring it; an
// unsigned 8-bit type is assumed here. Redeclaring an identical typedef is
// legal C++, so this is harmless if the project already provides the same one.
typedef unsigned char byte;
typedef unsigned short Unicode2Bytes;
typedef unsigned int Unicode4Bytes;

// Value emitted in place of anything that cannot be encoded or decoded
// (U+FFFD REPLACEMENT CHARACTER).
static const Unicode4Bytes UTF8_REPLACEMENT_CHAR = 0xFFFD;

// Appends the UTF-8 encoding (1 to 6 bytes) of one code point to `output`.
// Values of 0x80000000 and above do not fit the 6-byte form and are encoded
// as U+FFFD instead of being silently dropped.
static void UTF8AppendCodePoint(Unicode4Bytes cp, std::vector< byte >& output)
{
    if (cp >= 0x80000000u)
    {
        cp = UTF8_REPLACEMENT_CHAR;
    }

    // 0xxxxxxx
    if (cp < 0x80)
    {
        output.push_back(static_cast< byte >(cp));
        return;
    }

    // Pick the lead-byte marker and the number of continuation bytes.
    byte lead;
    int extra;
    if (cp < 0x800)          { lead = MASK2BYTES; extra = 1; } // 110xxxxx
    else if (cp < 0x10000)   { lead = MASK3BYTES; extra = 2; } // 1110xxxx
    else if (cp < 0x200000)  { lead = MASK4BYTES; extra = 3; } // 11110xxx
    else if (cp < 0x4000000) { lead = MASK5BYTES; extra = 4; } // 111110xx
    else                     { lead = MASK6BYTES; extra = 5; } // 1111110x

    // The lead byte carries the most significant payload bits ...
    output.push_back(static_cast< byte >(lead | (cp >> (6 * extra))));

    // ... and each continuation byte (10xxxxxx) carries the next six bits.
    for (int shift = 6 * (extra - 1); shift >= 0; shift -= 6)
    {
        output.push_back(static_cast< byte >(MASKBYTE | ((cp >> shift) & MASKBITS)));
    }
}

// Decodes the sequence starting at input[pos] (requires pos < input.size()).
// On success stores the code point in `ch`, advances `pos` past the whole
// sequence and returns true. On a malformed or truncated sequence it advances
// `pos` by exactly one byte and returns false, so the caller always makes
// progress. Overlong encodings are accepted, not rejected.
static bool UTF8DecodeOne(const std::vector< byte >& input, std::size_t& pos, Unicode4Bytes& ch)
{
    const byte lead = input[pos];
    std::size_t length;
    Unicode4Bytes value;

    // Each test masks one bit more than the marker itself so that the
    // terminating 0 bit of the prefix is checked as well.
    if (lead < MASKBYTE)                          { length = 1; value = lead; }
    else if ((lead & MASK3BYTES) == MASK2BYTES)   { length = 2; value = lead & 0x1F; }
    else if ((lead & MASK4BYTES) == MASK3BYTES)   { length = 3; value = lead & 0x0F; }
    else if ((lead & MASK5BYTES) == MASK4BYTES)   { length = 4; value = lead & 0x07; }
    else if ((lead & MASK6BYTES) == MASK5BYTES)   { length = 5; value = lead & 0x03; }
    else if ((lead & 0xFE) == MASK6BYTES)         { length = 6; value = lead & 0x01; }
    else
    {
        // Stray continuation byte (10xxxxxx) or the invalid bytes 0xFE / 0xFF.
        ++pos;
        return false;
    }

    // Truncated sequence: not enough bytes left in the input.
    if (length > input.size() - pos)
    {
        ++pos;
        return false;
    }

    for (std::size_t k = 1; k < length; ++k)
    {
        const byte next = input[pos + k];
        if ((next & MASK2BYTES) != MASKBYTE)
        {
            // Expected a continuation byte (10xxxxxx) and found something else.
            ++pos;
            return false;
        }
        value = (value << 6) | (next & MASKBITS);
    }

    pos += length;
    ch = value;
    return true;
}

// Encodes 16-bit UNICODE units as UTF-8 (1 to 3 bytes each), appending to
// `output`. Each unit is encoded on its own; surrogate pairs are not combined.
void UTF8Encode2BytesUnicode(const std::vector< Unicode2Bytes >& input, std::vector< byte >& output)
{
    for (std::size_t i = 0; i < input.size(); ++i)
    {
        UTF8AppendCodePoint(input[i], output);
    }
}

// Decodes UTF-8 into 16-bit UNICODE units, appending to `output`.
// Malformed bytes and code points above 0xFFFF (which do not fit in
// Unicode2Bytes) each produce one U+FFFD.
void UTF8Decode2BytesUnicode(const std::vector< byte >& input, std::vector< Unicode2Bytes >& output)
{
    std::size_t pos = 0;
    while (pos < input.size())
    {
        Unicode4Bytes ch = UTF8_REPLACEMENT_CHAR;
        if (!UTF8DecodeOne(input, pos, ch) || ch > 0xFFFF)
        {
            ch = UTF8_REPLACEMENT_CHAR;
        }
        output.push_back(static_cast< Unicode2Bytes >(ch));
    }
}

// Encodes 32-bit UNICODE values as UTF-8 (1 to 6 bytes each), appending to
// `output`. Values of 0x80000000 and above are encoded as U+FFFD.
void UTF8Encode4BytesUnicode(const std::vector< Unicode4Bytes >& input, std::vector< byte >& output)
{
    for (std::size_t i = 0; i < input.size(); ++i)
    {
        UTF8AppendCodePoint(input[i], output);
    }
}

// Decodes UTF-8 (sequences of 1 to 6 bytes) into 32-bit UNICODE values,
// appending to `output`. Each malformed or truncated byte produces one U+FFFD.
void UTF8Decode4BytesUnicode(const std::vector< byte >& input, std::vector< Unicode4Bytes >& output)
{
    std::size_t pos = 0;
    while (pos < input.size())
    {
        Unicode4Bytes ch = UTF8_REPLACEMENT_CHAR;
        if (!UTF8DecodeOne(input, pos, ch))
        {
            ch = UTF8_REPLACEMENT_CHAR;
        }
        output.push_back(ch);
    }
}
contributed by
http://www.codeguru.com/cpp/misc/misc/multi-lingualsupport/article.php/c10451
1
cd /media
sudo mkdir data
2
command : blkid
to find UUID
3
sudo nano /etc/fstab
add
#/dev/sda7
/dev/sdc7 /media/t1 ext3 rw,nosuid,nodev,uhelper=hal
#/dev/sdb1
UUID=56EC61E5EC61BFBB /media/t2 ntfs rw,nosuid,nodev,allow_other,blksize=4096