//
// SFML - Simple and Fast Multimedia Library
// Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
//
// This software is provided 'as-is', without any express or implied warranty.
// In no event will the authors be held liable for any damages arising from the use of this software.
//
// Permission is granted to anyone to use this software for any purpose,
// including commercial applications, and to alter it and redistribute it freely,
// subject to the following restrictions:
//
// 1. The origin of this software must not be misrepresented;
//    you must not claim that you wrote the original software.
//    If you use this software in a product, an acknowledgment
//    in the product documentation would be appreciated but is not required.
//
// 2. Altered source versions must be plainly marked as such,
//    and must not be misrepresented as being the original software.
//
// 3. This notice may not be removed or altered from any source distribution.
00022 // 00024 00025 00027 // References : 00028 // 00029 // http://www.unicode.org/ 00030 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c 00031 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h 00032 // http://people.w3.org/rishida/scripts/uniview/conversion 00033 // 00035 00036 00038 template <typename In> 00039 In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement) 00040 { 00041 // Some useful precomputed data 00042 static const int trailing[256] = 00043 { 00044 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00045 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00046 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00047 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00048 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00049 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 00050 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 00051 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5 00052 }; 00053 static const Uint32 offsets[6] = 00054 { 00055 0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080 00056 }; 00057 00058 // Decode the character 00059 int trailingBytes = trailing[static_cast<Uint8>(*begin)]; 00060 if (begin + trailingBytes < end) 00061 { 00062 output = 0; 00063 switch (trailingBytes) 00064 { 00065 case 5 : output += static_cast<Uint8>(*begin++); output <<= 6; 00066 case 4 : output += static_cast<Uint8>(*begin++); output <<= 6; 00067 case 3 : output += static_cast<Uint8>(*begin++); output <<= 6; 00068 case 2 : output += static_cast<Uint8>(*begin++); output <<= 6; 00069 case 1 : output += static_cast<Uint8>(*begin++); 
output <<= 6; 00070 case 0 : output += static_cast<Uint8>(*begin++); 00071 } 00072 output -= offsets[trailingBytes]; 00073 } 00074 else 00075 { 00076 // Incomplete character 00077 begin = end; 00078 output = replacement; 00079 } 00080 00081 return begin; 00082 } 00083 00084 00086 template <typename Out> 00087 Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement) 00088 { 00089 // Some useful precomputed data 00090 static const Uint8 firstBytes[7] = 00091 { 00092 0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC 00093 }; 00094 00095 // Encode the character 00096 if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF))) 00097 { 00098 // Invalid character 00099 if (replacement) 00100 *output++ = replacement; 00101 } 00102 else 00103 { 00104 // Valid character 00105 00106 // Get the number of bytes to write 00107 int bytesToWrite = 1; 00108 if (input < 0x80) bytesToWrite = 1; 00109 else if (input < 0x800) bytesToWrite = 2; 00110 else if (input < 0x10000) bytesToWrite = 3; 00111 else if (input <= 0x0010FFFF) bytesToWrite = 4; 00112 00113 // Extract the bytes to write 00114 Uint8 bytes[4]; 00115 switch (bytesToWrite) 00116 { 00117 case 4 : bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 00118 case 3 : bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 00119 case 2 : bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6; 00120 case 1 : bytes[0] = static_cast<Uint8> (input | firstBytes[bytesToWrite]); 00121 } 00122 00123 // Add them to the output 00124 const Uint8* currentByte = bytes; 00125 switch (bytesToWrite) 00126 { 00127 case 4 : *output++ = *currentByte++; 00128 case 3 : *output++ = *currentByte++; 00129 case 2 : *output++ = *currentByte++; 00130 case 1 : *output++ = *currentByte++; 00131 } 00132 } 00133 00134 return output; 00135 } 00136 00137 00139 template <typename In> 00140 In Utf<8>::Next(In begin, In end) 00141 { 00142 Uint32 codepoint; 00143 return Decode(begin, end, codepoint); 00144 } 00145 00146 
00148 template <typename In> 00149 std::size_t Utf<8>::Count(In begin, In end) 00150 { 00151 std::size_t length = 0; 00152 while (begin < end) 00153 { 00154 begin = Next(begin, end); 00155 ++length; 00156 } 00157 00158 return length; 00159 } 00160 00161 00163 template <typename In, typename Out> 00164 Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 00165 { 00166 while (begin < end) 00167 { 00168 Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); 00169 output = Encode(codepoint, output); 00170 } 00171 00172 return output; 00173 } 00174 00175 00177 template <typename In, typename Out> 00178 Out Utf<8>::FromWide(In begin, In end, Out output) 00179 { 00180 while (begin < end) 00181 { 00182 Uint32 codepoint = Utf<32>::DecodeWide(*begin++); 00183 output = Encode(codepoint, output); 00184 } 00185 00186 return output; 00187 } 00188 00189 00191 template <typename In, typename Out> 00192 Out Utf<8>::FromLatin1(In begin, In end, Out output) 00193 { 00194 // Latin-1 is directly compatible with Unicode encodings, 00195 // and can thus be treated as (a sub-range of) UTF-32 00196 while (begin < end) 00197 output = Encode(*begin++, output); 00198 00199 return output; 00200 } 00201 00202 00204 template <typename In, typename Out> 00205 Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 00206 { 00207 while (begin < end) 00208 { 00209 Uint32 codepoint; 00210 begin = Decode(begin, end, codepoint); 00211 output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); 00212 } 00213 00214 return output; 00215 } 00216 00217 00219 template <typename In, typename Out> 00220 Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement) 00221 { 00222 while (begin < end) 00223 { 00224 Uint32 codepoint; 00225 begin = Decode(begin, end, codepoint); 00226 output = Utf<32>::EncodeWide(codepoint, output, replacement); 00227 } 00228 00229 return output; 00230 } 00231 00232 00234 template <typename In, 
typename Out> 00235 Out Utf<8>::ToLatin1(In begin, In end, Out output, char replacement) 00236 { 00237 // Latin-1 is directly compatible with Unicode encodings, 00238 // and can thus be treated as (a sub-range of) UTF-32 00239 while (begin < end) 00240 { 00241 Uint32 codepoint; 00242 begin = Decode(begin, end, codepoint); 00243 *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement; 00244 } 00245 00246 return output; 00247 } 00248 00249 00251 template <typename In, typename Out> 00252 Out Utf<8>::ToUtf8(In begin, In end, Out output) 00253 { 00254 while (begin < end) 00255 *output++ = *begin++; 00256 00257 return output; 00258 } 00259 00260 00262 template <typename In, typename Out> 00263 Out Utf<8>::ToUtf16(In begin, In end, Out output) 00264 { 00265 while (begin < end) 00266 { 00267 Uint32 codepoint; 00268 begin = Decode(begin, end, codepoint); 00269 output = Utf<16>::Encode(codepoint, output); 00270 } 00271 00272 return output; 00273 } 00274 00275 00277 template <typename In, typename Out> 00278 Out Utf<8>::ToUtf32(In begin, In end, Out output) 00279 { 00280 while (begin < end) 00281 { 00282 Uint32 codepoint; 00283 begin = Decode(begin, end, codepoint); 00284 *output++ = codepoint; 00285 } 00286 00287 return output; 00288 } 00289 00290 00292 template <typename In> 00293 In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement) 00294 { 00295 Uint16 first = *begin++; 00296 00297 // If it's a surrogate pair, first convert to a single UTF-32 character 00298 if ((first >= 0xD800) && (first <= 0xDBFF)) 00299 { 00300 if (begin < end) 00301 { 00302 Uint32 second = *begin++; 00303 if ((second >= 0xDC00) && (second <= 0xDFFF)) 00304 { 00305 // The second element is valid: convert the two elements to a UTF-32 character 00306 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000); 00307 } 00308 else 00309 { 00310 // Invalid character 00311 output = replacement; 00312 } 00313 } 00314 else 00315 { 00316 // 
Invalid character 00317 begin = end; 00318 output = replacement; 00319 } 00320 } 00321 else 00322 { 00323 // We can make a direct copy 00324 output = first; 00325 } 00326 00327 return begin; 00328 } 00329 00330 00332 template <typename Out> 00333 Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement) 00334 { 00335 if (input < 0xFFFF) 00336 { 00337 // The character can be copied directly, we just need to check if it's in the valid range 00338 if ((input >= 0xD800) && (input <= 0xDFFF)) 00339 { 00340 // Invalid character (this range is reserved) 00341 if (replacement) 00342 *output++ = replacement; 00343 } 00344 else 00345 { 00346 // Valid character directly convertible to a single UTF-16 character 00347 *output++ = static_cast<Uint16>(input); 00348 } 00349 } 00350 else if (input > 0x0010FFFF) 00351 { 00352 // Invalid character (greater than the maximum unicode value) 00353 if (replacement) 00354 *output++ = replacement; 00355 } 00356 else 00357 { 00358 // The input character will be converted to two UTF-16 elements 00359 input -= 0x0010000; 00360 *output++ = static_cast<Uint16>((input >> 10) + 0xD800); 00361 *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00); 00362 } 00363 00364 return output; 00365 } 00366 00367 00369 template <typename In> 00370 In Utf<16>::Next(In begin, In end) 00371 { 00372 Uint32 codepoint; 00373 return Decode(begin, end, codepoint); 00374 } 00375 00376 00378 template <typename In> 00379 std::size_t Utf<16>::Count(In begin, In end) 00380 { 00381 std::size_t length = 0; 00382 while (begin < end) 00383 { 00384 begin = Next(begin, end); 00385 ++length; 00386 } 00387 00388 return length; 00389 } 00390 00391 00393 template <typename In, typename Out> 00394 Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 00395 { 00396 while (begin < end) 00397 { 00398 Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale); 00399 output = Encode(codepoint, output); 00400 } 00401 00402 return output; 00403 } 00404 
00405 00407 template <typename In, typename Out> 00408 Out Utf<16>::FromWide(In begin, In end, Out output) 00409 { 00410 while (begin < end) 00411 { 00412 Uint32 codepoint = Utf<32>::DecodeWide(*begin++); 00413 output = Encode(codepoint, output); 00414 } 00415 00416 return output; 00417 } 00418 00419 00421 template <typename In, typename Out> 00422 Out Utf<16>::FromLatin1(In begin, In end, Out output) 00423 { 00424 // Latin-1 is directly compatible with Unicode encodings, 00425 // and can thus be treated as (a sub-range of) UTF-32 00426 while (begin < end) 00427 *output++ = *begin++; 00428 00429 return output; 00430 } 00431 00432 00434 template <typename In, typename Out> 00435 Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 00436 { 00437 while (begin < end) 00438 { 00439 Uint32 codepoint; 00440 begin = Decode(begin, end, codepoint); 00441 output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale); 00442 } 00443 00444 return output; 00445 } 00446 00447 00449 template <typename In, typename Out> 00450 Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement) 00451 { 00452 while (begin < end) 00453 { 00454 Uint32 codepoint; 00455 begin = Decode(begin, end, codepoint); 00456 output = Utf<32>::EncodeWide(codepoint, output, replacement); 00457 } 00458 00459 return output; 00460 } 00461 00462 00464 template <typename In, typename Out> 00465 Out Utf<16>::ToLatin1(In begin, In end, Out output, char replacement) 00466 { 00467 // Latin-1 is directly compatible with Unicode encodings, 00468 // and can thus be treated as (a sub-range of) UTF-32 00469 while (begin < end) 00470 { 00471 *output++ = *begin < 256 ? 
static_cast<char>(*begin) : replacement; 00472 begin++; 00473 } 00474 00475 return output; 00476 } 00477 00478 00480 template <typename In, typename Out> 00481 Out Utf<16>::ToUtf8(In begin, In end, Out output) 00482 { 00483 while (begin < end) 00484 { 00485 Uint32 codepoint; 00486 begin = Decode(begin, end, codepoint); 00487 output = Utf<8>::Encode(codepoint, output); 00488 } 00489 00490 return output; 00491 } 00492 00493 00495 template <typename In, typename Out> 00496 Out Utf<16>::ToUtf16(In begin, In end, Out output) 00497 { 00498 while (begin < end) 00499 *output++ = *begin++; 00500 00501 return output; 00502 } 00503 00504 00506 template <typename In, typename Out> 00507 Out Utf<16>::ToUtf32(In begin, In end, Out output) 00508 { 00509 while (begin < end) 00510 { 00511 Uint32 codepoint; 00512 begin = Decode(begin, end, codepoint); 00513 *output++ = codepoint; 00514 } 00515 00516 return output; 00517 } 00518 00519 00521 template <typename In> 00522 In Utf<32>::Decode(In begin, In end, Uint32& output, Uint32) 00523 { 00524 output = *begin++; 00525 return begin; 00526 } 00527 00528 00530 template <typename Out> 00531 Out Utf<32>::Encode(Uint32 input, Out output, Uint32 replacement) 00532 { 00533 *output++ = input; 00534 return output; 00535 } 00536 00537 00539 template <typename In> 00540 In Utf<32>::Next(In begin, In end) 00541 { 00542 return ++begin; 00543 } 00544 00545 00547 template <typename In> 00548 std::size_t Utf<32>::Count(In begin, In end) 00549 { 00550 return begin - end; 00551 } 00552 00553 00555 template <typename In, typename Out> 00556 Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale) 00557 { 00558 while (begin < end) 00559 *output++ = DecodeAnsi(*begin++, locale); 00560 00561 return output; 00562 } 00563 00564 00566 template <typename In, typename Out> 00567 Out Utf<32>::FromWide(In begin, In end, Out output) 00568 { 00569 while (begin < end) 00570 *output++ = DecodeWide(*begin++); 00571 00572 return output; 00573 } 
00574 00575 00577 template <typename In, typename Out> 00578 Out Utf<32>::FromLatin1(In begin, In end, Out output) 00579 { 00580 // Latin-1 is directly compatible with Unicode encodings, 00581 // and can thus be treated as (a sub-range of) UTF-32 00582 while (begin < end) 00583 *output++ = *begin++; 00584 00585 return output; 00586 } 00587 00588 00590 template <typename In, typename Out> 00591 Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale) 00592 { 00593 while (begin < end) 00594 output = EncodeAnsi(*begin++, output, replacement, locale); 00595 00596 return output; 00597 } 00598 00599 00601 template <typename In, typename Out> 00602 Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement) 00603 { 00604 while (begin < end) 00605 output = EncodeWide(*begin++, output, replacement); 00606 00607 return output; 00608 } 00609 00610 00612 template <typename In, typename Out> 00613 Out Utf<32>::ToLatin1(In begin, In end, Out output, char replacement) 00614 { 00615 // Latin-1 is directly compatible with Unicode encodings, 00616 // and can thus be treated as (a sub-range of) UTF-32 00617 while (begin < end) 00618 { 00619 *output++ = *begin < 256 ? 
static_cast<char>(*begin) : replacement; 00620 begin++; 00621 } 00622 00623 return output; 00624 } 00625 00626 00628 template <typename In, typename Out> 00629 Out Utf<32>::ToUtf8(In begin, In end, Out output) 00630 { 00631 while (begin < end) 00632 output = Utf<8>::Encode(*begin++, output); 00633 00634 return output; 00635 } 00636 00638 template <typename In, typename Out> 00639 Out Utf<32>::ToUtf16(In begin, In end, Out output) 00640 { 00641 while (begin < end) 00642 output = Utf<16>::Encode(*begin++, output); 00643 00644 return output; 00645 } 00646 00647 00649 template <typename In, typename Out> 00650 Out Utf<32>::ToUtf32(In begin, In end, Out output) 00651 { 00652 while (begin < end) 00653 *output++ = *begin++; 00654 00655 return output; 00656 } 00657 00658 00660 template <typename In> 00661 Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale) 00662 { 00663 // On Windows, gcc's standard library (glibc++) has almost 00664 // no support for Unicode stuff. As a consequence, in this 00665 // context we can only use the default locale and ignore 00666 // the one passed as parameter. 00667 00668 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ 00669 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ 00670 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... 
and STLPort is not used on top of it */ 00671 00672 wchar_t character = 0; 00673 mbtowc(&character, &input, 1); 00674 return static_cast<Uint32>(character); 00675 00676 #else 00677 00678 // Get the facet of the locale which deals with character conversion 00679 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 00680 00681 // Use the facet to convert each character of the input string 00682 return static_cast<Uint32>(facet.widen(input)); 00683 00684 #endif 00685 } 00686 00687 00689 template <typename In> 00690 Uint32 Utf<32>::DecodeWide(In input) 00691 { 00692 // The encoding of wide characters is not well defined and is left to the system; 00693 // however we can safely assume that it is UCS-2 on Windows and 00694 // UCS-4 on Unix systems. 00695 // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4, 00696 // and UCS-4 *is* UTF-32). 00697 00698 return input; 00699 } 00700 00701 00703 template <typename Out> 00704 Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale) 00705 { 00706 // On Windows, gcc's standard library (glibc++) has almost 00707 // no support for Unicode stuff. As a consequence, in this 00708 // context we can only use the default locale and ignore 00709 // the one passed as parameter. 00710 00711 #if defined(SFML_SYSTEM_WINDOWS) && /* if Windows ... */ \ 00712 (defined(__GLIBCPP__) || defined (__GLIBCXX__)) && /* ... and standard library is glibc++ ... */ \ 00713 !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... 
and STLPort is not used on top of it */ 00714 00715 char character = 0; 00716 if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0) 00717 *output++ = character; 00718 else if (replacement) 00719 *output++ = replacement; 00720 00721 return output; 00722 00723 #else 00724 00725 // Get the facet of the locale which deals with character conversion 00726 const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale); 00727 00728 // Use the facet to convert each character of the input string 00729 *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement); 00730 00731 return output; 00732 00733 #endif 00734 } 00735 00736 00738 template <typename Out> 00739 Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement) 00740 { 00741 // The encoding of wide characters is not well defined and is left to the system; 00742 // however we can safely assume that it is UCS-2 on Windows and 00743 // UCS-4 on Unix systems. 00744 // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4). 00745 // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32). 00746 00747 switch (sizeof(wchar_t)) 00748 { 00749 case 4: 00750 { 00751 *output++ = static_cast<wchar_t>(codepoint); 00752 break; 00753 } 00754 00755 default: 00756 { 00757 if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF))) 00758 { 00759 *output++ = static_cast<wchar_t>(codepoint); 00760 } 00761 else if (replacement) 00762 { 00763 *output++ = replacement; 00764 } 00765 break; 00766 } 00767 } 00768 00769 return output; 00770 }
:: Copyright © 2007-2008 Laurent Gomila, all rights reserved :: Documentation generated by doxygen 1.5.2 ::