SFML logo
  • Main Page
  • Modules
  • Classes
  • Files
  • File List

Utf.inl

00001 
00002 //
00003 // SFML - Simple and Fast Multimedia Library
00004 // Copyright (C) 2007-2009 Laurent Gomila (laurent.gom@gmail.com)
00005 //
00006 // This software is provided 'as-is', without any express or implied warranty.
00007 // In no event will the authors be held liable for any damages arising from the use of this software.
00008 //
00009 // Permission is granted to anyone to use this software for any purpose,
00010 // including commercial applications, and to alter it and redistribute it freely,
00011 // subject to the following restrictions:
00012 //
00013 // 1. The origin of this software must not be misrepresented;
00014 //    you must not claim that you wrote the original software.
00015 //    If you use this software in a product, an acknowledgment
00016 //    in the product documentation would be appreciated but is not required.
00017 //
00018 // 2. Altered source versions must be plainly marked as such,
00019 //    and must not be misrepresented as being the original software.
00020 //
00021 // 3. This notice may not be removed or altered from any source distribution.
00022 //
00024 
00025 
00027 // References :
00028 //
00029 // http://www.unicode.org/
00030 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.c
00031 // http://www.unicode.org/Public/PROGRAMS/CVTUTF/ConvertUTF.h
00032 // http://people.w3.org/rishida/scripts/uniview/conversion
00033 //
00035 
00036 
00038 template <typename In>
00039 In Utf<8>::Decode(In begin, In end, Uint32& output, Uint32 replacement)
00040 {
00041     // Some useful precomputed data
00042     static const int trailing[256] =
00043     {
00044         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00045         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00046         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00047         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00048         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00049         0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
00050         1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
00051         2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 5, 5, 5, 5
00052     };
00053     static const Uint32 offsets[6] =
00054     {
00055         0x00000000, 0x00003080, 0x000E2080, 0x03C82080, 0xFA082080, 0x82082080
00056     };
00057 
00058     // Decode the character
00059     int trailingBytes = trailing[static_cast<Uint8>(*begin)];
00060     if (begin + trailingBytes < end)
00061     {
00062         output = 0;
00063         switch (trailingBytes)
00064         {
00065             case 5 : output += static_cast<Uint8>(*begin++); output <<= 6;
00066             case 4 : output += static_cast<Uint8>(*begin++); output <<= 6;
00067             case 3 : output += static_cast<Uint8>(*begin++); output <<= 6;
00068             case 2 : output += static_cast<Uint8>(*begin++); output <<= 6;
00069             case 1 : output += static_cast<Uint8>(*begin++); output <<= 6;
00070             case 0 : output += static_cast<Uint8>(*begin++);
00071         }
00072         output -= offsets[trailingBytes];
00073     }
00074     else
00075     {
00076         // Incomplete character
00077         begin = end;
00078         output = replacement;
00079     }
00080 
00081     return begin;
00082 }
00083 
00084 
00086 template <typename Out>
00087 Out Utf<8>::Encode(Uint32 input, Out output, Uint8 replacement)
00088 {
00089     // Some useful precomputed data
00090     static const Uint8 firstBytes[7] =
00091     {
00092         0x00, 0x00, 0xC0, 0xE0, 0xF0, 0xF8, 0xFC
00093     };
00094 
00095     // Encode the character
00096     if ((input > 0x0010FFFF) || ((input >= 0xD800) && (input <= 0xDBFF)))
00097     {
00098         // Invalid character
00099         if (replacement)
00100             *output++ = replacement;
00101     }
00102     else
00103     {
00104         // Valid character
00105 
00106         // Get the number of bytes to write
00107         int bytesToWrite = 1;
00108         if      (input <  0x80)       bytesToWrite = 1;
00109         else if (input <  0x800)      bytesToWrite = 2;
00110         else if (input <  0x10000)    bytesToWrite = 3;
00111         else if (input <= 0x0010FFFF) bytesToWrite = 4;
00112 
00113         // Extract the bytes to write
00114         Uint8 bytes[4];
00115         switch (bytesToWrite)
00116         {
00117             case 4 : bytes[3] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
00118             case 3 : bytes[2] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
00119             case 2 : bytes[1] = static_cast<Uint8>((input | 0x80) & 0xBF); input >>= 6;
00120             case 1 : bytes[0] = static_cast<Uint8> (input | firstBytes[bytesToWrite]);
00121         }
00122 
00123         // Add them to the output
00124         const Uint8* currentByte = bytes;
00125         switch (bytesToWrite)
00126         {
00127             case 4 : *output++ = *currentByte++;
00128             case 3 : *output++ = *currentByte++;
00129             case 2 : *output++ = *currentByte++;
00130             case 1 : *output++ = *currentByte++;
00131         }
00132     }
00133 
00134     return output;
00135 }
00136 
00137 
00139 template <typename In>
00140 In Utf<8>::Next(In begin, In end)
00141 {
00142     Uint32 codepoint;
00143     return Decode(begin, end, codepoint);
00144 }
00145 
00146 
00148 template <typename In>
00149 std::size_t Utf<8>::Count(In begin, In end)
00150 {
00151     std::size_t length = 0;
00152     while (begin < end)
00153     {
00154         begin = Next(begin, end);
00155         ++length;
00156     }
00157 
00158     return length;
00159 }
00160 
00161 
00163 template <typename In, typename Out>
00164 Out Utf<8>::FromAnsi(In begin, In end, Out output, const std::locale& locale)
00165 {
00166     while (begin < end)
00167     {
00168         Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale);
00169         output = Encode(codepoint, output);
00170     }
00171 
00172     return output;
00173 }
00174 
00175 
00177 template <typename In, typename Out>
00178 Out Utf<8>::FromWide(In begin, In end, Out output)
00179 {
00180     while (begin < end)
00181     {
00182         Uint32 codepoint = Utf<32>::DecodeWide(*begin++);
00183         output = Encode(codepoint, output);
00184     }
00185 
00186     return output;
00187 }
00188 
00189 
00191 template <typename In, typename Out>
00192 Out Utf<8>::FromLatin1(In begin, In end, Out output)
00193 {
00194     // Latin-1 is directly compatible with Unicode encodings,
00195     // and can thus be treated as (a sub-range of) UTF-32
00196     while (begin < end)
00197         output = Encode(*begin++, output);
00198 
00199     return output;
00200 }
00201 
00202 
00204 template <typename In, typename Out>
00205 Out Utf<8>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
00206 {
00207     while (begin < end)
00208     {
00209         Uint32 codepoint;
00210         begin = Decode(begin, end, codepoint);
00211         output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale);
00212     }
00213 
00214     return output;
00215 }
00216 
00217 
00219 template <typename In, typename Out>
00220 Out Utf<8>::ToWide(In begin, In end, Out output, wchar_t replacement)
00221 {
00222     while (begin < end)
00223     {
00224         Uint32 codepoint;
00225         begin = Decode(begin, end, codepoint);
00226         output = Utf<32>::EncodeWide(codepoint, output, replacement);
00227     }
00228 
00229     return output;
00230 }
00231 
00232 
00234 template <typename In, typename Out>
00235 Out Utf<8>::ToLatin1(In begin, In end, Out output, char replacement)
00236 {
00237     // Latin-1 is directly compatible with Unicode encodings,
00238     // and can thus be treated as (a sub-range of) UTF-32
00239     while (begin < end)
00240     {
00241         Uint32 codepoint;
00242         begin = Decode(begin, end, codepoint);
00243         *output++ = codepoint < 256 ? static_cast<char>(codepoint) : replacement;
00244     }
00245 
00246     return output;
00247 }
00248 
00249 
00251 template <typename In, typename Out>
00252 Out Utf<8>::ToUtf8(In begin, In end, Out output)
00253 {
00254     while (begin < end)
00255         *output++ = *begin++;
00256 
00257     return output;
00258 }
00259 
00260 
00262 template <typename In, typename Out>
00263 Out Utf<8>::ToUtf16(In begin, In end, Out output)
00264 {
00265     while (begin < end)
00266     {
00267         Uint32 codepoint;
00268         begin = Decode(begin, end, codepoint);
00269         output = Utf<16>::Encode(codepoint, output);
00270     }
00271 
00272     return output;
00273 }
00274 
00275 
00277 template <typename In, typename Out>
00278 Out Utf<8>::ToUtf32(In begin, In end, Out output)
00279 {
00280     while (begin < end)
00281     {
00282         Uint32 codepoint;
00283         begin = Decode(begin, end, codepoint);
00284         *output++ = codepoint;
00285     }
00286 
00287     return output;
00288 }
00289 
00290 
00292 template <typename In>
00293 In Utf<16>::Decode(In begin, In end, Uint32& output, Uint32 replacement)
00294 {
00295     Uint16 first = *begin++;
00296 
00297     // If it's a surrogate pair, first convert to a single UTF-32 character
00298     if ((first >= 0xD800) && (first <= 0xDBFF))
00299     {
00300         if (begin < end)
00301         {
00302             Uint32 second = *begin++;
00303             if ((second >= 0xDC00) && (second <= 0xDFFF))
00304             {
00305                 // The second element is valid: convert the two elements to a UTF-32 character
00306                 output = static_cast<Uint32>(((first - 0xD800) << 10) + (second - 0xDC00) + 0x0010000);
00307             }
00308             else
00309             {
00310                 // Invalid character
00311                 output = replacement;
00312             }
00313         }
00314         else
00315         {
00316             // Invalid character
00317             begin = end;
00318             output = replacement;
00319         }
00320     }
00321     else
00322     {
00323         // We can make a direct copy
00324         output = first;
00325     }
00326 
00327     return begin;
00328 }
00329 
00330 
00332 template <typename Out>
00333 Out Utf<16>::Encode(Uint32 input, Out output, Uint16 replacement)
00334 {
00335     if (input < 0xFFFF)
00336     {
00337         // The character can be copied directly, we just need to check if it's in the valid range
00338         if ((input >= 0xD800) && (input <= 0xDFFF))
00339         {
00340             // Invalid character (this range is reserved)
00341             if (replacement)
00342                 *output++ = replacement;
00343         }
00344         else
00345         {
00346             // Valid character directly convertible to a single UTF-16 character
00347             *output++ = static_cast<Uint16>(input);
00348         }
00349     }
00350     else if (input > 0x0010FFFF)
00351     {
00352         // Invalid character (greater than the maximum unicode value)
00353         if (replacement)
00354             *output++ = replacement;
00355     }
00356     else
00357     {
00358         // The input character will be converted to two UTF-16 elements
00359         input -= 0x0010000;
00360         *output++ = static_cast<Uint16>((input >> 10)     + 0xD800);
00361         *output++ = static_cast<Uint16>((input & 0x3FFUL) + 0xDC00);
00362     }
00363 
00364     return output;
00365 }
00366 
00367 
00369 template <typename In>
00370 In Utf<16>::Next(In begin, In end)
00371 {
00372     Uint32 codepoint;
00373     return Decode(begin, end, codepoint);
00374 }
00375 
00376 
00378 template <typename In>
00379 std::size_t Utf<16>::Count(In begin, In end)
00380 {
00381     std::size_t length = 0;
00382     while (begin < end)
00383     {
00384         begin = Next(begin, end);
00385         ++length;
00386     }
00387 
00388     return length;
00389 }
00390 
00391 
00393 template <typename In, typename Out>
00394 Out Utf<16>::FromAnsi(In begin, In end, Out output, const std::locale& locale)
00395 {
00396     while (begin < end)
00397     {
00398         Uint32 codepoint = Utf<32>::DecodeAnsi(*begin++, locale);
00399         output = Encode(codepoint, output);
00400     }
00401 
00402     return output;
00403 }
00404 
00405 
00407 template <typename In, typename Out>
00408 Out Utf<16>::FromWide(In begin, In end, Out output)
00409 {
00410     while (begin < end)
00411     {
00412         Uint32 codepoint = Utf<32>::DecodeWide(*begin++);
00413         output = Encode(codepoint, output);
00414     }
00415 
00416     return output;
00417 }
00418 
00419 
00421 template <typename In, typename Out>
00422 Out Utf<16>::FromLatin1(In begin, In end, Out output)
00423 {
00424     // Latin-1 is directly compatible with Unicode encodings,
00425     // and can thus be treated as (a sub-range of) UTF-32
00426     while (begin < end)
00427         *output++ = *begin++;
00428 
00429     return output;
00430 }
00431 
00432 
00434 template <typename In, typename Out>
00435 Out Utf<16>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
00436 {
00437     while (begin < end)
00438     {
00439         Uint32 codepoint;
00440         begin = Decode(begin, end, codepoint);
00441         output = Utf<32>::EncodeAnsi(codepoint, output, replacement, locale);
00442     }
00443 
00444     return output;
00445 }
00446 
00447 
00449 template <typename In, typename Out>
00450 Out Utf<16>::ToWide(In begin, In end, Out output, wchar_t replacement)
00451 {
00452     while (begin < end)
00453     {
00454         Uint32 codepoint;
00455         begin = Decode(begin, end, codepoint);
00456         output = Utf<32>::EncodeWide(codepoint, output, replacement);
00457     }
00458 
00459     return output;
00460 }
00461 
00462 
00464 template <typename In, typename Out>
00465 Out Utf<16>::ToLatin1(In begin, In end, Out output, char replacement)
00466 {
00467     // Latin-1 is directly compatible with Unicode encodings,
00468     // and can thus be treated as (a sub-range of) UTF-32
00469     while (begin < end)
00470     {
00471         *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
00472         begin++;
00473     }
00474 
00475     return output;
00476 }
00477 
00478 
00480 template <typename In, typename Out>
00481 Out Utf<16>::ToUtf8(In begin, In end, Out output)
00482 {
00483     while (begin < end)
00484     {
00485         Uint32 codepoint;
00486         begin = Decode(begin, end, codepoint);
00487         output = Utf<8>::Encode(codepoint, output);
00488     }
00489 
00490     return output;
00491 }
00492 
00493 
00495 template <typename In, typename Out>
00496 Out Utf<16>::ToUtf16(In begin, In end, Out output)
00497 {
00498     while (begin < end)
00499         *output++ = *begin++;
00500 
00501     return output;
00502 }
00503 
00504 
00506 template <typename In, typename Out>
00507 Out Utf<16>::ToUtf32(In begin, In end, Out output)
00508 {
00509     while (begin < end)
00510     {
00511         Uint32 codepoint;
00512         begin = Decode(begin, end, codepoint);
00513         *output++ = codepoint;
00514     }
00515 
00516     return output;
00517 }
00518 
00519 
00521 template <typename In>
00522 In Utf<32>::Decode(In begin, In end, Uint32& output, Uint32)
00523 {
00524     output = *begin++;
00525     return begin;
00526 }
00527 
00528 
00530 template <typename Out>
00531 Out Utf<32>::Encode(Uint32 input, Out output, Uint32 replacement)
00532 {
00533     *output++ = input;
00534     return output;
00535 }
00536 
00537 
00539 template <typename In>
00540 In Utf<32>::Next(In begin, In end)
00541 {
00542     return ++begin;
00543 }
00544 
00545 
00547 template <typename In>
00548 std::size_t Utf<32>::Count(In begin, In end)
00549 {
00550     return begin - end;
00551 }
00552 
00553 
00555 template <typename In, typename Out>
00556 Out Utf<32>::FromAnsi(In begin, In end, Out output, const std::locale& locale)
00557 {
00558     while (begin < end)
00559         *output++ = DecodeAnsi(*begin++, locale);
00560 
00561     return output;
00562 }
00563 
00564 
00566 template <typename In, typename Out>
00567 Out Utf<32>::FromWide(In begin, In end, Out output)
00568 {
00569     while (begin < end)
00570         *output++ = DecodeWide(*begin++);
00571 
00572     return output;
00573 }
00574 
00575 
00577 template <typename In, typename Out>
00578 Out Utf<32>::FromLatin1(In begin, In end, Out output)
00579 {
00580     // Latin-1 is directly compatible with Unicode encodings,
00581     // and can thus be treated as (a sub-range of) UTF-32
00582     while (begin < end)
00583         *output++ = *begin++;
00584 
00585     return output;
00586 }
00587 
00588 
00590 template <typename In, typename Out>
00591 Out Utf<32>::ToAnsi(In begin, In end, Out output, char replacement, const std::locale& locale)
00592 {
00593     while (begin < end)
00594         output = EncodeAnsi(*begin++, output, replacement, locale);
00595 
00596     return output;
00597 }
00598 
00599 
00601 template <typename In, typename Out>
00602 Out Utf<32>::ToWide(In begin, In end, Out output, wchar_t replacement)
00603 {
00604     while (begin < end)
00605         output = EncodeWide(*begin++, output, replacement);
00606 
00607     return output;
00608 }
00609 
00610 
00612 template <typename In, typename Out>
00613 Out Utf<32>::ToLatin1(In begin, In end, Out output, char replacement)
00614 {
00615     // Latin-1 is directly compatible with Unicode encodings,
00616     // and can thus be treated as (a sub-range of) UTF-32
00617     while (begin < end)
00618     {
00619         *output++ = *begin < 256 ? static_cast<char>(*begin) : replacement;
00620         begin++;
00621     }
00622 
00623     return output;
00624 }
00625 
00626 
00628 template <typename In, typename Out>
00629 Out Utf<32>::ToUtf8(In begin, In end, Out output)
00630 {
00631     while (begin < end)
00632         output = Utf<8>::Encode(*begin++, output);
00633 
00634     return output;
00635 }
00636 
00638 template <typename In, typename Out>
00639 Out Utf<32>::ToUtf16(In begin, In end, Out output)
00640 {
00641     while (begin < end)
00642         output = Utf<16>::Encode(*begin++, output);
00643 
00644     return output;
00645 }
00646 
00647 
00649 template <typename In, typename Out>
00650 Out Utf<32>::ToUtf32(In begin, In end, Out output)
00651 {
00652     while (begin < end)
00653         *output++ = *begin++;
00654 
00655     return output;
00656 }
00657 
00658 
00660 template <typename In>
00661 Uint32 Utf<32>::DecodeAnsi(In input, const std::locale& locale)
00662 {
00663     // On Windows, gcc's standard library (glibc++) has almost
00664     // no support for Unicode stuff. As a consequence, in this
00665     // context we can only use the default locale and ignore
00666     // the one passed as parameter.
00667 
00668     #if defined(SFML_SYSTEM_WINDOWS) &&                       /* if Windows ... */                          \
00669        (defined(__GLIBCPP__) || defined (__GLIBCXX__)) &&     /* ... and standard library is glibc++ ... */ \
00670       !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */
00671 
00672         wchar_t character = 0;
00673         mbtowc(&character, &input, 1);
00674         return static_cast<Uint32>(character);
00675 
00676     #else
00677 
00678         // Get the facet of the locale which deals with character conversion
00679         const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
00680 
00681         // Use the facet to convert each character of the input string
00682         return static_cast<Uint32>(facet.widen(input));
00683 
00684     #endif
00685 }
00686 
00687 
00689 template <typename In>
00690 Uint32 Utf<32>::DecodeWide(In input)
00691 {
00692     // The encoding of wide characters is not well defined and is left to the system;
00693     // however we can safely assume that it is UCS-2 on Windows and
00694     // UCS-4 on Unix systems.
00695     // In both cases, a simple copy is enough (UCS-2 is a subset of UCS-4,
00696     // and UCS-4 *is* UTF-32).
00697 
00698     return input;
00699 }
00700 
00701 
00703 template <typename Out>
00704 Out Utf<32>::EncodeAnsi(Uint32 codepoint, Out output, char replacement, const std::locale& locale)
00705 {
00706     // On Windows, gcc's standard library (glibc++) has almost
00707     // no support for Unicode stuff. As a consequence, in this
00708     // context we can only use the default locale and ignore
00709     // the one passed as parameter.
00710 
00711     #if defined(SFML_SYSTEM_WINDOWS) &&                       /* if Windows ... */                          \
00712        (defined(__GLIBCPP__) || defined (__GLIBCXX__)) &&     /* ... and standard library is glibc++ ... */ \
00713       !(defined(__SGI_STL_PORT) || defined(_STLPORT_VERSION)) /* ... and STLPort is not used on top of it */
00714 
00715         char character = 0;
00716         if (wctomb(&character, static_cast<wchar_t>(codepoint)) >= 0)
00717             *output++ = character;
00718         else if (replacement)
00719             *output++ = replacement;
00720 
00721         return output;
00722 
00723     #else
00724 
00725         // Get the facet of the locale which deals with character conversion
00726         const std::ctype<wchar_t>& facet = std::use_facet< std::ctype<wchar_t> >(locale);
00727 
00728         // Use the facet to convert each character of the input string
00729         *output++ = facet.narrow(static_cast<wchar_t>(codepoint), replacement);
00730 
00731         return output;
00732 
00733     #endif
00734 }
00735 
00736 
00738 template <typename Out>
00739 Out Utf<32>::EncodeWide(Uint32 codepoint, Out output, wchar_t replacement)
00740 {
00741     // The encoding of wide characters is not well defined and is left to the system;
00742     // however we can safely assume that it is UCS-2 on Windows and
00743     // UCS-4 on Unix systems.
00744     // For UCS-2 we need to check if the source characters fits in (UCS-2 is a subset of UCS-4).
00745     // For UCS-4 we can do a direct copy (UCS-4 *is* UTF-32).
00746 
00747     switch (sizeof(wchar_t))
00748     {
00749         case 4:
00750         {
00751             *output++ = static_cast<wchar_t>(codepoint);
00752             break;
00753         }
00754 
00755         default:
00756         {
00757             if ((codepoint <= 0xFFFF) && ((codepoint < 0xD800) || (codepoint > 0xDFFF)))
00758             {
00759                 *output++ = static_cast<wchar_t>(codepoint);
00760             }
00761             else if (replacement)
00762             {
00763                 *output++ = replacement;
00764             }
00765             break;
00766         }
00767     }
00768 
00769     return output;
00770 }

 ::  Copyright © 2007-2008 Laurent Gomila, all rights reserved  ::  Documentation generated by doxygen 1.5.2  ::