// NOTE(review): this first line fuses the include guard, the gsl.hpp include and
// the start of a declaration. What is visible: to8 takes a wide-character buffer
// plus an element count and returns std::string by value; the definition is not
// in view. Presumably the result is UTF-8 encoded -- confirm against the
// implementation.
20 #ifndef PSTORE_SUPPORT_UTF_HPP 21 #define PSTORE_SUPPORT_UTF_HPP 27 #include "pstore/support/gsl.hpp" 43 std::string to8 (
wchar_t const *
const wstr, std::size_t length);
/// Declaration only: overload of to8() that takes just a wide-character pointer
/// and returns std::string by value.
/// NOTE(review): no length is passed, so \p wstr presumably has to be
/// NUL-terminated -- confirm against the definition.
std::string to8 (wchar_t const * const wstr);
49 inline std::string to8 (std::wstring
const & wstr) {
50 return to8 (wstr.data (), wstr.length ());
/// Declaration only: to16 takes a narrow-character buffer plus a length and
/// returns std::wstring by value.
/// NOTE(review): the input is presumably UTF-8 and \p length a byte count --
/// confirm against the definition.
std::wstring to16 (char const * str, std::size_t length);
/// Declaration only: overload of to16() that takes just a narrow-character
/// pointer and returns std::wstring by value.
/// NOTE(review): no length is passed, so \p str presumably has to be
/// NUL-terminated -- confirm against the definition.
std::wstring to16 (char const * str);
63 inline std::wstring to16 (std::string
const & str) {
64 return to16 (str.data (), str.length ());
/// Declaration only: to_mbcs takes a narrow-character buffer plus a length and
/// returns std::string by value.
/// NOTE(review): presumably converts UTF-8 to the system multi-byte code page --
/// confirm against the definition.
std::string to_mbcs (char const * str, std::size_t length);
72 inline std::string to_mbcs (
char const * str) {
73 PSTORE_ASSERT (str !=
nullptr);
74 return to_mbcs (str, std::strlen (str));
76 inline std::string to_mbcs (std::string
const & str) {
77 return to_mbcs (str.data (), str.length ());
/// Declaration only: wide-character overload of to_mbcs() taking a wchar_t
/// buffer plus an element count and returning std::string by value. The
/// definition is not visible in this header.
std::string to_mbcs (wchar_t const * wstr, std::size_t length);
80 inline std::string to_mbcs (std::wstring
const & str) {
81 return to_mbcs (str.data (), str.length ());
/// Declaration only: mbcs_to8 takes a narrow-character buffer plus a length and
/// returns std::string by value.
/// NOTE(review): presumably converts from the system multi-byte code page to
/// UTF-8 -- confirm against the definition.
std::string mbcs_to8 (char const * mbcs, std::size_t length);
98 inline std::string mbcs_to8 (
char const * str) {
99 PSTORE_ASSERT (str !=
nullptr);
100 return mbcs_to8 (str, std::strlen (str));
104 inline std::string mbcs_to8 (std::string
const & str) {
105 return mbcs_to8 (str.data (), str.length ());
// String aliases: utf8_string stores std::uint8_t elements, utf16_string stores
// char16_t elements.
// NOTE(review): as laid out here, the utf8_string alias follows the `//` that
// starts the #endif comment on the same line, so it reads as comment text --
// confirm the alias sits on its own line in the real header.
// NOTE(review): std::basic_string<std::uint8_t> depends on a
// std::char_traits<std::uint8_t> specialization that the C++ standard does not
// provide; recent libc++ releases no longer supply one -- verify on the
// toolchains this header must support.
118 #endif // !defined(_WIN32) 124 using utf8_string = std::basic_string<std::uint8_t>;
125 using utf16_string = std::basic_string<char16_t>;
127 auto operator<< (std::ostream & os, utf8_string
const & s) -> std::ostream &;
132 auto is_well_formed ()
const noexcept ->
bool {
return well_formed_; }
// Two state values; `accept` is the initial value of state_ below.
135 enum state { accept, reject };
// Table of bytes declared here and defined elsewhere.
// NOTE(review): presumably the UTF-8 decoder's transition table -- confirm
// against the definition.
141 static std::uint8_t
const utf8d_[];
// Code point value held by the object; starts at zero.
142 char32_t codepoint_ = 0;
// Current state, stored as a byte; starts in `accept`.
143 std::uint8_t state_ = accept;
// Flag returned by is_well_formed(); starts out true.
144 bool well_formed_ =
true;
/// Code point U+FFFD. The conversion routines in this header emit it in place
/// of input they cannot represent.
constexpr char32_t replacement_char_code_point = 0xFFFD;
/// Forward declaration of the code-point encoder so that replacement_char() can
/// call it before its definition. This is where the default for \p CharType
/// (char) is supplied.
///
/// \tparam CharType  Type each emitted code unit is cast to.
/// \tparam OutputIt  Output iterator type.
/// \param c    The code point to encode.
/// \param out  Iterator through which code units are written.
/// \returns The output iterator type; see the definition for details.
template <typename CharType = char, typename OutputIt>
auto code_point_to_utf8 (char32_t c, OutputIt out) -> OutputIt;
154 template <
typename CharType =
char,
typename OutputIt>
155 auto replacement_char (OutputIt out) -> OutputIt {
156 return code_point_to_utf8<CharType> (replacement_char_code_point, out);
159 template <
typename CharType,
typename OutputIt>
160 auto code_point_to_utf8 (char32_t
const c, OutputIt out) -> OutputIt {
162 *(out++) = static_cast<CharType> (c);
163 }
else if (c < 0x800) {
164 *(out++) = static_cast<CharType> (c / 64U + 0xC0U);
165 *(out++) = static_cast<CharType> (c % 64U + 0x80U);
166 }
else if ((c >= 0xD800 && c < 0xE000) || c >= 0x110000) {
167 out = replacement_char<CharType> (out);
168 }
else if (c < 0x10000) {
169 *(out++) = static_cast<CharType> ((c / 0x1000U) | 0xE0U);
170 *(out++) = static_cast<CharType> ((c / 64U % 64U) | 0x80U);
171 *(out++) = static_cast<CharType> ((c % 64U) | 0x80U);
173 PSTORE_ASSERT (c < 0x110000);
174 *(out++) = static_cast<CharType> ((c / 0x40000U) | 0xF0U);
175 *(out++) = static_cast<CharType> ((c / 0x1000U % 64U) | 0x80U);
176 *(out++) = static_cast<CharType> ((c / 64U % 64U) | 0x80U);
177 *(out++) = static_cast<CharType> ((c % 64U) | 0x80U);
/// Convenience overload: encodes \p c into a freshly built container.
///
/// \tparam ResultType  Container type; it must provide value_type and work with
///                     std::back_inserter.
/// \returns The container holding the code units produced for \p c.
///
/// NOTE(review): the declaration of `result` and the return statement were not
/// present in the listing; they are reconstructed from the visible call.
template <typename ResultType>
auto code_point_to_utf8 (char32_t c) -> ResultType {
    ResultType result;
    code_point_to_utf8<typename ResultType::value_type> (c, std::back_inserter (result));
    return result;
}
/// Identity "swapper": returns \p v unchanged. It has the same signature as
/// byte_swapper(), so either can be passed where a swapper is expected.
constexpr auto nop_swapper (std::uint16_t const v) noexcept -> std::uint16_t {
    return v;
}
/// Returns \p v with its two bytes exchanged (0xAABB becomes 0xBBAA).
constexpr auto byte_swapper (std::uint16_t const v) noexcept -> std::uint16_t {
    return static_cast<std::uint16_t> (((v & 0x00FFU) << 8U) | ((v & 0xFF00U) >> 8U));
}
/// Returns true if \p code_unit lies in the inclusive range [0xD800, 0xDBFF].
constexpr auto is_utf16_high_surrogate (std::uint16_t const code_unit) noexcept -> bool {
    return code_unit >= 0xD800 && code_unit <= 0xDBFF;
}
/// Returns true if \p code_unit lies in the inclusive range [0xDC00, 0xDFFF].
constexpr auto is_utf16_low_surrogate (std::uint16_t const code_unit) noexcept -> bool {
    return code_unit >= 0xDC00 && code_unit <= 0xDFFF;
}
206 template <
typename InputIterator,
typename SwapperFunction>
207 auto utf16_to_code_point (InputIterator first, InputIterator last, SwapperFunction swapper)
208 -> std::pair<InputIterator, char32_t> {
210 using value_type =
typename std::remove_cv<
211 typename std::iterator_traits<InputIterator>::value_type>::type;
212 static_assert (std::is_same<value_type, char16_t>::value,
213 "iterator must produce char16_t");
215 PSTORE_ASSERT (first != last);
216 char32_t code_point = 0;
217 char16_t
const code_unit = swapper (*(first++));
218 if (!is_utf16_high_surrogate (code_unit)) {
219 code_point = code_unit;
222 code_point = replacement_char_code_point;
224 auto const high = code_unit;
225 auto const low = swapper (*(first++));
227 if (low < 0xDC00 || low > 0xDFFF) {
228 code_point = replacement_char_code_point;
230 code_point = 0x10000;
231 code_point += (high & 0x03FFU) << 10U;
232 code_point += (low & 0x03FFU);
236 return {first, code_point};
/// Decodes every code point in the UTF-16 range [first, last) with
/// utf16_to_code_point() and writes each one through \p out.
///
/// \returns The output iterator after the last code point written.
///
/// NOTE(review): the return type line, the declaration of `code_point` and the
/// final return were not present in the listing; they are reconstructed from
/// the visible loop.
template <typename InputIt, typename OutputIt, typename Swapper>
auto utf16_to_code_points (InputIt first, InputIt last, OutputIt out, Swapper swapper)
    -> OutputIt {
    while (first != last) {
        char32_t code_point = 0;
        std::tie (first, code_point) = utf16_to_code_point (first, last, swapper);
        *(out++) = code_point;
    }
    return out;
}
/// Decodes the UTF-16 range [first, last) into a freshly built container by
/// appending through std::back_inserter.
///
/// \tparam ResultType  Container type that receives the code points.
///
/// NOTE(review): the declaration of `result` and the return statement were not
/// present in the listing; they are reconstructed from the visible call.
template <typename ResultType, typename InputIt, typename Swapper>
auto utf16_to_code_points (InputIt first, InputIt last, Swapper swapper) -> ResultType {
    ResultType result;
    utf16_to_code_points (first, last, std::back_inserter (result), swapper);
    return result;
}
/// Container overload: decodes all of \p src by forwarding std::begin(src) and
/// std::end(src) to the iterator-range overload.
template <typename ResultType, typename InputType, typename Swapper>
auto utf16_to_code_points (InputType const & src, Swapper swapper) -> ResultType {
    return utf16_to_code_points<ResultType> (std::begin (src), std::end (src), swapper);
}
/// Container overload: decodes \p src as exactly one code point.
///
/// Calls the iterator-range overload on the whole container and then asserts
/// (PSTORE_ASSERT) that decoding consumed every code unit in \p src.
///
/// \returns The decoded code point.
///
/// NOTE(review): the declaration of `cp` and the return statement were not
/// present in the listing; they are reconstructed from the visible std::tie.
template <typename InputType, typename Swapper>
auto utf16_to_code_point (InputType const & src, Swapper swapper) -> char32_t {
    auto end = std::end (src);
    char32_t cp = 0;
    // `end` is overwritten with the position at which decoding stopped.
    std::tie (end, cp) = utf16_to_code_point (std::begin (src), end, swapper);
    PSTORE_ASSERT (end == std::end (src));
    return cp;
}
/// Converts the UTF-16 range [first, last) to UTF-8, writing the result through
/// \p out.
///
/// Each code point is decoded with utf16_to_code_point() and re-encoded with
/// code_point_to_utf8() using that function's default CharType.
///
/// \returns The output iterator after the last unit written.
///
/// NOTE(review): the listing called `code_unit_to_utf8`, a name declared nowhere
/// in the visible header; because this is a template the error would only show
/// on instantiation. The call now targets code_point_to_utf8 -- confirm no
/// function of the old name exists elsewhere.
/// NOTE(review): the return type line, the declaration of `code_point` and the
/// final return were not present in the listing; they are reconstructed.
template <typename InputIt, typename OutputIt, typename Swapper>
auto utf16_to_utf8 (InputIt first, InputIt last, OutputIt out, Swapper swapper)
    -> OutputIt {
    while (first != last) {
        char32_t code_point = 0;
        std::tie (first, code_point) = utf16_to_code_point (first, last, swapper);
        out = code_point_to_utf8 (code_point, out);
    }
    return out;
}
/// Converts the UTF-16 range [first, last) to UTF-8 in a freshly built
/// container by appending through std::back_inserter.
///
/// \tparam ResultType  Container type that receives the UTF-8 code units.
///
/// NOTE(review): the declaration of `result` and the return statement were not
/// present in the listing; they are reconstructed from the visible call.
template <typename ResultType, typename InputIt, typename Swapper>
auto utf16_to_utf8 (InputIt first, InputIt last, Swapper swapper) -> ResultType {
    ResultType result;
    utf16_to_utf8 (first, last, std::back_inserter (result), swapper);
    return result;
}
/// Container overload: converts all of \p src by forwarding std::begin(src) and
/// std::end(src) to the iterator-range overload.
template <typename ResultType, typename InputType, typename Swapper>
auto utf16_to_utf8 (InputType const & src, Swapper swapper) -> ResultType {
    return utf16_to_utf8<ResultType> (std::begin (src), std::end (src), swapper);
}
/// Returns true unless the top two bits of \p c are 0b10, i.e. unless \p c is a
/// UTF-8 continuation byte.
///
/// \p c is converted to the unsigned counterpart of \p CharType before masking,
/// so a negative `char` value is handled the same way as the matching unsigned
/// byte.
template <typename CharType>
constexpr auto is_utf_char_start (CharType c) noexcept -> bool {
    using uchar_type = typename std::make_unsigned<CharType>::type;
    return (static_cast<uchar_type> (c) & 0xC0U) != 0x80U;
}
320 template <
typename Iterator>
321 auto length (Iterator first, Iterator last) -> std::size_t {
323 std::count_if (first, last, [] (
char const c) {
return is_utf_char_start (c); });
324 PSTORE_ASSERT (result >= 0);
325 using utype =
typename std::make_unsigned<decltype (result)>::type;
326 static_assert (std::numeric_limits<utype>::max () <=
327 std::numeric_limits<std::size_t>::max (),
328 "std::size_t cannot hold result of count_if");
329 return static_cast<std::size_t
> (result);
/// Span overload of length(): forwards span.begin() and span.end() to the
/// iterator-range overload and returns its result.
template <typename SpanType>
auto length (SpanType span) -> std::size_t {
    return length (span.begin (), span.end ());
}
/// Declaration only: length() for a character buffer of \p nbytes bytes,
/// returning std::size_t. The definition is not visible in this header.
auto length (char const * str, std::size_t nbytes) -> std::size_t;
346 auto length (gsl::czstring str) -> std::size_t;
/// Declaration only: length() for a std::string, returning std::size_t. The
/// definition is not visible in this header.
auto length (std::string const & str) -> std::size_t;
/// Overload selected when length() is called with a literal nullptr: always
/// returns zero.
inline auto length (std::nullptr_t) noexcept -> std::size_t {
    return 0;
}
358 auto index (gsl::czstring str, std::size_t pos) -> gsl::czstring;
// Iterator-range overload of index(). Visible here: a counter `start_count`
// starts at zero, and std::find_if scans [first, last) with a predicate that
// captures the counter by reference and `pos` by value and receives each element
// as a `char`. The predicate body and the return type are not in view.
// NOTE(review): presumably returns the iterator at the pos'th character start,
// or `last` when there is none -- confirm against the full definition.
368 template <
typename InputIterator>
369 auto index (InputIterator
const first, InputIterator
const last, std::size_t
const pos)
371 auto start_count = std::size_t{0};
372 return std::find_if (first, last, [&start_count, pos] (
char const c) {
/// Span overload of index(): runs the iterator-range overload over
/// [span.begin(), span.end()) and converts the result to a pointer.
///
/// \returns nullptr if the iterator overload returned the end iterator,
///          otherwise the address of the element it points at.
template <typename SpanType>
auto index (SpanType const span, std::size_t const pos) ->
    typename SpanType::element_type * {
    auto const end = span.end ();
    auto const it = index (span.begin (), end, pos);
    return it == end ? nullptr : &*it;
}
404 auto slice (gsl::czstring str, std::ptrdiff_t start, std::ptrdiff_t end)
405 -> std::pair<std::ptrdiff_t, std::ptrdiff_t>;
408 using native_string = std::basic_string<TCHAR>;
409 using native_ostringstream = std::basic_ostringstream<TCHAR>;
// First function after the `defined(_UNICODE)` test: converts a std::string to
// std::wstring by forwarding to utf::win32::to16.
412 # if defined(_UNICODE) 413 inline auto to_native_string (std::string
const & str) -> std::wstring {
414 return utf::win32::to16 (str);
416 inline auto to_native_string (gsl::czstring
const str) -> std::wstring {
417 return utf::win32::to16 (str);
419 inline auto from_native_string (std::wstring
const & str) -> std::string {
420 return utf::win32::to8 (str);
422 inline auto from_native_string (gsl::cwzstring
const str) -> std::string {
423 return utf::win32::to8 (str);
427 inline auto to_native_string (std::string
const & str) -> std::string {
428 return win32::to_mbcs (str);
430 inline auto to_native_string (gsl::czstring
const str) -> std::string {
431 return win32::to_mbcs (str);
434 inline auto from_native_string (std::string
const & str) -> std::string {
435 return win32::mbcs_to8 (str);
437 inline auto from_native_string (gsl::czstring
const str) -> std::string {
438 return win32::mbcs_to8 (str);
/// Identity overload: takes and returns a std::string by const reference, with
/// no conversion.
/// NOTE(review): the function body was not present in the listing; `return str;`
/// is reconstructed as the only body consistent with the noexcept,
/// reference-returning signature. Presumably this is the non-Windows branch --
/// confirm.
constexpr auto to_native_string (std::string const & str) noexcept -> std::string const & {
    return str;
}
/// Identity overload: takes and returns a std::string by const reference, with
/// no conversion.
/// NOTE(review): the function body was not present in the listing; `return str;`
/// is reconstructed as the only body consistent with the noexcept,
/// reference-returning signature. Presumably this is the non-Windows branch --
/// confirm.
constexpr auto from_native_string (std::string const & str) noexcept
    -> std::string const & {
    return str;
}
452 #endif // PSTORE_SUPPORT_UTF_HPP auto slice(gsl::czstring str, std::ptrdiff_t start, std::ptrdiff_t end) -> std::pair< std::ptrdiff_t, std::ptrdiff_t >
Converts codepoint indices start and end to byte offsets in the buffer at str.
Definition: utf.cpp:61
auto index(gsl::czstring str, std::size_t pos) -> gsl::czstring
Returns a pointer to the beginning of the pos'th UTF-8 codepoint in the buffer at str or nullptr if e...
Definition: utf.cpp:49
An implementation of the Haskell Maybe type.
constexpr auto is_utf_char_start(CharType c) noexcept -> bool
If the top two bits are 0b10, then this is a UTF-8 continuation byte and is skipped; other patterns i...
Definition: utf.hpp:312
auto length(Iterator first, Iterator last) -> std::size_t
Definition: utf.hpp:321
Definition: nonpod2.cpp:40