pstore2
utf.hpp
Go to the documentation of this file.
1 //===- include/pstore/support/utf.hpp ---------------------*- mode: C++ -*-===//
2 //* _ __ *
3 //* _ _| |_ / _| *
4 //* | | | | __| |_ *
5 //* | |_| | |_| _| *
6 //* \__,_|\__|_| *
7 //* *
8 //===----------------------------------------------------------------------===//
9 //
10 // Part of the pstore project, under the Apache License v2.0 with LLVM Exceptions.
11 // See https://github.com/SNSystems/pstore/blob/master/LICENSE.txt for license
12 // information.
13 // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
14 //
15 //===----------------------------------------------------------------------===//
19 
20 #ifndef PSTORE_SUPPORT_UTF_HPP
21 #define PSTORE_SUPPORT_UTF_HPP
22 
#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iosfwd>
#include <iterator>
#include <limits>
#include <string>
#include <tuple>
#include <type_traits>
#include <utility>
26 
27 #include "pstore/support/gsl.hpp"
28 #include "pstore/support/maybe.hpp"
29 
30 #if defined(_WIN32)
31 
32 # include <tchar.h>
33 
34 namespace pstore {
35  namespace utf {
36  namespace win32 {
37 
43  std::string to8 (wchar_t const * const wstr, std::size_t length);
44 
46  std::string to8 (wchar_t const * const wstr);
47 
49  inline std::string to8 (std::wstring const & wstr) {
50  return to8 (wstr.data (), wstr.length ());
51  }
52 
58  std::wstring to16 (char const * str, std::size_t length);
59 
61  std::wstring to16 (char const * str);
62 
63  inline std::wstring to16 (std::string const & str) {
64  return to16 (str.data (), str.length ());
65  }
66 
71  std::string to_mbcs (char const * str, std::size_t length);
72  inline std::string to_mbcs (char const * str) {
73  PSTORE_ASSERT (str != nullptr);
74  return to_mbcs (str, std::strlen (str));
75  }
76  inline std::string to_mbcs (std::string const & str) {
77  return to_mbcs (str.data (), str.length ());
78  }
79  std::string to_mbcs (wchar_t const * wstr, std::size_t length);
80  inline std::string to_mbcs (std::wstring const & str) {
81  return to_mbcs (str.data (), str.length ());
82  }
84 
89 
94  std::string mbcs_to8 (char const * mbcs, std::size_t length);
95 
98  inline std::string mbcs_to8 (char const * str) {
99  PSTORE_ASSERT (str != nullptr);
100  return mbcs_to8 (str, std::strlen (str));
101  }
102 
104  inline std::string mbcs_to8 (std::string const & str) {
105  return mbcs_to8 (str.data (), str.length ());
106  }
108  } // end namespace win32
109  } // end namespace utf
110 } // end namespace pstore
111 
112 #else //_WIN32
113 
114 // The type follows the Microsoft convention for its naming.
115 // NOLINTNEXTLINE(readability-identifier-naming)
116 using TCHAR = char;
117 
118 #endif // !defined(_WIN32)
119 
120 
121 namespace pstore {
122  namespace utf {
123 
124  using utf8_string = std::basic_string<std::uint8_t>;
125  using utf16_string = std::basic_string<char16_t>;
126 
127  auto operator<< (std::ostream & os, utf8_string const & s) -> std::ostream &;
128 
129  class utf8_decoder {
130  public:
131  auto get (std::uint8_t byte) noexcept -> maybe<char32_t>;
132  auto is_well_formed () const noexcept -> bool { return well_formed_; }
133 
134  private:
135  enum state { accept, reject };
136 
137  static auto decode (gsl::not_null<std::uint8_t *> state,
138  gsl::not_null<char32_t *> codep, std::uint32_t byte) noexcept
139  -> std::uint8_t;
140 
141  static std::uint8_t const utf8d_[];
142  char32_t codepoint_ = 0;
143  std::uint8_t state_ = accept;
144  bool well_formed_ = true;
145  };
146 
147 
148  constexpr char32_t replacement_char_code_point = 0xFFFD;
149 
150  template <typename CharType = char, typename OutputIt>
151  auto code_point_to_utf8 (char32_t c, OutputIt out) -> OutputIt;
152 
153 
154  template <typename CharType = char, typename OutputIt>
155  auto replacement_char (OutputIt out) -> OutputIt {
156  return code_point_to_utf8<CharType> (replacement_char_code_point, out);
157  }
158 
159  template <typename CharType, typename OutputIt>
160  auto code_point_to_utf8 (char32_t const c, OutputIt out) -> OutputIt {
161  if (c < 0x80) {
162  *(out++) = static_cast<CharType> (c);
163  } else if (c < 0x800) {
164  *(out++) = static_cast<CharType> (c / 64U + 0xC0U);
165  *(out++) = static_cast<CharType> (c % 64U + 0x80U);
166  } else if ((c >= 0xD800 && c < 0xE000) || c >= 0x110000) {
167  out = replacement_char<CharType> (out);
168  } else if (c < 0x10000) {
169  *(out++) = static_cast<CharType> ((c / 0x1000U) | 0xE0U);
170  *(out++) = static_cast<CharType> ((c / 64U % 64U) | 0x80U);
171  *(out++) = static_cast<CharType> ((c % 64U) | 0x80U);
172  } else {
173  PSTORE_ASSERT (c < 0x110000);
174  *(out++) = static_cast<CharType> ((c / 0x40000U) | 0xF0U);
175  *(out++) = static_cast<CharType> ((c / 0x1000U % 64U) | 0x80U);
176  *(out++) = static_cast<CharType> ((c / 64U % 64U) | 0x80U);
177  *(out++) = static_cast<CharType> ((c % 64U) | 0x80U);
178  }
179  return out;
180  }
181 
/// Encodes the code point \p c as UTF-8 and returns the result as a container.
///
/// \tparam ResultType  A default-constructible container type which supports
///   std::back_inserter. Its value_type is the type to which each output byte is converted.
/// \param c  The code point to be encoded.
/// \returns A container holding the encoding of \p c.
template <typename ResultType>
auto code_point_to_utf8 (char32_t c) -> ResultType {
    using char_type = typename ResultType::value_type;
    ResultType encoded;
    code_point_to_utf8<char_type> (c, std::back_inserter (encoded));
    return encoded;
}
188 
/// A "swapper" function which leaves the byte order of its argument alone.
///
/// \param v  A 16-bit value.
/// \returns \p v unchanged.
constexpr std::uint16_t nop_swapper (std::uint16_t const v) noexcept {
    return v;
}
/// A "swapper" function which exchanges the two bytes of its argument.
///
/// \param v  A 16-bit value.
/// \returns \p v with its high and low bytes exchanged.
constexpr std::uint16_t byte_swapper (std::uint16_t const v) noexcept {
    // v is promoted to int before shifting: the bits which move above bit 15 are discarded
    // by the conversion back to std::uint16_t.
    return static_cast<std::uint16_t> ((v << 8U) | (v >> 8U));
}
193 
/// \param code_unit  A UTF-16 code unit.
/// \returns True if \p code_unit lies in the range [0xD800, 0xDBFF], false otherwise.
constexpr auto is_utf16_high_surrogate (std::uint16_t const code_unit) noexcept -> bool {
    // Every value in the range has the same top six bits (0b110110).
    return (code_unit & 0xFC00U) == 0xD800U;
}
197 
// is_utf16_low_surrogate
// ~~~~~~~~~~~~~~~~~~~~~~
/// \param code_unit  A UTF-16 code unit.
/// \returns True if \p code_unit lies in the range [0xDC00, 0xDFFF], false otherwise.
constexpr auto is_utf16_low_surrogate (std::uint16_t const code_unit) noexcept -> bool {
    // Every value in the range has the same top six bits (0b110111).
    return (code_unit & 0xFC00U) == 0xDC00U;
}
203 
204  // utf16_to_code_point
205  // ~~~~~~~~~~~~~~~~~~~
206  template <typename InputIterator, typename SwapperFunction>
207  auto utf16_to_code_point (InputIterator first, InputIterator last, SwapperFunction swapper)
208  -> std::pair<InputIterator, char32_t> {
209 
210  using value_type = typename std::remove_cv<
211  typename std::iterator_traits<InputIterator>::value_type>::type;
212  static_assert (std::is_same<value_type, char16_t>::value,
213  "iterator must produce char16_t");
214 
215  PSTORE_ASSERT (first != last);
216  char32_t code_point = 0;
217  char16_t const code_unit = swapper (*(first++));
218  if (!is_utf16_high_surrogate (code_unit)) {
219  code_point = code_unit;
220  } else {
221  if (first == last) {
222  code_point = replacement_char_code_point;
223  } else {
224  auto const high = code_unit;
225  auto const low = swapper (*(first++));
226 
227  if (low < 0xDC00 || low > 0xDFFF) {
228  code_point = replacement_char_code_point;
229  } else {
230  code_point = 0x10000;
231  code_point += (high & 0x03FFU) << 10U;
232  code_point += (low & 0x03FFU);
233  }
234  }
235  }
236  return {first, code_point};
237  }
238 
// utf16_to_code_points
// ~~~~~~~~~~~~~~~~~~~~
/// Decodes the UTF-16 sequence [first, last), writing each resulting code point to the
/// output iterator \p out.
///
/// \param first  The start of the UTF-16 input.
/// \param last  The end of the UTF-16 input.
/// \param out  The output iterator to which each char32_t code point is written.
/// \param swapper  A function applied to each code unit as it is read.
/// \returns The output iterator advanced past the code points that were written.
template <typename InputIt, typename OutputIt, typename Swapper>
auto utf16_to_code_points (InputIt first, InputIt last, OutputIt out, Swapper swapper)
    -> OutputIt {
    while (first != last) {
        auto const decoded = utf16_to_code_point (first, last, swapper);
        first = decoded.first; // Resume after the code unit(s) just consumed.
        *(out++) = decoded.second;
    }
    return out;
}
251 
/// Decodes the UTF-16 sequence [first, last) and returns the code points as a container.
///
/// \tparam ResultType  A default-constructible container type which supports
///   std::back_inserter.
/// \param first  The start of the UTF-16 input.
/// \param last  The end of the UTF-16 input.
/// \param swapper  A function applied to each code unit as it is read.
/// \returns A container holding the decoded code points.
template <typename ResultType, typename InputIt, typename Swapper>
auto utf16_to_code_points (InputIt first, InputIt last, Swapper swapper) -> ResultType {
    ResultType code_points;
    auto sink = std::back_inserter (code_points);
    utf16_to_code_points (first, last, sink, swapper);
    return code_points;
}
258 
/// Decodes the whole of the UTF-16 container \p src and returns the code points as a
/// container.
///
/// \tparam ResultType  The container type to be returned.
/// \param src  The UTF-16 input: any type accepted by std::begin() and std::end().
/// \param swapper  A function applied to each code unit as it is read.
/// \returns A container holding the decoded code points.
template <typename ResultType, typename InputType, typename Swapper>
auto utf16_to_code_points (InputType const & src, Swapper swapper) -> ResultType {
    auto const first = std::begin (src);
    auto const last = std::end (src);
    return utf16_to_code_points<ResultType> (first, last, swapper);
}
263 
// utf16_to_code_point
// ~~~~~~~~~~~~~~~~~~~
/// Decodes the single code point represented by the whole of the UTF-16 container \p src.
///
/// \param src  The UTF-16 input: any type accepted by std::begin() and std::end(). It must
///   be exactly consumed by decoding one code point (checked by assertion).
/// \param swapper  A function applied to each code unit as it is read.
/// \returns The decoded code point.
template <typename InputType, typename Swapper>
auto utf16_to_code_point (InputType const & src, Swapper swapper) -> char32_t {
    auto const last = std::end (src);
    auto const decoded = utf16_to_code_point (std::begin (src), last, swapper);
    // The iterator form reports where decoding stopped: that must be the end of the input.
    PSTORE_ASSERT (decoded.first == last);
    return decoded.second;
}
274 
// utf16_to_utf8
// ~~~~~~~~~~~~~
/// Converts the UTF-16 sequence [first, last) to UTF-8, writing the resulting bytes to the
/// output iterator \p out. Each code point is decoded by utf16_to_code_point() and then
/// encoded by code_point_to_utf8() using its default output character type.
///
/// \param first  The start of the UTF-16 input.
/// \param last  The end of the UTF-16 input.
/// \param out  The output iterator to which the UTF-8 bytes are written.
/// \param swapper  A function applied to each code unit as it is read (see nop_swapper()
///   and byte_swapper()).
/// \returns The output iterator advanced past the bytes that were written.
template <typename InputIt, typename OutputIt, typename Swapper>
auto utf16_to_utf8 (InputIt first, InputIt last, OutputIt out, Swapper swapper) -> OutputIt {
    while (first != last) {
        char32_t code_point = 0;
        std::tie (first, code_point) = utf16_to_code_point (first, last, swapper);
        // The encoder is named code_point_to_utf8: there is no function named
        // code_unit_to_utf8, so calling it made every instantiation of this template fail
        // to compile.
        out = code_point_to_utf8 (code_point, out);
    }
    return out;
}
287 
/// Converts the UTF-16 sequence [first, last) to UTF-8 and returns the result as a
/// container.
///
/// \tparam ResultType  A default-constructible container type which supports
///   std::back_inserter.
/// \param first  The start of the UTF-16 input.
/// \param last  The end of the UTF-16 input.
/// \param swapper  A function applied to each code unit as it is read.
/// \returns A container holding the UTF-8 output.
template <typename ResultType, typename InputIt, typename Swapper>
auto utf16_to_utf8 (InputIt first, InputIt last, Swapper swapper) -> ResultType {
    ResultType encoded;
    auto sink = std::back_inserter (encoded);
    utf16_to_utf8 (first, last, sink, swapper);
    return encoded;
}
294 
/// Converts the whole of the UTF-16 container \p src to UTF-8 and returns the result as a
/// container.
///
/// \tparam ResultType  The container type to be returned.
/// \param src  The UTF-16 input: any type accepted by std::begin() and std::end().
/// \param swapper  A function applied to each code unit as it is read.
/// \returns A container holding the UTF-8 output.
template <typename ResultType, typename InputType, typename Swapper>
auto utf16_to_utf8 (InputType const & src, Swapper swapper) -> ResultType {
    auto const first = std::begin (src);
    auto const last = std::end (src);
    return utf16_to_utf8<ResultType> (first, last, swapper);
}
299 
300  } // end namespace utf
301 } // end namespace pstore
302 
303 
304 
305 namespace pstore {
306  namespace utf {
307 
/// Tests whether \p c begins a UTF-8 encoded code point. Bits 6 and 7 of a continuation
/// byte are 0b10; any other pattern marks the start of a code point.
///
/// \param c  A UTF-8 code unit.
/// \returns False if \p c is a continuation byte, true otherwise.
template <typename CharType>
constexpr auto is_utf_char_start (CharType c) noexcept -> bool {
    // Work with the unsigned equivalent so that a negative char does not sign-extend.
    using unsigned_type = typename std::make_unsigned<CharType>::type;
    return ((static_cast<unsigned_type> (c) >> 6U) & 0x3U) != 0x2U;
}
316 
319 
320  template <typename Iterator>
321  auto length (Iterator first, Iterator last) -> std::size_t {
322  auto const result =
323  std::count_if (first, last, [] (char const c) { return is_utf_char_start (c); });
324  PSTORE_ASSERT (result >= 0);
325  using utype = typename std::make_unsigned<decltype (result)>::type;
326  static_assert (std::numeric_limits<utype>::max () <=
327  std::numeric_limits<std::size_t>::max (),
328  "std::size_t cannot hold result of count_if");
329  return static_cast<std::size_t> (result);
330  }
331 
/// Counts the UTF-8 code points in \p span.
///
/// \param span  A range of UTF-8 code units: any type with begin() and end() members.
/// \returns The number of code points in the span.
template <typename SpanType>
auto length (SpanType span) -> std::size_t {
    auto const first = span.begin ();
    auto const last = span.end ();
    return length (first, last);
}
336 
343  auto length (char const * str, std::size_t nbytes) -> std::size_t;
344 
346  auto length (gsl::czstring str) -> std::size_t;
347  auto length (std::string const & str) -> std::size_t;
348 
/// The overload of length() selected for a literal nullptr.
///
/// \returns Always 0.
inline std::size_t length (std::nullptr_t) noexcept {
    return std::size_t{0};
}
351 
354 
358  auto index (gsl::czstring str, std::size_t pos) -> gsl::czstring;
359 
368  template <typename InputIterator>
369  auto index (InputIterator const first, InputIterator const last, std::size_t const pos)
370  -> InputIterator {
371  auto start_count = std::size_t{0};
372  return std::find_if (first, last, [&start_count, pos] (char const c) {
373  return is_utf_char_start (c) ? (start_count++ == pos) : false;
374  });
375  }
376 
/// Finds the code unit which begins the code point at index \p pos (counting from zero) in
/// the UTF-8 code units of \p span.
///
/// \param span  A range of UTF-8 code units with begin() and end() members and an
///   element_type member type.
/// \param pos  The index of the code point to be located.
/// \returns A pointer to the first code unit of code point \p pos, or nullptr if the span
///   holds \p pos or fewer code points.
template <typename SpanType>
auto index (SpanType const span, std::size_t const pos) -> typename SpanType::element_type * {
    auto const last = span.end ();
    auto const found = index (span.begin (), last, pos);
    if (found == last) {
        return nullptr;
    }
    return &*found;
}
391 
393 
394 
404  auto slice (gsl::czstring str, std::ptrdiff_t start, std::ptrdiff_t end)
405  -> std::pair<std::ptrdiff_t, std::ptrdiff_t>;
406 
407 
408  using native_string = std::basic_string<TCHAR>;
409  using native_ostringstream = std::basic_ostringstream<TCHAR>;
410 
#if defined(_WIN32)
#    if defined(_UNICODE)
// Windows in "Unicode character set" mode: the native string type is std::wstring.

/// Converts \p str to a native string using win32::to16().
inline std::wstring to_native_string (std::string const & str) {
    return win32::to16 (str);
}
/// Converts the null-terminated string \p str to a native string using win32::to16().
inline std::wstring to_native_string (gsl::czstring const str) {
    return win32::to16 (str);
}
/// Converts the native string \p str to a std::string using win32::to8().
inline std::string from_native_string (std::wstring const & str) {
    return win32::to8 (str);
}
/// Converts the null-terminated native string \p str to a std::string using win32::to8().
inline std::string from_native_string (gsl::cwzstring const str) {
    return win32::to8 (str);
}
#    else
// This is Windows in "Multibyte character set" mode.

/// Converts \p str to a native string using win32::to_mbcs().
inline std::string to_native_string (std::string const & str) {
    return win32::to_mbcs (str);
}
/// Converts the null-terminated string \p str to a native string using win32::to_mbcs().
inline std::string to_native_string (gsl::czstring const str) {
    return win32::to_mbcs (str);
}
#    endif //_UNICODE

// The following two overloads are available on Windows in both character set modes.

/// Converts the multi-byte character set string \p str using win32::mbcs_to8().
inline std::string from_native_string (std::string const & str) {
    return win32::mbcs_to8 (str);
}
/// Converts the null-terminated multi-byte character set string \p str using
/// win32::mbcs_to8().
inline std::string from_native_string (gsl::czstring const str) {
    return win32::mbcs_to8 (str);
}
#else //_WIN32
// On other platforms no conversion is performed. Both functions return a reference to
// their argument, so the result must not outlive the string that was passed in.

/// \param str  The string to be "converted".
/// \returns A reference to \p str.
constexpr std::string const & to_native_string (std::string const & str) noexcept {
    return str;
}
/// \param str  The string to be "converted".
/// \returns A reference to \p str.
constexpr std::string const & from_native_string (std::string const & str) noexcept {
    return str;
}
#endif
449  } // end namespace utf
450 } // end namespace pstore
451 
452 #endif // PSTORE_SUPPORT_UTF_HPP
auto slice(gsl::czstring str, std::ptrdiff_t start, std::ptrdiff_t end) -> std::pair< std::ptrdiff_t, std::ptrdiff_t >
Converts codepoint indices start and end to byte offsets in the buffer at str.
Definition: utf.cpp:61
auto index(gsl::czstring str, std::size_t pos) -> gsl::czstring
Returns a pointer to the beginning of the pos'th UTF-8 codepoint in the buffer at str or nullptr if e...
Definition: utf.cpp:49
An implementation of the Haskell Maybe type.
Definition: gsl.hpp:589
constexpr auto is_utf_char_start(CharType c) noexcept -> bool
If the top two bits are 0b10, then this is a UTF-8 continuation byte and is skipped; other patterns i...
Definition: utf.hpp:312
auto length(Iterator first, Iterator last) -> std::size_t
Definition: utf.hpp:321
Definition: nonpod2.cpp:40
Definition: maybe.hpp:49
Definition: utf.hpp:129