Fcitx
utf8.cpp
1 /*
2  * SPDX-FileCopyrightText: 2016-2016 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  *
6  */
7 
8 #include "utf8.h"
9 #include <cstdint>
10 #include <string>
11 #include "cutf8.h"
12 
13 namespace fcitx::utf8 {
14 
/// Tell whether \p code is a usable Unicode scalar value.
///
/// A value is rejected when it is 0x110000 or greater, or when it lies in
/// the UTF-16 surrogate range 0xD800-0xDFFF.
bool UCS4IsValid(uint32_t code) {
    if (code >= 0x110000) {
        return false;
    }
    // Masking off the low 11 bits maps every surrogate (0xD800-0xDFFF) to
    // exactly 0xD800 and nothing else.
    const bool isSurrogate = (code & 0xFFFFF800) == 0xD800;
    return !isSurrogate;
}
18 
19 std::string UCS4ToUTF8(uint32_t code) {
20  if (!code) {
21  return "";
22  }
23  char buf[FCITX_UTF8_MAX_LENGTH + 1];
24  auto length = fcitx_ucs4_to_utf8(code, buf);
25  return {buf, buf + length};
26 }
27 
28 #ifdef _WIN32
29 
30 std::string UTF16ToUTF8(std::wstring_view data) {
31  std::string result;
32  for (size_t i = 0; i < data.size();) {
33  uint32_t ucs4 = 0;
34  uint16_t chr = data[i];
35  uint16_t chrNext = (i + 1 == data.size()) ? 0 : data[i + 1];
36  if (chr < 0xD800 || chr > 0xDFFF) {
37  ucs4 = chr;
38  i += 1;
39  } else if (0xD800 <= chr && chr <= 0xDBFF) {
40  if (!chrNext) {
41  return {};
42  }
43  if (0xDC00 <= chrNext && chrNext <= 0xDFFF) {
44  /* We have a valid surrogate pair. */
45  ucs4 = (((chr & 0x3FF) << 10) | (chrNext & 0x3FF)) + (1 << 16);
46  i += 2;
47  }
48  } else if (0xDC00 <= chr && chr <= 0xDFFF) {
49  return {};
50  }
51  result.append(utf8::UCS4ToUTF8(ucs4));
52  }
53  return result;
54 }
55 
56 std::wstring UTF8ToUTF16(std::string_view str) {
57  if (!utf8::validate(str)) {
58  return {};
59  }
60  std::wstring result;
61  for (const auto ucs4 : utf8::MakeUTF8CharRange(str)) {
62  if (ucs4 < 0x10000) {
63  result.push_back(static_cast<uint16_t>(ucs4));
64  } else if (ucs4 < 0x110000) {
65  result.push_back(0xD800 | (((ucs4 - 0x10000) >> 10) & 0x3ff));
66  result.push_back(0xDC00 | (ucs4 & 0x3ff));
67  } else {
68  return {};
69  }
70  }
71  return result;
72 }
73 
74 #endif
75 
76 } // namespace fcitx::utf8
std::string UCS4ToUTF8(uint32_t code)
Convert UCS4 to UTF8 string.
Definition: utf8.cpp:19
size_t length(Iter start, Iter end)
Return the number of UTF-8 characters in the string iterator range.
Definition: utf8.h:33
bool validate(Iter start, Iter end)
Check if the string iterator range is a valid UTF-8 string.
Definition: utf8.h:74
C++ Utility functions for handling utf8 strings.
C-style utf8 utility functions.
bool UCS4IsValid(uint32_t code)
Check if a ucs4 is valid.
Definition: utf8.cpp:15
int fcitx_ucs4_to_utf8(uint32_t c, char *output)
Convert ucs4 char to utf8, need to have enough memory for it.
Definition: cutf8.cpp:99