Fcitx
utf8.cpp
1 /*
2  * SPDX-FileCopyrightText: 2016-2016 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  *
6  */
7 
8 #include "utf8.h"
9 #include <cassert>
10 #include <cstdint>
11 #include <string>
12 #include <string_view>
13 #include "cutf8.h"
14 
15 namespace fcitx::utf8 {
16 
// Report whether a UCS-4 value is acceptable: it must lie below 0x110000
// and must not fall inside the UTF-16 surrogate block 0xD800..0xDFFF.
bool UCS4IsValid(uint32_t code) {
    constexpr uint32_t codeSpaceEnd = 0x110000;
    constexpr uint32_t surrogateFirst = 0xD800;
    constexpr uint32_t surrogateLast = 0xDFFF;

    if (code >= codeSpaceEnd) {
        return false;
    }
    const bool isSurrogate = code >= surrogateFirst && code <= surrogateLast;
    return !isSurrogate;
}
20 
21 std::string UCS4ToUTF8(uint32_t code) {
22  if (!code) {
23  return "";
24  }
25  char buf[FCITX_UTF8_MAX_LENGTH + 1];
26  auto length = fcitx_ucs4_to_utf8(code, buf);
27  return {buf, buf + length};
28 }
29 
30 bool replaceInvalidInplace(std::string &str, char replacement) {
31  auto iter = str.begin();
32  auto end = str.end();
33  bool replaced = false;
34  while (iter != end) {
35  uint32_t chr;
36  auto next = getNextChar(iter, end, &chr);
37  if (isValidChar(chr)) {
38  assert(next != iter);
39  iter = next;
40  } else if (chr == INVALID_CHAR) {
41  *iter = replacement;
42  ++iter;
43  replaced = true;
44  } else if (chr == NOT_ENOUGH_SPACE) {
45  *iter = replacement;
46  ++iter;
47  while (iter != end && isContinuationByte(*iter)) {
48  *iter = replacement;
49  ++iter;
50  }
51  replaced = true;
52  }
53  }
54  return replaced;
55 }
56 
57 std::string replaceInvalid(std::string_view str, char replacement) {
58  std::string result(str);
59  replaceInvalidInplace(result, replacement);
60  return result;
61 }
62 
63 #ifdef _WIN32
64 
65 std::string UTF16ToUTF8(std::wstring_view data) {
66  std::string result;
67  for (size_t i = 0; i < data.size();) {
68  uint32_t ucs4 = 0;
69  uint16_t chr = data[i];
70  uint16_t chrNext = (i + 1 == data.size()) ? 0 : data[i + 1];
71  if (chr < 0xD800 || chr > 0xDFFF) {
72  ucs4 = chr;
73  i += 1;
74  } else if (0xD800 <= chr && chr <= 0xDBFF) {
75  if (!chrNext) {
76  return {};
77  }
78  if (0xDC00 <= chrNext && chrNext <= 0xDFFF) {
79  /* We have a valid surrogate pair. */
80  ucs4 = (((chr & 0x3FF) << 10) | (chrNext & 0x3FF)) + (1 << 16);
81  i += 2;
82  }
83  } else if (0xDC00 <= chr && chr <= 0xDFFF) {
84  return {};
85  }
86  result.append(utf8::UCS4ToUTF8(ucs4));
87  }
88  return result;
89 }
90 
91 std::wstring UTF8ToUTF16(std::string_view str) {
92  if (!utf8::validate(str)) {
93  return {};
94  }
95  std::wstring result;
96  for (const auto ucs4 : utf8::MakeUTF8CharRange(str)) {
97  if (ucs4 < 0x10000) {
98  result.push_back(static_cast<uint16_t>(ucs4));
99  } else if (ucs4 < 0x110000) {
100  result.push_back(0xD800 | (((ucs4 - 0x10000) >> 10) & 0x3ff));
101  result.push_back(0xDC00 | (ucs4 & 0x3ff));
102  } else {
103  return {};
104  }
105  }
106  return result;
107 }
108 
109 #endif
110 
111 } // namespace fcitx::utf8
bool isContinuationByte(char c)
Check if the byte is a utf8 continuation byte.
Definition: utf8.h:337
std::string UCS4ToUTF8(uint32_t code)
Convert UCS4 to UTF8 string.
Definition: utf8.cpp:21
size_t length(Iter start, Iter end)
Return the number of UTF-8 characters in the string iterator range.
Definition: utf8.h:33
bool validate(Iter start, Iter end)
Check if the string iterator range is a valid UTF-8 string.
Definition: utf8.h:74
C++ Utility functions for handling utf8 strings.
bool isValidChar(uint32_t c)
Check that the chr value is neither of the two invalid values above.
Definition: utf8.h:97
constexpr uint32_t INVALID_CHAR
Possible return value for getChar.
Definition: utf8.h:91
constexpr uint32_t NOT_ENOUGH_SPACE
Possible return value for getChar.
Definition: utf8.h:94
C-style utf8 utility functions.
bool UCS4IsValid(uint32_t code)
Check if a ucs4 is valid.
Definition: utf8.cpp:17
bool replaceInvalidInplace(std::string &str, char replacement)
Replace invalid UTF-8 sequences in-place with given byte.
Definition: utf8.cpp:30
int fcitx_ucs4_to_utf8(uint32_t c, char *output)
Convert ucs4 char to utf8, need to have enough memory for it.
Definition: cutf8.cpp:98
std::string replaceInvalid(std::string_view str, char replacement)
Replace invalid UTF-8 sequences with given byte.
Definition: utf8.cpp:57