12 #include <string_view> 18 return ((code) < 0x110000 && (((code) & 0xFFFFF800) != 0xD800));
25 char buf[FCITX_UTF8_MAX_LENGTH + 1];
27 return {buf, buf +
length};
31 auto iter = str.begin();
33 bool replaced =
false;
36 auto next = getNextChar(iter, end, &chr);
58 std::string result(str);
65 std::string UTF16ToUTF8(std::wstring_view data) {
67 for (
size_t i = 0; i < data.size();) {
69 uint16_t chr = data[i];
70 uint16_t chrNext = (i + 1 == data.size()) ? 0 : data[i + 1];
71 if (chr < 0xD800 || chr > 0xDFFF) {
74 }
else if (0xD800 <= chr && chr <= 0xDBFF) {
78 if (0xDC00 <= chrNext && chrNext <= 0xDFFF) {
80 ucs4 = (((chr & 0x3FF) << 10) | (chrNext & 0x3FF)) + (1 << 16);
83 }
else if (0xDC00 <= chr && chr <= 0xDFFF) {
91 std::wstring UTF8ToUTF16(std::string_view str) {
96 for (
const auto ucs4 : utf8::MakeUTF8CharRange(str)) {
98 result.push_back(static_cast<uint16_t>(ucs4));
99 }
else if (ucs4 < 0x110000) {
100 result.push_back(0xD800 | (((ucs4 - 0x10000) >> 10) & 0x3ff));
101 result.push_back(0xDC00 | (ucs4 & 0x3ff));
bool isContinuationByte(char c)
Check if the byte is a utf8 continuation byte.
std::string UCS4ToUTF8(uint32_t code)
Convert a UCS4 code point to a UTF-8 string.
size_t length(Iter start, Iter end)
Return the number of UTF-8 characters in the string iterator range.
bool validate(Iter start, Iter end)
Check if the string iterator range is a valid UTF-8 string.
C++ Utility functions for handling utf8 strings.
bool isValidChar(uint32_t c)
Check that the chr value is neither of the two invalid values (INVALID_CHAR, NOT_ENOUGH_SPACE).
constexpr uint32_t INVALID_CHAR
Possible return value for getChar.
constexpr uint32_t NOT_ENOUGH_SPACE
Possible return value for getChar.
C-style utf8 utility functions.
bool UCS4IsValid(uint32_t code)
Check if a UCS4 code point is valid.
bool replaceInvalidInplace(std::string &str, char replacement)
Replace invalid UTF-8 sequences in-place with given byte.
int fcitx_ucs4_to_utf8(uint32_t c, char *output)
Convert a UCS4 character to UTF-8; the output buffer must have enough memory for it.
std::string replaceInvalid(std::string_view str, char replacement)
Replace invalid UTF-8 sequences with given byte.