15 #define CONT(i) fcitx::utf8::isContinuationByte(in[i]) 16 #define VAL(i, s) ((in[i] & 0x3f) << s) 18 #define UTF8_LENGTH(Char) \ 25 : ((Char) < 0x200000 ? 4 \ 26 : ((Char) < 0x4000000 ? 5 : 6))))) 28 #define UNICODE_VALID(Char) \ 29 ((Char) < 0x110000 && (((Char) & 0xFFFFF800) != 0xD800)) 45 if (!(in[0] & 0x80)) {
50 if ((in[0] & 0xe0) == 0xc0 && CONT(1)) {
55 if ((in[0] & 0xf0) == 0xe0 && CONT(1) && CONT(2)) {
60 if ((in[0] & 0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3)) {
65 if ((in[0] & 0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4)) {
70 if ((in[0] & 0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) &&
100 output[0] = (char)(c & 0xFF);
105 output[0] = (char)(0xC0 + ((c >> 6) & 0x1F));
106 output[1] = (char)(0x80 + (c & 0x3F));
111 output[0] = (char)(0xE0 + ((c >> 12) & 0x0F));
112 output[1] = (char)(0x80 + ((c >> 6) & 0x3F));
113 output[2] = (char)(0x80 + (c & 0x3F));
118 output[0] = (char)(0xF0 + ((c >> 18) & 0x07));
119 output[1] = (char)(0x80 + ((c >> 12) & 0x3F));
120 output[2] = (char)(0x80 + ((c >> 6) & 0x3F));
121 output[3] = (char)(0x80 + (c & 0x3F));
127 output[0] = (char)(0xF8 + ((c >> 24) & 0x03));
128 output[1] = (char)(0x80 + ((c >> 18) & 0x3F));
129 output[2] = (char)(0x80 + ((c >> 12) & 0x3F));
130 output[3] = (char)(0x80 + ((c >> 6) & 0x3F));
131 output[4] = (char)(0x80 + (c & 0x3F));
135 output[0] = (char)(0xFC + ((c >> 30) & 0x01));
136 output[1] = (char)(0x80 + ((c >> 24) & 0x3F));
137 output[2] = (char)(0x80 + ((c >> 18) & 0x3F));
138 output[3] = (char)(0x80 + ((c >> 12) & 0x3F));
139 output[4] = (char)(0x80 + ((c >> 6) & 0x3F));
140 output[5] = (char)(0x80 + (c & 0x3F));
146 const auto *in =
reinterpret_cast<const unsigned char *
>(i);
147 if (!(in[0] & 0x80)) {
149 return (
char *)in + 1;
153 if ((in[0] & 0xe0) == 0xc0 && CONT(1)) {
154 *chr = ((in[0] & 0x1f) << 6) | VAL(1, 0);
155 return (
char *)in + 2;
159 if ((in[0] & 0xf0) == 0xe0 && CONT(1) && CONT(2)) {
160 *chr = ((in[0] & 0xf) << 12) | VAL(1, 6) | VAL(2, 0);
161 return (
char *)in + 3;
165 if ((in[0] & 0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3)) {
166 *chr = ((in[0] & 0x7) << 18) | VAL(1, 12) | VAL(2, 6) | VAL(3, 0);
167 return (
char *)in + 4;
171 if ((in[0] & 0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4)) {
172 *chr = ((in[0] & 0x3) << 24) | VAL(1, 18) | VAL(2, 12) | VAL(3, 6) |
174 return (
char *)in + 5;
178 if ((in[0] & 0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) &&
180 *chr = ((in[0] & 0x1) << 30) | VAL(1, 24) | VAL(2, 18) | VAL(3, 12) |
181 VAL(4, 6) | VAL(5, 0);
182 return (
char *)in + 6;
187 return (
char *)in + 1;
193 while (*s && l < n) {
203 static uint32_t fcitx_utf8_get_char_extended(
const char *s,
int max_len,
205 const auto *p =
reinterpret_cast<const unsigned char *
>(s);
208 uint32_t wc =
static_cast<unsigned char>(*p);
222 }
else if (wc < 0xf0) {
225 }
else if (wc < 0xf8) {
228 }
else if (wc < 0xfc) {
231 }
else if (wc < 0xfe) {
238 if (max_len >= 0 && len > max_len) {
239 for (i = 1; i < max_len; i++) {
240 if ((((
unsigned char *)p)[i] & 0xc0) != 0x80) {
248 for (i = 1; i < len; ++i) {
249 uint32_t ch = ((
unsigned char *)p)[i];
251 if ((ch & 0xc0) != 0x80) {
263 if (UTF8_LENGTH(wc) != len) {
282 result = fcitx_utf8_get_char_extended(p, max_len, &len);
284 if (result & 0x80000000) {
287 if (!UNICODE_VALID(result)) {
319 size_t diff = next - s;
324 memcpy(str, s, diff);
338 while (byte && *str) {
341 str, (byte > FCITX_UTF8_MAX_LENGTH ? FCITX_UTF8_MAX_LENGTH : byte),
357 while (byte && *str) {
361 size_t diff = next - str;
void fcitx_utf8_strncpy(char *str, const char *s, size_t byte)
Copy at most byte bytes, but keep the UTF-8 valid.
char * fcitx_utf8_get_nth_char(const char *s, uint32_t n)
Get the pointer to the nth character.
C++ Utility functions for handling utf8 strings.
constexpr uint32_t INVALID_CHAR
Possible return value for getChar.
constexpr uint32_t NOT_ENOUGH_SPACE
Possible return value for getChar.
constexpr size_t INVALID_LENGTH
Possible return value of lengthValidated if the string is not valid.
uint32_t fcitx_utf8_get_char_validated(const char *p, int max_len, int *plen)
Get validated character.
size_t fcitx_utf8_strlen(const char *s)
Get utf8 string length.
C-style utf8 utility functions.
unsigned int fcitx_utf8_char_len(const char *in)
Get the number of bytes of next character.
size_t fcitx_utf8_strnlen_validated(const char *str, size_t byte)
Count the UTF-8 string length within at most byte bytes, and validate the string.
size_t fcitx_utf8_strnlen(const char *str, size_t byte)
Count the UTF-8 string length within at most byte bytes.
int fcitx_ucs4_to_utf8(uint32_t c, char *output)
Convert a UCS-4 character to UTF-8; the output buffer needs to have enough memory for it.
int fcitx_ucs4_char_len(uint32_t c)
Return the number of UTF-8 bytes of a UCS-4 character.
char * fcitx_utf8_get_char(const char *i, uint32_t *chr)
Get UCS-4 char in the utf8 string.
bool fcitx_utf8_check_string(const char *s)
Check whether the string is a valid UTF-8 string.