14 #define FCITX_ISUTF8_CB(c) (((c) & 0xc0) == 0x80) 16 #define CONT(i) FCITX_ISUTF8_CB(in[i]) 17 #define VAL(i, s) ((in[i] & 0x3f) << s) 19 #define UTF8_LENGTH(Char) \ 26 : ((Char) < 0x200000 ? 4 \ 27 : ((Char) < 0x4000000 ? 5 : 6))))) 29 #define UNICODE_VALID(Char) \ 30 ((Char) < 0x110000 && (((Char) & 0xFFFFF800) != 0xD800)) 46 if (!(in[0] & 0x80)) {
51 if ((in[0] & 0xe0) == 0xc0 && CONT(1)) {
56 if ((in[0] & 0xf0) == 0xe0 && CONT(1) && CONT(2)) {
61 if ((in[0] & 0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3)) {
66 if ((in[0] & 0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4)) {
71 if ((in[0] & 0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) &&
101 output[0] = (char)(c & 0xFF);
106 output[0] = (char)(0xC0 + ((c >> 6) & 0x1F));
107 output[1] = (char)(0x80 + (c & 0x3F));
112 output[0] = (char)(0xE0 + ((c >> 12) & 0x0F));
113 output[1] = (char)(0x80 + ((c >> 6) & 0x3F));
114 output[2] = (char)(0x80 + (c & 0x3F));
119 output[0] = (char)(0xF0 + ((c >> 18) & 0x07));
120 output[1] = (char)(0x80 + ((c >> 12) & 0x3F));
121 output[2] = (char)(0x80 + ((c >> 6) & 0x3F));
122 output[3] = (char)(0x80 + (c & 0x3F));
128 output[0] = (char)(0xF8 + ((c >> 24) & 0x03));
129 output[1] = (char)(0x80 + ((c >> 18) & 0x3F));
130 output[2] = (char)(0x80 + ((c >> 12) & 0x3F));
131 output[3] = (char)(0x80 + ((c >> 6) & 0x3F));
132 output[4] = (char)(0x80 + (c & 0x3F));
136 output[0] = (char)(0xFC + ((c >> 30) & 0x01));
137 output[1] = (char)(0x80 + ((c >> 24) & 0x3F));
138 output[2] = (char)(0x80 + ((c >> 18) & 0x3F));
139 output[3] = (char)(0x80 + ((c >> 12) & 0x3F));
140 output[4] = (char)(0x80 + ((c >> 6) & 0x3F));
141 output[5] = (char)(0x80 + (c & 0x3F));
147 const auto *in =
reinterpret_cast<const unsigned char *
>(i);
148 if (!(in[0] & 0x80)) {
150 return (
char *)in + 1;
154 if ((in[0] & 0xe0) == 0xc0 && CONT(1)) {
155 *chr = ((in[0] & 0x1f) << 6) | VAL(1, 0);
156 return (
char *)in + 2;
160 if ((in[0] & 0xf0) == 0xe0 && CONT(1) && CONT(2)) {
161 *chr = ((in[0] & 0xf) << 12) | VAL(1, 6) | VAL(2, 0);
162 return (
char *)in + 3;
166 if ((in[0] & 0xf8) == 0xf0 && CONT(1) && CONT(2) && CONT(3)) {
167 *chr = ((in[0] & 0x7) << 18) | VAL(1, 12) | VAL(2, 6) | VAL(3, 0);
168 return (
char *)in + 4;
172 if ((in[0] & 0xfc) == 0xf8 && CONT(1) && CONT(2) && CONT(3) && CONT(4)) {
173 *chr = ((in[0] & 0x3) << 24) | VAL(1, 18) | VAL(2, 12) | VAL(3, 6) |
175 return (
char *)in + 5;
179 if ((in[0] & 0xfe) == 0xfc && CONT(1) && CONT(2) && CONT(3) && CONT(4) &&
181 *chr = ((in[0] & 0x1) << 30) | VAL(1, 24) | VAL(2, 18) | VAL(3, 12) |
182 VAL(4, 6) | VAL(5, 0);
183 return (
char *)in + 6;
188 return (
char *)in + 1;
194 while (*s && l < n) {
204 static uint32_t fcitx_utf8_get_char_extended(
const char *s,
int max_len,
206 const auto *p =
reinterpret_cast<const unsigned char *
>(s);
209 uint32_t wc =
static_cast<unsigned char>(*p);
223 }
else if (wc < 0xf0) {
226 }
else if (wc < 0xf8) {
229 }
else if (wc < 0xfc) {
232 }
else if (wc < 0xfe) {
239 if (max_len >= 0 && len > max_len) {
240 for (i = 1; i < max_len; i++) {
241 if ((((
unsigned char *)p)[i] & 0xc0) != 0x80) {
249 for (i = 1; i < len; ++i) {
250 uint32_t ch = ((
unsigned char *)p)[i];
252 if ((ch & 0xc0) != 0x80) {
264 if (UTF8_LENGTH(wc) != len) {
283 result = fcitx_utf8_get_char_extended(p, max_len, &len);
285 if (result & 0x80000000) {
288 if (!UNICODE_VALID(result)) {
320 size_t diff = next - s;
325 memcpy(str, s, diff);
339 while (byte && *str) {
342 str, (byte > FCITX_UTF8_MAX_LENGTH ? FCITX_UTF8_MAX_LENGTH : byte),
358 while (byte && *str) {
362 size_t diff = next - str;
void fcitx_utf8_strncpy(char *str, const char *s, size_t byte)
Copy at most the given number of bytes, but keep the UTF-8 valid.
char * fcitx_utf8_get_nth_char(const char *s, uint32_t n)
Get the pointer to the nth character.
C++ Utility functions for handling utf8 strings.
constexpr uint32_t INVALID_CHAR
Possible return value for getChar.
constexpr uint32_t NOT_ENOUGH_SPACE
Possible return value for getChar.
constexpr size_t INVALID_LENGTH
Possible return value of lengthValidated if the string is not valid.
uint32_t fcitx_utf8_get_char_validated(const char *p, int max_len, int *plen)
Get validated character.
size_t fcitx_utf8_strlen(const char *s)
Get utf8 string length.
C-style utf8 utility functions.
unsigned int fcitx_utf8_char_len(const char *in)
Get the number of bytes of the next character.
size_t fcitx_utf8_strnlen_validated(const char *str, size_t byte)
Count the UTF-8 string length within at most the given number of bytes, and validate the string.
size_t fcitx_utf8_strnlen(const char *str, size_t byte)
Count the UTF-8 string length within at most the given number of bytes.
int fcitx_ucs4_to_utf8(uint32_t c, char *output)
Convert a UCS-4 char to UTF-8; the output buffer must have enough memory for it.
int fcitx_ucs4_char_len(uint32_t c)
Return the number of UTF-8 bytes needed to encode a UCS-4 char.
char * fcitx_utf8_get_char(const char *i, uint32_t *chr)
Get UCS-4 char in the utf8 string.
bool fcitx_utf8_check_string(const char *s)
Check if the string is a valid UTF-8 string.