7 #ifndef _FCITX_UTILS_UTF8_H_
8 #define _FCITX_UTILS_UTF8_H_
15 #include <sys/types.h>
21 #include <string_view>
24 #include <fcitx-utils/fcitxutils_export.h>
25 #include <fcitx-utils/macros.h>
26 #include <fcitx-utils/misc.h>
32 template <
typename Iter>
33 inline size_t length(Iter start, Iter end) {
41 return length(std::begin(s), std::end(s));
46 inline size_t length(
const T &s,
size_t start,
size_t end) {
47 return length(std::next(std::begin(s), start),
48 std::next(std::begin(s), end));
59 template <
typename Iter>
73 template <
typename Iter>
81 return validate(std::begin(s), std::end(s));
85 FCITXUTILS_EXPORT std::string
UCS4ToUTF8(uint32_t code);
103 template <
typename Iter>
104 inline uint32_t
getChar(Iter iter, Iter end) {
105 const char *c = &(*iter);
110 template <
typename T>
112 return getChar(std::begin(s), std::end(s));
115 template <
typename Iter>
116 inline Iter getNextChar(Iter iter, Iter end, uint32_t *chr) {
117 const char *c = &(*iter);
120 return std::next(iter, plen);
127 template <
typename Iter>
129 const char *c = &(*iter);
134 template <
typename Iter>
140 template <
typename Iter>
145 template <
typename Iter>
146 uint32_t getLastChar(Iter iter, Iter end) {
148 while (iter != end) {
149 iter = getNextChar(iter, end, &c);
157 template <
typename T>
158 uint32_t getLastChar(
const T &str) {
159 return getLastChar(std::begin(str), std::end(str));
164 template <
typename Iter>
167 using iterator_category = std::input_iterator_tag;
168 using value_type = uint32_t;
169 using difference_type = std::ptrdiff_t;
170 using reference =
const value_type &;
171 using pointer =
const value_type *;
173 UTF8CharIterator(Iter iter, Iter end) : iter_(iter), end_(end) { update(); }
176 reference operator*()
const {
return currentChar_; }
178 pointer operator->()
const {
return &currentChar_; }
180 std::pair<Iter, Iter> charRange()
const {
return {iter_, next_}; }
182 size_t charLength()
const {
return std::distance(iter_, next_); }
184 std::string_view view()
const {
185 return std::string_view{&*iter_, charLength()};
201 return iter_ == other.iter_;
204 return !operator==(other);
209 next_ = getNextChar(iter_, end_, &currentChar_);
210 if (iter_ != end_ && iter_ == next_) {
211 throw std::runtime_error(
"Invalid UTF8 character.");
215 uint32_t currentChar_ = 0;
221 template <
typename Iter>
222 auto MakeUTF8CharIterator(Iter iter, Iter end) {
226 template <
typename T>
227 auto MakeUTF8CharRange(
const T &str) {
228 return MakeIterRange(MakeUTF8CharIterator(std::begin(str), std::end(str)),
229 MakeUTF8CharIterator(std::end(str), std::end(str)));
232 template <
typename Iter>
235 using iterator_category = std::input_iterator_tag;
236 using value_type = std::string_view;
237 using difference_type = std::ptrdiff_t;
238 using reference =
const value_type &;
239 using pointer =
const value_type *;
246 reference operator*()
const {
return currentView_; }
248 pointer operator->()
const {
return &currentView_; }
250 size_t charLength()
const {
return currentView_.size(); }
252 uint32_t chr()
const {
return currentChar_; }
267 return iter_ == other.iter_;
270 return !operator==(other);
275 next_ = getNextChar(iter_, end_, &currentChar_);
276 if (iter_ != end_ && iter_ == next_) {
277 throw std::runtime_error(
"Invalid UTF8 character.");
279 currentView_ = std::string_view(&*iter_, std::distance(iter_, next_));
282 std::string_view currentView_;
283 uint32_t currentChar_ = 0;
289 template <
typename Iter>
290 auto MakeUTF8StringViewIterator(Iter iter, Iter end) {
294 template <
typename T>
295 auto MakeUTF8StringViewRange(
const T &str) {
296 return MakeIterRange(
297 MakeUTF8StringViewIterator(std::begin(str), std::end(str)),
298 MakeUTF8StringViewIterator(std::end(str), std::end(str)));
303 std::string UTF16ToUTF8(std::wstring_view data);
304 std::wstring UTF8ToUTF16(std::string_view str);
310 #endif // _FCITX_UTILS_UTF8_H_
bool validate(const T &s)
Check if the string is a valid UTF-8 string.
char * fcitx_utf8_get_nth_char(const char *s, uint32_t n)
Get the pointer to the nth character.
std::string UCS4ToUTF8(uint32_t code)
Convert UCS4 to UTF8 string.
size_t lengthValidated(const T &s)
Validate and return the number of UTF-8 characters in the string.
bool isValidChar(uint32_t c)
Check that the chr value is neither of the two invalid values (INVALID_CHAR or NOT_ENOUGH_SPACE).
Iter nextNChar(Iter iter, size_t n)
Move iter over the next n characters.
constexpr uint32_t INVALID_CHAR
Possible return value for getChar.
constexpr uint32_t NOT_ENOUGH_SPACE
Possible return value for getChar.
constexpr size_t INVALID_LENGTH
Possible return value of lengthValidated if the string is not valid.
uint32_t fcitx_utf8_get_char_validated(const char *p, int max_len, int *plen)
Get validated character.
C-style utf8 utility functions.
bool UCS4IsValid(uint32_t code)
Check if a ucs4 is valid.
Iter nextChar(Iter iter)
Move iter over the next character.
size_t fcitx_utf8_strnlen_validated(const char *str, size_t byte)
Count the UTF-8 string length within at most `byte` bytes, and validate the string.
size_t fcitx_utf8_strnlen(const char *str, size_t byte)
Count the UTF-8 string length within at most `byte` bytes.
uint32_t getChar(const T &s)
Get next UCS4 char, may return INVALID_CHAR or NOT_ENOUGH_SPACE.
Helper class to iterate over characters; you need to validate the string before using it.
size_t length(const T &s, size_t start, size_t end)
Return the number of UTF-8 characters in the string.
ssize_t ncharByteLength(Iter iter, size_t n)
Get the byte length of the next N UTF-8 characters.