Fcitx
utf8.h
Go to the documentation of this file.
1 /*
2  * SPDX-FileCopyrightText: 2015-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  *
6  */
7 #ifndef _FCITX_UTILS_UTF8_H_
8 #define _FCITX_UTILS_UTF8_H_
9 
10 /// \addtogroup FcitxUtils
11 /// \{
12 /// \file
13 /// \brief C++ Utility functions for handling utf8 strings.
14 
15 #include <sys/types.h>
16 #include <cstddef>
17 #include <cstdint>
18 #include <iterator>
19 #include <stdexcept>
20 #include <string>
21 #include <string_view>
22 #include <utility>
23 #include <fcitx-utils/cutf8.h>
24 #include <fcitx-utils/fcitxutils_export.h>
25 #include <fcitx-utils/macros.h>
26 #include <fcitx-utils/misc.h>
27 
28 namespace fcitx::utf8 {
29 
/// \brief Count the UTF-8 characters in the iterator range [start, end).
///
/// The range is handed to fcitx_utf8_strnlen() as the address of *start plus
/// the byte distance to end, so it is expected to refer to contiguous char
/// storage.
/// \see lengthValidated()
template <typename Iter>
inline size_t length(Iter start, Iter end) {
    const char *first = &(*start);
    const auto byteCount = std::distance(start, end);
    return fcitx_utf8_strnlen(first, byteCount);
}
36 
/// \brief Count the UTF-8 characters in a whole string-like container.
///
/// Forwards std::begin(s) / std::end(s) to the iterator-range overload.
/// \see lengthValidated()
template <typename T>
inline size_t length(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return length(first, last);
}
43 
/// \brief Count the UTF-8 characters in a sub-range of the string.
///
/// \param s string-like container.
/// \param start offset from std::begin(s) where counting starts.
/// \param end offset from std::begin(s) where counting stops.
template <typename T>
inline size_t length(const T &s, size_t start, size_t end) {
    const auto origin = std::begin(s);
    auto from = std::next(origin, start);
    auto to = std::next(origin, end);
    return length(from, to);
}
50 
/// \brief Sentinel returned by lengthValidated() when the string is not valid
/// UTF-8; it is the largest value representable by size_t.
/// \see lengthValidated()
constexpr size_t INVALID_LENGTH = SIZE_MAX;
54 
/// \brief Validate the iterator range [start, end) and count its UTF-8
/// characters.
///
/// The range is passed to fcitx_utf8_strnlen_validated() as the address of
/// *start plus the byte distance to end.
///
/// Will return INVALID_LENGTH if string is not a valid utf8 string.
template <typename Iter>
inline size_t lengthValidated(Iter start, Iter end) {
    const char *first = &(*start);
    const auto byteCount = std::distance(start, end);
    return fcitx_utf8_strnlen_validated(first, byteCount);
}
63 
/// \brief Validate a whole string-like container and count its UTF-8
/// characters.
///
/// Will return INVALID_LENGTH if string is not a valid utf8 string.
template <typename T>
inline size_t lengthValidated(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return lengthValidated(first, last);
}
71 
72 /// \brief Check if the string iterator range is valid utf8 string
73 template <typename Iter>
74 inline bool validate(Iter start, Iter end) {
75  return lengthValidated(start, end) != INVALID_LENGTH;
76 }
77 
/// \brief Tell whether a whole string-like container holds valid UTF-8.
///
/// Forwards std::begin(s) / std::end(s) to the iterator-range overload.
template <typename T>
inline bool validate(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return validate(first, last);
}
83 
84 /// \brief Convert UCS4 to UTF8 string.
85 FCITXUTILS_EXPORT std::string UCS4ToUTF8(uint32_t code);
86 
87 /// \brief Check if a ucs4 is valid.
88 FCITXUTILS_EXPORT bool UCS4IsValid(uint32_t code);
89 
/// \brief Sentinel that getChar() may return; equals the largest uint32_t.
constexpr uint32_t INVALID_CHAR = UINT32_MAX;

/// \brief Sentinel that getChar() may return; equals the largest uint32_t
/// minus one.
constexpr uint32_t NOT_ENOUGH_SPACE = UINT32_MAX - 1;

/// \brief Tell whether \p c is an actual character value, i.e. neither
/// INVALID_CHAR nor NOT_ENOUGH_SPACE.
inline bool isValidChar(uint32_t c) {
    const bool isSentinel = (c == INVALID_CHAR) || (c == NOT_ENOUGH_SPACE);
    return !isSentinel;
}
100 
/// \brief Decode the UCS4 character that starts at \p iter without reading
/// past \p end.
///
/// The bytes are passed to fcitx_utf8_get_char_validated() together with the
/// byte distance to \p end. May return INVALID_CHAR or NOT_ENOUGH_SPACE.
template <typename Iter>
inline uint32_t getChar(Iter iter, Iter end) {
    const auto remaining = std::distance(iter, end);
    const char *bytes = &(*iter);
    // The consumed byte count is not needed here, hence the null out-pointer.
    return fcitx_utf8_get_char_validated(bytes, remaining, nullptr);
}
108 
/// \brief Decode the first UCS4 character of a string-like container.
///
/// May return INVALID_CHAR or NOT_ENOUGH_SPACE.
template <typename T>
inline uint32_t getChar(const T &s) {
    auto first = std::begin(s);
    auto last = std::end(s);
    return getChar(first, last);
}
114 
/// \brief Decode the character at \p iter and return the iterator advanced
/// past it.
///
/// \param iter position of the character to decode.
/// \param end end of the readable range; decoding is limited to it.
/// \param chr receives the result of fcitx_utf8_get_char_validated(), the same
/// helper getChar() uses, so it may be INVALID_CHAR or NOT_ENOUGH_SPACE.
/// \return \p iter advanced by the byte count reported by the helper. The
/// count starts at 0, so \p iter comes back unchanged if the helper does not
/// set it.
template <typename Iter>
inline Iter getNextChar(Iter iter, Iter end, uint32_t *chr) {
    int consumed = 0;
    const char *bytes = &(*iter);
    const auto remaining = std::distance(iter, end);
    *chr = fcitx_utf8_get_char_validated(bytes, remaining, &consumed);
    return std::next(iter, consumed);
}
122 
123 /// \brief get the byte length of next N utf-8 character.
124 ///
125 /// This function has no error check on invalid string or end of string. Check
126 /// the string before use it.
127 template <typename Iter>
128 inline ssize_t ncharByteLength(Iter iter, size_t n) {
129  const char *c = &(*iter);
130  return fcitx_utf8_get_nth_char(c, n) - c;
131 }
132 
/// \brief Return \p iter advanced over the next \p n characters.
///
/// The byte offset comes from ncharByteLength(), which performs no validity
/// or end-of-string checks.
template <typename Iter>
inline Iter nextNChar(Iter iter, size_t n) {
    const auto byteOffset = ncharByteLength(iter, n);
    return std::next(iter, byteOffset);
}
138 
/// \brief Return \p iter advanced over exactly one character.
///
/// Equivalent to nextNChar(iter, 1).
template <typename Iter>
Iter nextChar(Iter iter) {
    const size_t single = 1;
    return nextNChar(iter, single);
}
144 
145 template <typename Iter>
146 uint32_t getLastChar(Iter iter, Iter end) {
147  uint32_t c = NOT_ENOUGH_SPACE;
148  while (iter != end) {
149  iter = getNextChar(iter, end, &c);
150  if (!isValidChar(c)) {
151  break;
152  }
153  }
154  return c;
155 }
156 
/// \brief Return the last character of a string-like container.
///
/// Forwards std::begin(str) / std::end(str) to the iterator-range overload.
template <typename T>
uint32_t getLastChar(const T &str) {
    auto first = std::begin(str);
    auto last = std::end(str);
    return getLastChar(first, last);
}
161 
162 /// \brief Helper class to iterate character, you need to validate the string
163 /// before using it.
164 template <typename Iter>
166 public:
167  using iterator_category = std::input_iterator_tag;
168  using value_type = uint32_t;
169  using difference_type = std::ptrdiff_t;
170  using reference = const value_type &;
171  using pointer = const value_type *;
172 
173  UTF8CharIterator(Iter iter, Iter end) : iter_(iter), end_(end) { update(); }
174  FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(UTF8CharIterator)
175 
176  reference operator*() const { return currentChar_; }
177 
178  pointer operator->() const { return &currentChar_; }
179 
180  std::pair<Iter, Iter> charRange() const { return {iter_, next_}; }
181 
182  size_t charLength() const { return std::distance(iter_, next_); }
183 
184  std::string_view view() const {
185  return std::string_view{&*iter_, charLength()};
186  }
187 
188  UTF8CharIterator &operator++() {
189  iter_ = next_;
190  update();
191  return *this;
192  }
193 
194  UTF8CharIterator operator++(int) {
195  auto old = *this;
196  ++(*this);
197  return old;
198  }
199 
200  bool operator==(const UTF8CharIterator &other) {
201  return iter_ == other.iter_;
202  }
203  bool operator!=(const UTF8CharIterator &other) {
204  return !operator==(other);
205  }
206 
207 private:
208  void update() {
209  next_ = getNextChar(iter_, end_, &currentChar_);
210  if (iter_ != end_ && iter_ == next_) {
211  throw std::runtime_error("Invalid UTF8 character.");
212  }
213  }
214 
215  uint32_t currentChar_ = 0;
216  Iter iter_;
217  Iter next_;
218  Iter end_;
219 };
220 
221 template <typename Iter>
222 auto MakeUTF8CharIterator(Iter iter, Iter end) {
223  return UTF8CharIterator<Iter>(iter, end);
224 }
225 
/// \brief Build an iterable range of UTF8CharIterator covering all of
/// \p str.
///
/// The end iterator is a UTF8CharIterator positioned at std::end(str).
template <typename T>
auto MakeUTF8CharRange(const T &str) {
    auto first = std::begin(str);
    auto last = std::end(str);
    return MakeIterRange(MakeUTF8CharIterator(first, last),
                         MakeUTF8CharIterator(last, last));
}
231 
232 template <typename Iter>
234 public:
235  using iterator_category = std::input_iterator_tag;
236  using value_type = std::string_view;
237  using difference_type = std::ptrdiff_t;
238  using reference = const value_type &;
239  using pointer = const value_type *;
240 
241  UTF8StringViewIter(Iter iter, Iter end) : iter_(iter), end_(end) {
242  update();
243  }
244  FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(UTF8StringViewIter)
245 
246  reference operator*() const { return currentView_; }
247 
248  pointer operator->() const { return &currentView_; }
249 
250  size_t charLength() const { return currentView_.size(); }
251 
252  uint32_t chr() const { return currentChar_; }
253 
254  UTF8StringViewIter &operator++() {
255  iter_ = next_;
256  update();
257  return *this;
258  }
259 
260  UTF8StringViewIter operator++(int) {
261  auto old = *this;
262  ++(*this);
263  return old;
264  }
265 
266  bool operator==(const UTF8StringViewIter &other) {
267  return iter_ == other.iter_;
268  }
269  bool operator!=(const UTF8StringViewIter &other) {
270  return !operator==(other);
271  }
272 
273 private:
274  void update() {
275  next_ = getNextChar(iter_, end_, &currentChar_);
276  if (iter_ != end_ && iter_ == next_) {
277  throw std::runtime_error("Invalid UTF8 character.");
278  }
279  currentView_ = std::string_view(&*iter_, std::distance(iter_, next_));
280  }
281 
282  std::string_view currentView_;
283  uint32_t currentChar_ = 0;
284  Iter iter_;
285  Iter next_;
286  Iter end_;
287 };
288 
289 template <typename Iter>
290 auto MakeUTF8StringViewIterator(Iter iter, Iter end) {
291  return UTF8StringViewIter<Iter>(iter, end);
292 }
293 
/// \brief Build an iterable range of UTF8StringViewIter covering all of
/// \p str.
///
/// The end iterator is a UTF8StringViewIter positioned at std::end(str).
template <typename T>
auto MakeUTF8StringViewRange(const T &str) {
    auto first = std::begin(str);
    auto last = std::end(str);
    return MakeIterRange(MakeUTF8StringViewIterator(first, last),
                         MakeUTF8StringViewIterator(last, last));
}
300 
301 #ifdef _WIN32
302 
303 std::string UTF16ToUTF8(std::wstring_view data);
304 std::wstring UTF8ToUTF16(std::string_view str);
305 
306 #endif
307 
308 } // namespace fcitx::utf8
309 
310 #endif // _FCITX_UTILS_UTF8_H_
bool validate(const T &s)
Check if the string is valid utf8 string.
Definition: utf8.h:80
char * fcitx_utf8_get_nth_char(const char *s, uint32_t n)
Get the pointer to the nth character.
Definition: cutf8.cpp:191
std::string UCS4ToUTF8(uint32_t code)
Convert UCS4 to UTF8 string.
Definition: utf8.cpp:19
size_t lengthValidated(const T &s)
Validate and return the number of UTF-8 characters in the string.
Definition: utf8.h:68
bool isValidChar(uint32_t c)
Check that the chr value is neither of the two invalid values above.
Definition: utf8.h:97
Iter nextNChar(Iter iter, size_t n)
Move iter over the next n characters.
Definition: utf8.h:135
constexpr uint32_t INVALID_CHAR
Possible return value for getChar.
Definition: utf8.h:91
constexpr uint32_t NOT_ENOUGH_SPACE
Possible return value for getChar.
Definition: utf8.h:94
constexpr size_t INVALID_LENGTH
Possible return value of lengthValidated if the string is not valid.
Definition: utf8.h:53
uint32_t fcitx_utf8_get_char_validated(const char *p, int max_len, int *plen)
Get validated character.
Definition: cutf8.cpp:275
C-style utf8 utility functions.
bool UCS4IsValid(uint32_t code)
Check if a ucs4 is valid.
Definition: utf8.cpp:15
Iter nextChar(Iter iter)
Move iter over next one character.
Definition: utf8.h:141
size_t fcitx_utf8_strnlen_validated(const char *str, size_t byte)
Count most byte length, utf8 string length and validates the string.
Definition: cutf8.cpp:337
size_t fcitx_utf8_strnlen(const char *str, size_t byte)
Count most byte length, utf8 string length.
Definition: cutf8.cpp:355
uint32_t getChar(const T &s)
Get next UCS4 char, may return INVALID_CHAR or NOT_ENOUGH_SPACE.
Definition: utf8.h:111
Helper class to iterate character, you need to validate the string before using it.
Definition: utf8.h:165
size_t length(const T &s, size_t start, size_t end)
Return the number of UTF-8 characters in the string.
Definition: utf8.h:46
ssize_t ncharByteLength(Iter iter, size_t n)
Get the byte length of the next N UTF-8 characters.
Definition: utf8.h:128