crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Utf8.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Utf8.hpp
24  *
25  * Namespace for global UTF-8 helper functions.
26  *
27  * Created on: Dec 10, 2018
28  * Author: ans
29  */
30 
31 #ifndef HELPER_UTF8_HPP_
32 #define HELPER_UTF8_HPP_
33 
34 #include "../Main/Exception.hpp"
35 
36 #include "../_extern/utf8/source/utf8.h"
37 
38 #include <cstddef> // std::size_t
39 #include <string> // std::string
40 #include <string_view> // std::string_view
41 
44 
45  /*
46  * CONSTANTS
47  */
48 
51 
//! Factor for guessing the maximum amount of memory used for UTF-8 compared to ISO-8859-1.
53  inline constexpr auto utf8MemoryFactor{2};
54 
//! Bit mask to extract the first bit of a multibyte character.
56  inline constexpr auto bitmaskTopBit{0x80};
57 
//! Bit mask to extract the top two bits of a multibyte character.
59  inline constexpr auto bitmaskTopTwoBits{0xc0};
60 
//! Shift six bits.
62  inline constexpr auto shiftSixBits{6};
63 
//! Bit mask covering the lowest six bits of a byte (0x3F); iso88591ToUtf8 ANDs it with a byte to build the continuation byte.
65  inline constexpr auto bitmaskLastSixBits0b000001{0x3F};
66 
//! One byte.
68  inline constexpr auto oneByte{1};
69 
//! Two bytes.
71  inline constexpr auto twoBytes{2};
72 
//! Three bytes.
74  inline constexpr auto threeBytes{3};
75 
//! Four bytes.
77  inline constexpr auto fourBytes{4};
78 
80 
81  /*
82  * DECLARATION
83  */
84 
87 
// Conversion: converts a string from ISO-8859-1 to UTF-8.
88  std::string iso88591ToUtf8(std::string_view strIn);
89 
93 
// Validation: checks whether a whole string, its last character, or a
//  string consisting of exactly one code point is valid UTF-8.
94  bool isValidUtf8(std::string_view stringToCheck, std::string& errTo);
95  bool isLastCharValidUtf8(std::string_view stringToCheck);
96  bool isSingleUtf8Char(std::string_view stringToCheck);
97 
101 
// Repair: replaces invalid UTF-8 sequences and reports whether any were found.
102  bool repairUtf8(std::string_view strIn, std::string& strOut);
103 
107 
// Length: returns the number of UTF-8 code points (not bytes) in a string.
108  std::size_t length(std::string_view str);
109 
111 
112  /*
113  * EXCEPTION CLASS
114  */
115 
117 
123 
124  /*
125  * IMPLEMENTATION
126  */
127 
128  /*
129  * CONVERSION
130  */
131 
133 
139  inline std::string iso88591ToUtf8(std::string_view strIn) {
140  std::string strOut;
141 
142  // guess maximum memory requirement
143  strOut.reserve(strIn.size() * utf8MemoryFactor);
144 
145  for(const uint8_t c : strIn) {
146  if(c < bitmaskTopBit) {
147  strOut.push_back(c);
148  }
149  else {
150  strOut.push_back(bitmaskTopTwoBits | c >> shiftSixBits);
151  strOut.push_back(bitmaskTopBit | (c & bitmaskLastSixBits0b000001));
152  }
153  }
154 
155  return strOut;
156  }
157 
158  /*
159  * VALIDATION
160  */
161 
163 
177  inline bool isValidUtf8(std::string_view stringToCheck, std::string& errTo) {
178  try {
179  return utf8::is_valid(stringToCheck.cbegin(), stringToCheck.cend());
180  }
181  catch(const utf8::exception& e) {
182  errTo = e.what();
183 
184  return false;
185  }
186  }
187 
189 
204  inline bool isLastCharValidUtf8(const std::string& stringToCheck) {
205  if(stringToCheck.empty()) {
206  return true;
207  }
208 
209  // check for valid one-byte character
210  auto pos{stringToCheck.size() - 1};
211 
212  if(utf8::is_valid(stringToCheck.substr(pos, oneByte))) {
213  return true;
214  }
215 
216  if(stringToCheck.size() < twoBytes) {
217  return false;
218  }
219 
220  --pos;
221 
222  // check for valid two-byte character
223  if(utf8::is_valid(stringToCheck.substr(pos, twoBytes))) {
224  return true;
225  }
226 
227  if(stringToCheck.size() < threeBytes) {
228  return false;
229  }
230 
231  --pos;
232 
233  // check for valid three-byte character
234  if(utf8::is_valid(stringToCheck.substr(pos, threeBytes))) {
235  return true;
236  }
237 
238  if(stringToCheck.size() < fourBytes) {
239  return false;
240  }
241 
242  --pos;
243 
244  // check for valid four-byte character
245  if(utf8::is_valid(stringToCheck.substr(pos, fourBytes))) {
246  return true;
247  }
248 
249  return false;
250  }
251 
253 
263  inline bool isSingleUtf8Char(std::string_view stringToCheck) {
264  return utf8::distance(stringToCheck.begin(), stringToCheck.end()) == 1;
265  }
266 
267  /*
268  * REPAIR
269  */
270 
272 
294  inline bool repairUtf8(std::string_view strIn, std::string& strOut) {
295  try {
296  if(utf8::is_valid(strIn.cbegin(), strIn.cend())) {
297  return false;
298  }
299 
300  utf8::replace_invalid(strIn.begin(), strIn.end(), back_inserter(strOut));
301 
302  return true;
303  }
304  catch(const utf8::exception& e) {
305  throw Exception("UTF-8 error: " + std::string(e.what()));
306  }
307  }
308 
309  /*
310  * LENGTH
311  */
312 
327  inline std::size_t length(std::string_view str) {
328  constexpr unsigned char maxOneByte{127};
329  constexpr unsigned char checkTwoBytes{0xE0};
330  constexpr unsigned char isTwoBytes{0xC0};
331  constexpr unsigned char checkThreeBytes{0xF0};
332  constexpr unsigned char isThreeBytes{0xE0};
333  constexpr unsigned char checkFourBytes{0xF8};
334  constexpr unsigned char isFourBytes{0xF0};
335 
336  constexpr std::size_t skipTwoBytes{2};
337  constexpr std::size_t skipThreeBytes{3};
338 
339  std::size_t result{};
340 
341  const auto bytes{
342  str.length()
343  };
344 
345  for(std::size_t pos{}; pos < bytes; ++pos) {
346  ++result;
347 
348  const unsigned char byte{
349  static_cast<unsigned char>(str[pos])
350  };
351 
352  if(byte <= maxOneByte) {
353  // one byte
354  continue;
355  }
356 
357  if((byte & checkTwoBytes) == isTwoBytes) {
358  // two bytes
359  ++pos;
360  }
361  else if((byte & checkThreeBytes) == isThreeBytes) {
362  // three bytes
363  pos += skipTwoBytes;
364  }
365  else if((byte & checkFourBytes) == isFourBytes) {
366  // four bytes
367  pos += skipThreeBytes;
368  }
369  else {
370  std::string exceptionString{"Invalid UTF-8 in '"};
371 
372  exceptionString += str;
373  exceptionString += "'";
374 
375  throw Exception(exceptionString);
376  }
377  }
378 
379  return result;
380  }
381 
382 } /* namespace crawlservpp::Helper::Utf8 */
383 
384 #endif /* HELPER_UTF8_HPP_ */
constexpr auto utf8MemoryFactor
Factor for guessing the maximum amount of memory used for UTF-8 compared to ISO-8859-1.
Definition: Utf8.hpp:53
constexpr auto bitmaskTopBit
Bit mask to extract the first bit of a multibyte character.
Definition: Utf8.hpp:56
bool isValidUtf8(std::string_view stringToCheck, std::string &errTo)
Checks whether a string contains valid UTF-8.
Definition: Utf8.hpp:177
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr auto threeBytes
Three bytes.
Definition: Utf8.hpp:74
std::string iso88591ToUtf8(std::string_view strIn)
Converts a string from ISO-8859-1 to UTF-8.
Definition: Utf8.hpp:139
constexpr auto oneByte
One byte.
Definition: Utf8.hpp:68
bool isLastCharValidUtf8(std::string_view stringToCheck)
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
bool repairUtf8(std::string_view strIn, std::string &strOut)
Replaces invalid UTF-8 characters in the given string and returns whether invalid characters occurred...
Definition: Utf8.hpp:294
Class for UTF-8 exceptions.
Definition: Utf8.hpp:122
constexpr auto fourBytes
Four bytes.
Definition: Utf8.hpp:77
constexpr auto twoBytes
Two bytes.
Definition: Utf8.hpp:71
Namespace for global UTF-8 encoding functions.
Definition: Utf8.hpp:43
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
constexpr auto bitmaskTopTwoBits
Bit mask to extract the top two bits of a multibyte character.
Definition: Utf8.hpp:59
constexpr auto bitmaskLastSixBits0b000001
Bit mask (0x3F) to extract the last six bits of a byte.
Definition: Utf8.hpp:65
constexpr auto shiftSixBits
Shift six bits.
Definition: Utf8.hpp:62
bool isSingleUtf8Char(std::string_view stringToCheck)
Returns whether the given string contains exactly one UTF-8 code point.
Definition: Utf8.hpp:263