31 #ifndef HELPER_UTF8_HPP_ 32 #define HELPER_UTF8_HPP_ 34 #include "../Main/Exception.hpp" 36 #include "../_extern/utf8/source/utf8.h" 40 #include <string_view> 94 bool isValidUtf8(std::string_view stringToCheck, std::string& errTo);
102 bool repairUtf8(std::string_view strIn, std::string& strOut);
108 std::size_t
length(std::string_view str);
145 for(
const uint8_t c : strIn) {
177 inline bool isValidUtf8(std::string_view stringToCheck, std::string& errTo) {
179 return utf8::is_valid(stringToCheck.cbegin(), stringToCheck.cend());
181 catch(
const utf8::exception& e) {
205 if(stringToCheck.empty()) {
210 auto pos{stringToCheck.size() - 1};
212 if(utf8::is_valid(stringToCheck.substr(pos,
oneByte))) {
216 if(stringToCheck.size() <
twoBytes) {
223 if(utf8::is_valid(stringToCheck.substr(pos,
twoBytes))) {
234 if(utf8::is_valid(stringToCheck.substr(pos,
threeBytes))) {
245 if(utf8::is_valid(stringToCheck.substr(pos,
fourBytes))) {
264 return utf8::distance(stringToCheck.begin(), stringToCheck.end()) == 1;
294 inline bool repairUtf8(std::string_view strIn, std::string& strOut) {
296 if(utf8::is_valid(strIn.cbegin(), strIn.cend())) {
300 utf8::replace_invalid(strIn.begin(), strIn.end(), back_inserter(strOut));
304 catch(
const utf8::exception& e) {
305 throw Exception(
"UTF-8 error: " + std::string(e.what()));
327 inline std::size_t
length(std::string_view str) {
328 constexpr
unsigned char maxOneByte{127};
329 constexpr
unsigned char checkTwoBytes{0xE0};
330 constexpr
unsigned char isTwoBytes{0xC0};
331 constexpr
unsigned char checkThreeBytes{0xF0};
332 constexpr
unsigned char isThreeBytes{0xE0};
333 constexpr
unsigned char checkFourBytes{0xF8};
334 constexpr
unsigned char isFourBytes{0xF0};
336 constexpr std::size_t skipTwoBytes{2};
337 constexpr std::size_t skipThreeBytes{3};
339 std::size_t result{};
345 for(std::size_t pos{}; pos <
bytes; ++pos) {
348 const unsigned char byte{
349 static_cast<unsigned char>(str[pos])
352 if(byte <= maxOneByte) {
357 if((byte & checkTwoBytes) == isTwoBytes) {
361 else if((byte & checkThreeBytes) == isThreeBytes) {
365 else if((byte & checkFourBytes) == isFourBytes) {
367 pos += skipThreeBytes;
370 std::string exceptionString{
"Invalid UTF-8 in '"};
372 exceptionString += str;
373 exceptionString +=
"'";
constexpr auto utf8MemoryFactor
Factor for guessing the maximum amount of memory used for UTF-8 compared to ISO-8859-1.
Definition: Utf8.hpp:53
constexpr auto bitmaskTopBit
Bit mask to extract the first bit of a multibyte character.
Definition: Utf8.hpp:56
bool isValidUtf8(std::string_view stringToCheck, std::string &errTo)
Checks whether a string contains valid UTF-8.
Definition: Utf8.hpp:177
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr auto threeBytes
Three bytes.
Definition: Utf8.hpp:74
std::string iso88591ToUtf8(std::string_view strIn)
Converts a string from ISO-8859-1 to UTF-8.
Definition: Utf8.hpp:139
constexpr auto oneByte
One byte.
Definition: Utf8.hpp:68
bool isLastCharValidUtf8(std::string_view stringToCheck)
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
bool repairUtf8(std::string_view strIn, std::string &strOut)
Replaces invalid UTF-8 characters in the given string and returns whether invalid characters occurred...
Definition: Utf8.hpp:294
Class for UTF-8 exceptions.
Definition: Utf8.hpp:122
constexpr auto fourBytes
Four bytes.
Definition: Utf8.hpp:77
constexpr auto twoBytes
Two bytes.
Definition: Utf8.hpp:71
Namespace for global UTF-8 encoding functions.
Definition: Utf8.hpp:43
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
constexpr auto bitmaskTopTwoBits
Bit mask to extract the top two bits of a multibyte character.
Definition: Utf8.hpp:59
constexpr auto bitmaskLastSixBits0b000001
Bit mask to check the last six bits for 0b000001.
Definition: Utf8.hpp:65
constexpr auto shiftSixBits
Shift six bits.
Definition: Utf8.hpp:62
bool isSingleUtf8Char(std::string_view stringToCheck)
Returns whether the given string contains exactly one UTF-8 code point.
Definition: Utf8.hpp:263