pstore2
Classes | Typedefs | Functions | Variables
utf.hpp File Reference

Functionality for processing UTF-8 strings. More...

#include <algorithm>
#include <cstddef>
#include <iosfwd>
#include "pstore/support/gsl.hpp"
#include "pstore/support/maybe.hpp"
Include dependency graph for utf.hpp:
This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes

class  pstore::utf::utf8_decoder
 

Typedefs

using TCHAR = char
 
using pstore::utf::utf8_string = std::basic_string< std::uint8_t >
 
using pstore::utf::utf16_string = std::basic_string< char16_t >
 
using pstore::utf::native_string = std::basic_string< TCHAR >
 
using pstore::utf::native_ostringstream = std::basic_ostringstream< TCHAR >
 

Functions

auto pstore::utf::operator<< (std::ostream &os, utf8_string const &s) -> std::ostream &
 
template<typename CharType = char, typename OutputIt >
auto pstore::utf::code_point_to_utf8 (char32_t c, OutputIt out) -> OutputIt
 
template<typename CharType = char, typename OutputIt >
auto pstore::utf::replacement_char (OutputIt out) -> OutputIt
 
template<typename ResultType >
auto pstore::utf::code_point_to_utf8 (char32_t c) -> ResultType
 
constexpr auto pstore::utf::nop_swapper (std::uint16_t const v) noexcept -> std::uint16_t
 
constexpr auto pstore::utf::byte_swapper (std::uint16_t const v) noexcept -> std::uint16_t
 
constexpr auto pstore::utf::is_utf16_high_surrogate (std::uint16_t const code_unit) noexcept -> bool
 
constexpr auto pstore::utf::is_utf16_low_surrogate (std::uint16_t const code_unit) noexcept -> bool
 
template<typename InputIterator , typename SwapperFunction >
auto pstore::utf::utf16_to_code_point (InputIterator first, InputIterator last, SwapperFunction swapper) -> std::pair< InputIterator, char32_t >
 
template<typename InputIt , typename OutputIt , typename Swapper >
auto pstore::utf::utf16_to_code_points (InputIt first, InputIt last, OutputIt out, Swapper swapper) -> OutputIt
 
template<typename ResultType , typename InputIt , typename Swapper >
auto pstore::utf::utf16_to_code_points (InputIt first, InputIt last, Swapper swapper) -> ResultType
 
template<typename ResultType , typename InputType , typename Swapper >
auto pstore::utf::utf16_to_code_points (InputType const &src, Swapper swapper) -> ResultType
 
template<typename InputType , typename Swapper >
auto pstore::utf::utf16_to_code_point (InputType const &src, Swapper swapper) -> char32_t
 
template<typename InputIt , typename OutputIt , typename Swapper >
auto pstore::utf::utf16_to_utf8 (InputIt first, InputIt last, OutputIt out, Swapper swapper) -> OutputIt
 
template<typename ResultType , typename InputIt , typename Swapper >
auto pstore::utf::utf16_to_utf8 (InputIt first, InputIt last, Swapper swapper) -> ResultType
 
template<typename ResultType , typename InputType , typename Swapper >
auto pstore::utf::utf16_to_utf8 (InputType const &src, Swapper swapper) -> ResultType
 
template<typename CharType >
constexpr auto pstore::utf::is_utf_char_start (CharType c) noexcept -> bool
 If the top two bits are 0b10, then this is a UTF-8 continuation byte and is skipped; other patterns in these top two bits represent the start of a character. More...
 
auto pstore::utf::slice (gsl::czstring str, std::ptrdiff_t start, std::ptrdiff_t end) -> std::pair< std::ptrdiff_t, std::ptrdiff_t >
 Converts codepoint indices start and end to byte offsets in the buffer at str. More...
 
constexpr auto pstore::utf::to_native_string (std::string const &str) noexcept -> std::string const &
 
constexpr auto pstore::utf::from_native_string (std::string const &str) noexcept -> std::string const &
 
template<typename Iterator >
auto pstore::utf::length (Iterator first, Iterator last) -> std::size_t
 
template<typename SpanType >
auto pstore::utf::length (SpanType span) -> std::size_t
 
auto pstore::utf::length (char const *str, std::size_t nbytes) -> std::size_t
 Returns the number of UTF-8 code points in the buffer given by a start address and length. More...
 
auto pstore::utf::length (gsl::czstring str) -> std::size_t
 Returns the number of UTF-8 code points in the null-terminated buffer at str.
 
auto pstore::utf::length (std::string const &str) -> std::size_t
 
auto pstore::utf::length (std::nullptr_t) noexcept -> std::size_t
 
auto pstore::utf::index (gsl::czstring str, std::size_t pos) -> gsl::czstring
 Returns a pointer to the beginning of the pos'th UTF-8 codepoint in the buffer at str or nullptr if either str is nullptr or if pos was too large. More...
 
template<typename InputIterator >
auto pstore::utf::index (InputIterator const first, InputIterator const last, std::size_t const pos) -> InputIterator
 Returns an iterator to the beginning of the pos'th UTF-8 codepoint in the range given by first and last. More...
 
template<typename SpanType >
auto pstore::utf::index (SpanType const span, std::size_t const pos) -> typename SpanType::element_type *
 Returns a pointer to the beginning of the pos'th UTF-8 codepoint in the supplied span. More...
 

Variables

constexpr char32_t pstore::utf::replacement_char_code_point = 0xFFFD
 

Detailed Description

Functionality for processing UTF-8 strings.

On Windows, provides an additional set of functions to convert UTF-8 strings to and from UTF-16.

Function Documentation

◆ index() [1/3]

auto pstore::utf::index ( gsl::czstring  str,
std::size_t  pos 
) -> gsl::czstring

Returns a pointer to the beginning of the pos'th UTF-8 codepoint in the buffer at str or nullptr if either str is nullptr or if pos was too large.

Returns a reference to the beginning of the pos'th UTF-8 code-point in a sequence.

◆ index() [2/3]

template<typename InputIterator >
auto pstore::utf::index ( InputIterator const  first,
InputIterator const  last,
std::size_t const  pos 
) -> InputIterator

Returns an iterator to the beginning of the pos'th UTF-8 codepoint in the range given by first and last.

Parameters
first: The start of the range of elements to examine
last: The end of the range of elements to examine
pos: The number of code points to move
Returns
An iterator that is 'pos' codepoints after the start of the range or 'last' if the end of the range was encountered.

◆ index() [3/3]

template<typename SpanType >
auto pstore::utf::index ( SpanType const  span,
std::size_t const  pos 
) -> typename SpanType::element_type *

Returns a pointer to the beginning of the pos'th UTF-8 codepoint in the supplied span.

Parameters
span: A span of memory containing a sequence of UTF-8 codepoints.
pos: The number of code points to move
Returns
A pointer that is 'pos' codepoints after the start of span or nullptr if the end of the range was encountered.

◆ is_utf_char_start()

template<typename CharType >
constexpr auto pstore::utf::is_utf_char_start ( CharType  c) -> bool
noexcept

If the top two bits are 0b10, then this is a UTF-8 continuation byte and is skipped; other patterns in these top two bits represent the start of a character.

◆ length() [1/2]

template<typename Iterator >
auto pstore::utf::length ( Iterator  first,
Iterator  last 
) -> std::size_t

Returns the number of UTF-8 code-points in a sequence.

◆ length() [2/2]

auto pstore::utf::length ( char const *  str,
std::size_t  nbytes 
) -> std::size_t

Returns the number of UTF-8 code points in the buffer given by a start address and length.

Parameters
str: The buffer start address.
nbytes: The number of bytes in the buffer.
Returns
The number of UTF-8 code points in the buffer given by 'str' and 'nbytes'.

◆ slice()

auto pstore::utf::slice ( gsl::czstring  str,
std::ptrdiff_t  start,
std::ptrdiff_t  end 
) -> std::pair<std::ptrdiff_t, std::ptrdiff_t>

Converts codepoint indices start and end to byte offsets in the buffer at str.

Parameters
str: A UTF-8 encoded character string.
start: The code-point index of the start of a character range within the string 'str'.
end: The code-point index of the end of a character range within the string 'str'.
Returns
A pair containing the byte offset of the start UTF-8 code-unit and the byte offset of the end UTF-8 code-unit. Either value may be -1 if they were out-of-range.