Implementation of the ctype methods on Unicode codepoints. More...

#include <unicode.h>

Static Public Member Functions
static size_t	utf8_bytes (const void *sequence)
	Return the number of bytes that the UTF-8 character starting at sequence occupies. More...

static size_t	utf8_bytes (uint32_t codepoint)
	Return the number of bytes necessary to convert the codepoint into UTF-8. More...

static size_t	isutf8 (const void *utf8_start, const void *end_of_buffer)
	Examine the UTF-8 sequence to determine whether or not it is valid UTF-8. More...

static uint32_t	utf8_to_codepoint (const void *utf8_start, const void *end_of_buffer, size_t &bytes_consumed)
	Convert the UTF-8 sequence into a Unicode codepoint (i.e. decode the UTF-8). More...

static size_t	codepoint_to_utf8 (void *utf8_start, const void *end_of_buffer, uint32_t codepoint)
	Encode the Unicode codepoint in UTF-8 into buffer utf8_start, not exceeding end_of_buffer. More...

static void	tocasefold (std::vector< uint32_t > &casefolded, uint32_t codepoint)
	Strip all accents, non-alphanumerics, and then casefold. More...

static const uint32_t *	tocasefold (uint32_t codepoint)
	Return a pointer to a 0 terminated array of codepoints that is the casefolded normalised codepoint. More...

static int	isalpha (uint32_t codepoint)
	Unicode version of isalpha(). More...

static int	isalnum (uint32_t codepoint)
	Unicode version of isalnum(). More...

static int	isupper (uint32_t codepoint)
	Unicode version of isupper(). More...

static int	islower (uint32_t codepoint)
	Unicode version of islower(). More...

static int	iscntrl (uint32_t codepoint)
	Unicode version of iscntrl(). More...

static int	isdigit (uint32_t codepoint)
	Unicode version of isdigit(). More...

static int	isgraph (uint32_t codepoint)
	Unicode version of isgraph(). More...

static int	ispunct (uint32_t codepoint)
	Unicode version of ispunct(). More...

static int	isspace (uint32_t codepoint)
	Unicode version of isspace(), by the "C" isspace() and Unicode definition. More...

static int	isuspace (uint32_t codepoint)
	Unicode version of isspace(), by the Unicode definition. More...

static int	isxdigit (uint32_t codepoint)
	Unicode version of isxdigit(). More...

static int	ismark (uint32_t codepoint)
	Check to see if the codepoint is a mark. More...

static int	issymbol (uint32_t codepoint)
	Check to see if the codepoint is a symbol. More...

static int	isxmlnamestartchar (uint32_t codepoint)
	Check to see if the codepoint is a valid character to start an XML tag name with. More...

static int	isxmlnamechar (uint32_t codepoint)
	Check to see if the codepoint is a valid character to follow a NameStartChar in an XML tag name. More...

static void	unittest (void)
	Unit test this class.

Static Public Attributes
static constexpr uint32_t	replacement_character = 0xFFFD
	bad UTF-8 characters become codepoint U+FFFD (the Unicode replacement character designed for that purpose).

static constexpr size_t	max_casefold_expansion_factor = 18
	The maximum number of codepoints a case-folded codepoint can take.

static constexpr size_t	max_utf8_bytes = 4
	The maximum number of bytes that a UTF-8 encoded codepoint can take.

static constexpr size_t	max_codepoint = 0x10FFFF
	The highest valid Unicode codepoint.

Detailed Description

Implementation of the ctype methods on Unicode codepoints.

Member Function Documentation

◆ codepoint_to_utf8()

static size_t JASS::unicode::codepoint_to_utf8	(	void *	utf8_start,
		const void *	end_of_buffer,
		uint32_t	codepoint
	)

inline static

Encode the Unicode codepoint in UTF-8 into buffer utf8_start, not exceeding end_of_buffer.

Parameters

utf8_start	[out] The start of the UTF-8 sequence.
end_of_buffer	[in] The end of the buffer to contain the UTF-8 sequence (not necessarily the end of the UTF-8 sequence).
codepoint	[in] The codepoint to turn into UTF-8.

Returns: The number of bytes written into utf8_start, or 0 on error (e.g. overrun would occur, or invalid codepoint).

◆ isalnum()

static int JASS::unicode::isalnum ( uint32_t codepoint )

inline static

Unicode version of isalnum().

Character is of the general Unicode category "Alphabetic" or of the general Unicode category "Nd, Nl, No". That is, the character is alphanumeric.

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if alphanumeric, else false.

◆ isalpha()

static int JASS::unicode::isalpha ( uint32_t codepoint )

inline static

Unicode version of isalpha().

Character is of the general Unicode category "Alphabetic".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if alphabetic, else false.

◆ iscntrl()

static int JASS::unicode::iscntrl ( uint32_t codepoint )

inline static

Unicode version of iscntrl().

Character is of the general Unicode category "Cc".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a control character, else false.

◆ isdigit()

static int JASS::unicode::isdigit ( uint32_t codepoint )

inline static

Unicode version of isdigit().

Character is of the general Unicode category "Nd, Nl, No". These categories include numeric characters that are not digits, but this routine maintains its name for backwards compatibility with the "C" routine of the same name.

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a numeric character, else false.

◆ isgraph()

static int JASS::unicode::isgraph ( uint32_t codepoint )

inline static

Unicode version of isgraph().

Character is of the general Unicode category "L, M, N, P, S, Zs".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a graphical character, else false.

◆ islower()

static int JASS::unicode::islower ( uint32_t codepoint )

inline static

Unicode version of islower().

Character is of the general Unicode category "Unicode lowercase".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if lowercase alphabetic, else false.

◆ ismark()

static int JASS::unicode::ismark ( uint32_t codepoint )

inline static

Check to see if the codepoint is a mark.

Character is of the general Unicode category "M".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a mark character, else false.

◆ ispunct()

static int JASS::unicode::ispunct ( uint32_t codepoint )

inline static

Unicode version of ispunct().

Character is of the general Unicode category "Pd, Ps, Pe, Pc, Po, Pi, Pf".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a punctuation character, else false.

◆ isspace()

static int JASS::unicode::isspace ( uint32_t codepoint )

inline static

Unicode version of isspace(), by the "C" isspace() and Unicode definition.

Character is "Part of C0(tab, vertical tab, form feed, carriage return, and linefeed characters), Zs, Zl, Zp, and NEL(U+0085)".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a space character, else false.

◆ issymbol()

static int JASS::unicode::issymbol ( uint32_t codepoint )

inline static

Check to see if the codepoint is a symbol.

Character is of the general Unicode category "S".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a symbol character, else false.

◆ isupper()

static int JASS::unicode::isupper ( uint32_t codepoint )

inline static

Unicode version of isupper().

Character is of the general Unicode category "Unicode uppercase".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if uppercase alphabetic, else false.

◆ isuspace()

static int JASS::unicode::isuspace ( uint32_t codepoint )

inline static

Unicode version of isspace(), by the Unicode definition.

Character is "Zs".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a space character, else false.

◆ isutf8()

static size_t JASS::unicode::isutf8	(	const void *	utf8_start,
		const void *	end_of_buffer
	)

inline static

Examine the UTF-8 sequence to determine whether or not it is valid UTF-8.

Parameters

utf8_start	[in] The start of the UTF-8 sequence.
end_of_buffer	[in] The end of the buffer containing the UTF-8 sequence (not necessarily the end of the UTF-8 sequence).

Returns: The length of the UTF-8 sequence, or 0 on bad sequence. That is, true or false.

◆ isxdigit()

static int JASS::unicode::isxdigit ( uint32_t codepoint )

inline static

Unicode version of isxdigit().

Character is of the general Unicode category "Hex_Digit or ASCII_Hex_Digit".

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if a hex character, else false.

◆ isxmlnamechar()

static int JASS::unicode::isxmlnamechar ( uint32_t codepoint )

inline static

Check to see if the codepoint is a valid character to follow a NameStartChar in an XML tag name.

According to XML production 4a, valid Unicode characters are (NameStartChar | "-" | "." | [0-9] | #xB7 | [#x0300-#x036F] | [#x203F-#x2040]). For details see: "Extensible Markup Language (XML) 1.0 (Fifth Edition) W3C Recommendation 26 November 2008", https://www.w3.org/TR/REC-xml

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if an XML NameChar character, else false.

◆ isxmlnamestartchar()

static int JASS::unicode::isxmlnamestartchar ( uint32_t codepoint )

inline static

Check to see if the codepoint is a valid character to start an XML tag name with.

Parameters

codepoint [in] The Unicode codepoint to check.

Returns: true if an XML NameStartChar character, else false.

◆ tocasefold() [1/2]

static void JASS::unicode::tocasefold	(	std::vector< uint32_t > &	casefolded,
		uint32_t	codepoint
	)

inline static

Strip all accents, non-alphanumerics, and then casefold.

This is the JASS character normalisation method. It converts to Unicode "NFKD", strips all non-alpha-numerics, then performs Unicode casefolding "C+F". As Unicode decomposition is involved (and casefolding) the resulting string can be considerably larger than a single codepoint. The worst case is the single codepoint U+FDFA becoming 18 codepoints once normalised. Two codepoints, U+FDFA and U+FDFB, expand into strings that contain spaces; it is the caller's responsibility to manage this should it need to be managed.

Parameters

casefolded	[out] The normalised Unicode codepoint string is appended to this parameter.
codepoint	[in] The codepoint to normalise.

◆ tocasefold() [2/2]

static const uint32_t* JASS::unicode::tocasefold ( uint32_t codepoint )

inline static

Return a pointer to a 0 terminated array of codepoints that is the casefolded normalised codepoint.

This is the JASS character normalisation method. It converts to Unicode "NFKD", strips all non-alpha-numerics, then performs Unicode casefolding "C+F". As Unicode decomposition is involved (and casefolding) the resulting string can be considerably larger than a single codepoint. The worst case is the single codepoint U+FDFA becoming 18 codepoints once normalised. Two codepoints, U+FDFA and U+FDFB, expand into strings that contain spaces; it is the caller's responsibility to manage this should it need to be managed.

Parameters

codepoint [in] The codepoint to normalise.

◆ utf8_bytes() [1/2]

static size_t JASS::unicode::utf8_bytes ( const void * sequence )

inline static

Return the number of bytes that the UTF-8 character starting at sequence occupies.

Computing the length is done by looking only at the first character in the sequence which contains the length bits.

Parameters

sequence [in] A pointer to a UTF-8 character

Returns: The length (in bytes) of the UTF-8 character starting at sequence, or 0 if the UTF-8 length byte is malformed.

◆ utf8_bytes() [2/2]

static size_t JASS::unicode::utf8_bytes ( uint32_t codepoint )

inline static

Return the number of bytes necessary to convert the codepoint into UTF-8.

Parameters

codepoint [in] The codepoint to convert.

Returns: The minimum length (in bytes) of the UTF-8 sequence necessary to store codepoint.

◆ utf8_to_codepoint()

static uint32_t JASS::unicode::utf8_to_codepoint	(	const void *	utf8_start,
		const void *	end_of_buffer,
		size_t &	bytes_consumed
	)

inline static

Convert the UTF-8 sequence into a Unicode codepoint (i.e. decode the UTF-8).

Parameters

utf8_start	[in] The start of the UTF-8 sequence.
end_of_buffer	[in] The end of the buffer containing the UTF-8 sequence (not necessarily the end of the UTF-8 sequence).
bytes_consumed	[out] The number of bytes taken by the UTF-8 sequence (computed as a byproduct of decoding).

Returns: The Unicode codepoint, or unicode::replacement_character on invalid input.

The documentation for this class was generated from the following file:

source/unicode.h

Static Public Member Functions

Static Public Attributes

Detailed Description

Member Function Documentation

◆ codepoint_to_utf8()

◆ isalnum()

◆ isalpha()

◆ iscntrl()

◆ isdigit()

◆ isgraph()

◆ islower()

◆ ismark()

◆ ispunct()

◆ isspace()

◆ issymbol()

◆ isupper()

◆ isuspace()

◆ isutf8()

◆ isxdigit()

◆ isxmlnamechar()

◆ isxmlnamestartchar()

◆ tocasefold() [1/2]

◆ tocasefold() [2/2]

◆ utf8_bytes() [1/2]

◆ utf8_bytes() [2/2]

◆ utf8_to_codepoint()