168 end_of_document = NULL;
170 current_token.
count = 1;
198 the_document = &document;
213 current =
reinterpret_cast<const uint8_t *
>(document.c_str());
214 end_of_document = current + document.size();
parser()
Constructor.
Definition: parser.h:164
void build_unicode_numeric_token(uint32_t codepoint, size_t bytes, uint8_t *&buffer_pos, uint8_t *buffer_end)
Helper function used to build numeric token from UTF-8.
Definition: parser.cpp:51
XML empty tag (without the "<" or "/>").
Definition: parser.h:70
token current_token
The token that is currently being built. A reference to this is returned when the token is complete...
Definition: parser.h:127
C++ slices (string-descriptors)
Definition: slice.h:27
Simple, but fast, XML parser.
Definition: parser.h:39
void * address(void) const
Extract the pointer value from the slice.
Definition: slice.h:269
const uint8_t * end_of_document
Pointer to the end of the document, used to avoid reading past the end of the buffer.
Definition: parser.h:126
Container class representing a document through the indexing pipeline.
Definition: document.h:31
document build_document
A document used when a string is passed into this object.
Definition: parser.h:120
numeric token
Definition: parser.h:68
token_type
The type of the token.
Definition: parser.h:65
Holder class for an impact ordered postings list.
XML conditional. This is not properly interpreted. It's just the "INCLUDE" or "IGNORE" that is returne...
Definition: parser.h:75
slice lexeme
The token itself, stored as a slice (pointer / length pair)
Definition: parser.h:85
static void unittest(void)
Unit test this class.
Definition: parser.cpp:454
void build_unicode_alphabetic_token(uint32_t codepoint, size_t bytes, uint8_t *&buffer_pos, uint8_t *buffer_end)
Helper function used to build alphabetic token from UTF-8.
Definition: parser.cpp:25
uint8_t buffer[max_token_length]
The token manages its memory through this buffer.
Definition: parser.h:84
XML start tag (just the tag name)
Definition: parser.h:69
XML definition (DOCTYPE, ELEMENT, etc) without the "<!" and ">".
Definition: parser.h:74
static constexpr size_t max_token_length
Any token longer than this will be truncated at this length.
Definition: parser.h:81
virtual void set_document(const class document &document)
Start parsing from the start of this document.
Definition: parser.h:196
slice contents
The contents of the document (or likewise).
Definition: document.h:43
Other type of token (punctuation, etc).
Definition: parser.h:77
virtual ~parser()
Destructor.
Definition: parser.h:180
A document within the indexing pipeline.
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token.
Definition: parser.cpp:79
token eof_token
Sentinel returned when reading past end of document.
Definition: parser.h:123
token_type type
The type of this token (See token_type)
Definition: parser.h:86
The final token is marked as an EOF token (and has no content).
Definition: parser.h:78
const uint8_t * current
The current location within the document.
Definition: parser.h:125
virtual void set_document(const std::string &document)
Parse a string (rather than a document).
Definition: parser.h:211
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
const document * the_document
The document that is currently being parsed.
Definition: parser.h:124
A token as returned by the parser.
Definition: parser.h:58
Slices (also known as string-descriptors) for C++.
XML comment (with the "<!--" and "-->" removed).
Definition: parser.h:72
XML end tag (without the "<" or ">").
Definition: parser.h:71
Definition: compress_integer_elias_delta_simd.c:23
XML processing instruction (a "<?" sequence) with the "<?" and "?>" removed.
Definition: parser.h:76
alphabetic token
Definition: parser.h:67
size_t size(void) const
Return the length of this slice.
Definition: slice.h:256
index_postings_impact::impact_type count
The number of times the token is seen (normally 1, but if parsing a forward index it might be known t...
Definition: parser.h:87
static size_t unittest_count(const char *string)
Count the number of tokens in the given string.
Definition: parser.cpp:429
XML CDATA (just the CDATA)
Definition: parser.h:73