JASSv2
parser.h
Go to the documentation of this file.
1 /*
2  PARSER.H
3  --------
4  Copyright (c) 2016 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
14 #pragma once
15 
16 #include <stdint.h>
17 
18 #include "slice.h"
19 #include "document.h"
20 #include "index_postings_impact.h"
21 
22 namespace JASS
23  {
24  /*
25  CLASS PARSER
26  ------------
27  */
39  class parser
40  {
41  public:
42  /*
43  CLASS PARSER::TOKEN
44  -------------------
45  */
58  class token
59  {
60  public:
66  {
78  eof
79  };
80  public:
81  static constexpr size_t max_token_length = 1024;
82 
83  public:
88 
89  public:
90  /*
91  PARSER::TOKEN::GET()
92  --------------------
93  */
/*!
 @brief Return the text of this token.
 @return A const reference to this token's lexeme, the slice (pointer / length pair) that holds the token itself.
*/
98  const slice &get(void) const
99  {
100  return lexeme;
101  }
102 
103  /*
104  PARSER::TOKEN::SET()
105  --------------------
106  */
/*!
 @brief Make this token represent the given term.
 @details The lexeme becomes term, the type becomes other, and the count becomes 1.
 @param term [in] The slice to use as this token's lexeme.
*/
110  void set(slice term)
111  {
112  *buffer = '\0'; // store a NUL in the first byte of the token's own buffer
113  lexeme = term; // NOTE(review): presumably only the slice descriptor is copied, so the lexeme refers to term's memory rather than to buffer - confirm against slice.h
114  type = other; // "other" is the token_type used for punctuation and similar
115  count = 1; // a token set this way is treated as having been seen once
116  }
117  };
118 
119  private:
121 
122  protected:
125  const uint8_t *current;
126  const uint8_t *end_of_document;
128 
129  protected:
130  /*
131  PARSER::BUILD_UNICODE_ALPHABETIC_TOKEN()
132  ----------------------------------------
133  */
/*!
 @brief Helper function used to build an alphabetic token from UTF-8 (defined in parser.cpp).
 @details buffer_pos is taken by reference to a pointer, so the callee can move the caller's write position.
 NOTE(review): the exact meaning of codepoint, bytes and buffer_end is not visible in this header - confirm against parser.cpp.
*/
141  void build_unicode_alphabetic_token(uint32_t codepoint, size_t bytes, uint8_t *&buffer_pos, uint8_t *buffer_end);
142 
143  /*
144  PARSER::BUILD_UNICODE_NUMERIC_TOKEN()
145  -------------------------------------
146  */
/*!
 @brief Helper function used to build a numeric token from UTF-8 (defined in parser.cpp).
 @details buffer_pos is taken by reference to a pointer, so the callee can move the caller's write position.
 NOTE(review): the exact meaning of codepoint, bytes and buffer_end is not visible in this header - confirm against parser.cpp.
*/
154  void build_unicode_numeric_token(uint32_t codepoint, size_t bytes, uint8_t *&buffer_pos, uint8_t *buffer_end);
155 
156  public:
157  /*
158  PARSER::PARSER()
159  ----------------
160  */
165  {
166  current = NULL; // no document has been set yet, so there is nothing to read from
167  the_document = NULL;
168  end_of_document = NULL;
169  eof_token.type = token::eof; // eof_token is the sentinel returned when reading past the end of the document
170  current_token.count = 1; // We're generating a stream of individual tokens, so all counts are 1.
171  }
172 
173  /*
174  PARSER::~PARSER()
175  -----------------
176  */
/*!
 @brief Destructor.
 @details Declared virtual; the class also declares virtual methods (set_document, get_next_token), so it can serve as a polymorphic base. The body is empty.
*/
180  virtual ~parser()
181  {
182  /*
183  Nothing
184  */
185  }
186 
187  /*
188  PARSER::SET_DOCUMENT()
189  ----------------------
190  */
/*!
 @brief Start parsing from the start of this document.
 @details Records the address of document in the_document, sets current to the first byte of document.contents and sets end_of_document to one byte past its last byte.
 @param document [in] The document to parse. Only pointers to it and into its contents are stored here, so it presumably has to stay valid while tokens are read - confirm against get_next_token().
*/
196  virtual void set_document(const class document &document)
197  {
198  the_document = &document;
199  current = (uint8_t *)document.contents.address(); // address() returns void *, hence the cast
200  end_of_document = (uint8_t *)document.contents.address() + document.contents.size(); // start address plus length in bytes
201  }
202 
203  /*
204  PARSER::SET_DOCUMENT()
205  ----------------------
206  */
/*!
 @brief Parse a string (rather than a document).
 @details Sets current to the first character of the string and end_of_document to one past its last character (the terminating NUL is not included).
 @param document [in] The text to parse. current and end_of_document point into this string's storage, so it presumably has to stay alive and unmodified while tokens are read - confirm against get_next_token().
 NOTE(review): the_document is not updated here, so it keeps whatever value an earlier call left in it, and build_document is not used by this overload - confirm this is intended.
*/
211  virtual void set_document(const std::string &document)
212  {
213  current = reinterpret_cast<const uint8_t *>(document.c_str());
214  end_of_document = current + document.size();
215  }
216 
217  /*
218  PARSER::GET_NEXT_TOKEN()
219  ------------------------
220  */
/*!
 @brief Continue parsing the input looking for the next token (defined in parser.cpp).
 @return A const reference to a token. NOTE(review): presumably this refers to current_token, or to eof_token once the document is exhausted - confirm against parser.cpp.
*/
225  virtual const class parser::token &get_next_token(void);
226 
227  /*
228  PARSER::UNITTEST_COUNT()
229  ------------------------
230  */
/*!
 @brief Count the number of tokens in the given string (defined in parser.cpp).
 @param string [in] The text to tokenise. NOTE(review): presumably a NUL-terminated C string - confirm against parser.cpp.
 @return The number of tokens found.
*/
236  static size_t unittest_count(const char *string);
237 
238  /*
239  PARSER::UNITTEST()
240  ------------------
241  */
/*!
 @brief Unit test this class (defined in parser.cpp).
*/
245  static void unittest(void);
246  };
247  }
parser()
Constructor.
Definition: parser.h:164
void build_unicode_numeric_token(uint32_t codepoint, size_t bytes, uint8_t *&buffer_pos, uint8_t *buffer_end)
Helper function used to build numeric token from UTF-8.
Definition: parser.cpp:51
XML empty tag (without the "<" or "/>").
Definition: parser.h:70
token current_token
The token that is currently being built. A reference to this is returned when the token is complete...
Definition: parser.h:127
C++ slices (string-descriptors)
Definition: slice.h:27
Simple, but fast, XML parser.
Definition: parser.h:39
void * address(void) const
Extract the pointer value from the slice.
Definition: slice.h:269
const uint8_t * end_of_document
Pointer to the end of the document, used to avoid reading past the end of the buffer.
Definition: parser.h:126
Container class representing a document through the indexing pipeline.
Definition: document.h:31
document build_document
A document used when a string is passed into this object.
Definition: parser.h:120
numeric token
Definition: parser.h:68
token_type
The type of the token.
Definition: parser.h:65
Holder class for an impact ordered postings list.
XML conditional. This is not properly interpreted; it's just the "INCLUDE" or "IGNORE" that is returne...
Definition: parser.h:75
slice lexeme
The token itself, stored as a slice (pointer / length pair)
Definition: parser.h:85
static void unittest(void)
Unit test this class.
Definition: parser.cpp:454
void build_unicode_alphabetic_token(uint32_t codepoint, size_t bytes, uint8_t *&buffer_pos, uint8_t *buffer_end)
Helper function used to build alphabetic token from UTF-8.
Definition: parser.cpp:25
uint8_t buffer[max_token_length]
The token manages its memory through this buffer.
Definition: parser.h:84
XML start tag (just the tag name)
Definition: parser.h:69
XML definition (DOCTYPE, ELEMENT, etc) without the "<!" and ">".
Definition: parser.h:74
static constexpr size_t max_token_length
Any token longer than this will be truncated at this length.
Definition: parser.h:81
virtual void set_document(const class document &document)
Start parsing from the start of this document.
Definition: parser.h:196
slice contents
The contents of the document (or likewise).
Definition: document.h:43
Other type of token (punctuation, etc).
Definition: parser.h:77
virtual ~parser()
Destructor.
Definition: parser.h:180
A document within the indexing pipeline.
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token.
Definition: parser.cpp:79
token eof_token
Sentinel returned when reading past the end of the document.
Definition: parser.h:123
token_type type
The type of this token (See token_type)
Definition: parser.h:86
The final token is marked as an EOF token (and has no content).
Definition: parser.h:78
const uint8_t * current
The current location within the document.
Definition: parser.h:125
virtual void set_document(const std::string &document)
Parse a string (rather than a document).
Definition: parser.h:211
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
const document * the_document
The document that is currently being parsed.
Definition: parser.h:124
A token as returned by the parser.
Definition: parser.h:58
Slices (also known as string-descriptors) for C++.
XML comment (with the "<!--" and "-->" removed).
Definition: parser.h:72
XML end tag (without the "<" or ">").
Definition: parser.h:71
Definition: compress_integer_elias_delta_simd.c:23
XML processing instruction (a "<?" sequence) with the "<?" and "?>" removed.
Definition: parser.h:76
alphabetic token
Definition: parser.h:67
size_t size(void) const
Return the length of this slice.
Definition: slice.h:256
index_postings_impact::impact_type count
The number of times the token is seen (normally 1, but if parsing a forward index it might be known t...
Definition: parser.h:87
static size_t unittest_count(const char *string)
Count the number of tokens in the given string.
Definition: parser.cpp:429
XML CDATA (just the CDATA)
Definition: parser.h:73