JASSv2
parser_unicoil_json.h
Go to the documentation of this file.
1 /*
2  PARSER_UNICOIL_JSON.H
3  ---------------------
4  Copyright (c) 2021 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
23 #pragma once
24 
25 #include <charconv>
26 
27 #include "ascii.h"
28 #include "parser.h"
29 #include "unicode.h"
30 
31 namespace JASS
32  {
33  /*
34  CLASS PARSER_UNICOIL_JSON
35  -------------------------
36  */
40  class parser_unicoil_json : public parser
41  {
42  public:
43  /*
44  PARSER_UNICOIL_JSON::PARSER_UNICOIL_JSON()
45  ------------------------------------------
46  */
51  parser()
52  {
53  /* Nothing */
54  }
55 
56  /*
57  PARSER_UNICOIL_JSON::~PARSER_UNICOIL_JSON()
58  -------------------------------------------
59  */
64  {
65  /*
66  Nothing
67  */
68  }
69 
70  /*
71  PARSER_UNICOIL_JSON::GET_NEXT_TOKEN()
72  -------------------------------------
 Scan forward from current for the next double-quoted term and the run of ASCII digits that follows it.
 The term bytes are copied into current_token.buffer and exposed through current_token.lexeme; the digits
 are parsed into current_token.count.  Returns eof_token once current has reached end_of_document.
 NOTE(review): the return statement for the non-EOF path is not visible in this listing (it jumps from 156 to 158).
73  */
78  virtual const class parser::token &get_next_token(void)
79  {
80  /*
81  Check for EOF: nothing left to scan once current has reached end_of_document
82  */
83  if (current >= end_of_document)
84  return eof_token;
85 
86  /*
87  Set up the parser: buffer_pos is the write cursor into the token buffer, buffer_end is one past its last byte
88  */
89  uint8_t *buffer_pos = current_token.buffer;
90  uint8_t *buffer_end = current_token.buffer + sizeof(current_token.buffer);
91 
92  /*
93  Find the term: skip bytes until the opening double-quote (or the end of the document)
94  */
95  while (current < end_of_document)
96  {
97  if (*current == '"')
98  break;
99  current++;
100  }
101 
102  current++; // step over the opening quote; if no quote was found this leaves current one past end_of_document and the loops below do not run
103 
104  /*
105  Copy the term to the token buffer, stopping at the first double-quote that is not preceded by a backslash.
 Bytes that do not fit in the buffer are skipped in the input but not stored (the term is silently truncated).
106  */
107  while (current < end_of_document)
108  {
109  if (*current == '\\')
110  {
111  if (buffer_pos < buffer_end)
112  *buffer_pos++ = *current; // the backslash itself is kept in the token
113  current++; // escaped characters in JSON: move to the byte after the backslash, which is copied below without being tested for '"'
 // NOTE(review): that following byte is read below without re-checking current < end_of_document - confirm input can never end in a backslash
114  }
115  else if (*current == '"')
116  break;
117  if (buffer_pos < buffer_end)
118  *buffer_pos++ = *current;
119  current++;
120  }
121 
122  /*
123  Find the term count: skip forward to the first ASCII digit (this also passes over the closing quote)
124  */
125  while (current < end_of_document)
126  {
127  if (ascii::isdigit(*current))
128  break;
129  current++;
130  }
131 
132  const uint8_t *count_start = current; // first digit of the count (or the stop position if no digit was found)
133 
134  while (current < end_of_document)
135  {
136  if (!ascii::isdigit(*current))
137  break;
138  current++;
139  }
140 
141  const uint8_t *count_finish = current; // one past the last digit of the count
142 
143  /*
144  Construct a token object: the lexeme is a slice over the bytes written to the token buffer
 NOTE(review): listing lines 147, 150, 154 and 157 are absent from this rendering, so any statements on them are not documented here.
145  */
146  current_token.lexeme = slice((void *)current_token.buffer, (void *)buffer_pos);
148  std::from_chars((char *)count_start, (char *)count_finish, current_token.count); // result is ignored; NOTE(review): with an empty digit range from_chars should leave count unchanged - confirm that is intended
149 
151  { // NOTE(review): whatever statement guards this block (listing line 150) is not visible here
152  std::cout << current_token.lexeme;
153  std::cout << " " << current_token.count << "\n";
155  }
156 
158  }
159 
160  /*
161  PARSER_UNICOIL_JSON::UNITTEST()
162  -------------------------------
163  */
167  static void unittest(void);
168  };
169  }
Methods that work on Unicode codepoints.
parser_unicoil_json()
Constructor.
Definition: parser_unicoil_json.h:50
token current_token
The token that is currently being built. A reference to this is returned when the token is complete...
Definition: parser.h:127
C++ slices (string-descriptors)
Definition: slice.h:27
Simple, but fast, XML parser.
Definition: parser.h:39
const uint8_t * end_of_document
Pointer to the end of the document, used to avoid reading past the end of the buffer.
Definition: parser.h:126
slice lexeme
The token itself, stored as a slice (pointer / length pair)
Definition: parser.h:85
Simple XML parser that doesn't do either attributes or entities.
uint8_t buffer[max_token_length]
The token manages its memory through this buffer.
Definition: parser.h:84
virtual ~parser_unicoil_json()
Destructor.
Definition: parser_unicoil_json.h:63
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token. Note that the definition of token here is no t...
Definition: parser_unicoil_json.h:78
static void unittest(void)
Unit test this class.
Definition: parser_unicoil_json.cpp:16
static constexpr size_t largest_impact
The largest allowable impact score (255 is a good value).
Definition: index_postings_impact.h:42
token eof_token
Sentinel returned when reading past end of document.
Definition: parser.h:123
token_type type
The type of this token (See token_type)
Definition: parser.h:86
const uint8_t * current
The current location within the document.
Definition: parser.h:125
Parser for documents from the UniCOIL data in JSON format.
Definition: parser_unicoil_json.h:40
A token as returned by the parser.
Definition: parser.h:58
Definition: compress_integer_elias_delta_simd.c:23
static int isdigit(uint8_t c)
Is this character a digit?
Definition: ascii.h:134
alphabetic token
Definition: parser.h:67
index_postings_impact::impact_type count
The number of times the token is seen (normally 1, but if parsing a forward index it might be known t...
Definition: parser.h:87
fast locale-ignoring version of the C runtime library ctype methods for plain 7-bit ASCII...