fsm
tokenizer.h
1 #pragma once
2 
3 #include <cstddef>
4 #include <ctre.hpp>
5 #include <optional>
6 #include <string>
7 #include <string_view>
8 #include <utility>
9 
10 #include <magic_enum/magic_enum.hpp>
11 
12 #include "base/type_traits.h"
13 
14 using namespace std::literals;
15 
16 namespace escad::json {
17 
/// Read cursor over a borrowed character sequence.
///
/// Stores a non-owning `std::string_view` together with the offset of the next
/// unread character. The characters are not copied, so the buffer behind
/// `input` must stay alive for as long as the view (and any `string_view`
/// obtained from it) is in use.
struct view {

  /// Wraps `input` with the cursor positioned at offset 0.
  view(std::string_view input) : input_(input) {}

  /// Returns everything from the cursor to the end of the input.
  /// The cursor is not moved.
  std::string_view substr() const { return input_.substr(pos_); }

  /// Returns at most `len` characters starting at the cursor.
  /// The cursor is not moved.
  std::string_view substr(std::size_t len) const {
    return input_.substr(pos_, len);
  }

  /// Returns at most `len` characters starting at the absolute offset `start`
  /// (independent of the cursor). `std::string_view::substr` throws
  /// `std::out_of_range` when `start` is greater than the input size.
  std::string_view substr(std::size_t start, std::size_t len) const {
    return input_.substr(start, len);
  }

  /// Returns at most `len` characters starting at the cursor and moves the
  /// cursor past them.
  ///
  /// When fewer than `len` characters remain, only the remaining ones are
  /// returned and the cursor stops at the end of the input.
  std::string_view consume(std::size_t len) {
    const std::string_view taken = input_.substr(pos_, len);
    // Advance by what was actually taken, not by the requested length:
    // otherwise an oversized request pushes pos_ past the end and every later
    // substr()/consume() call throws std::out_of_range.
    pos_ += taken.size();
    return taken;
  }

  std::string_view input_; ///< Borrowed input; not owned.
  std::size_t pos_ = 0;    ///< Offset of the next unread character.
};
41 
42 template <class TokenType, ctll::fixed_string Regex> class tokenizer {
43 
44 public:
45  using Token = TokenType;
46 
47  static constexpr auto TokenTypeSize = magic_enum::enum_count<TokenType>();
48 
49  tokenizer(view &input) : view_(input) {}
50 
51  tokenizer(view &&input) : view_(input) {}
52 
53  // Get the next token
54  std::optional<TokenType> next() {
55 
56  bool found = false;
57  TokenType result;
58  if (auto match = ctre::multiline_starts_with<Regex>(view_.substr())) {
59 
60  mpl::for_sequence(std::make_index_sequence<TokenTypeSize>{}, [&](auto i) {
61  if (match.template get<i + 1>()) {
62  result = magic_enum::enum_value<TokenType>(i);
63  found = true;
64  }
65  });
66  }
67  if (found) {
68  return result;
69  }
70  return std::nullopt;
71  }
72 
73  bool isToken(TokenType type) {
74  bool found = false;
75  if (auto match = ctre::multiline_starts_with<Regex>(view_.substr())) {
76  auto type_integer = magic_enum::enum_integer(type);
77  mpl::for_sequence(std::make_index_sequence<TokenTypeSize>{}, [&](auto i) {
78  if (i == type_integer) {
79  if (match.template get<i + 1>()) {
80  found = true;
81  }
82  }
83  });
84  }
85  return found;
86  }
87 
88  [[deprecated("Don't use this function anymore")]] std::string_view consume() {
89  if (auto match = ctre::multiline_starts_with<Regex>(view_.substr())) {
90  view_.consume(match.size());
91  return view_.consume(match.size());
92  }
93  return {};
94  }
95 
96  std::optional<std::string_view> consume(TokenType type) {
97 
98  bool found = false;
99  std::string_view str = ""sv;
100 
101  if (auto match = ctre::multiline_starts_with<Regex>(view_.substr())) {
102 
103  auto type_integer = magic_enum::enum_integer(type);
104 
105  mpl::for_sequence(std::make_index_sequence<TokenTypeSize>{}, [&](auto i) {
106  if (i == type_integer) {
107  if (auto capture = match.template get<i + 1>()) {
108  str = view_.consume(capture.size());
109  found = true;
110  }
111  }
112  });
113  }
114  if (found) {
115  return str;
116  }
117  return std::nullopt;
118  }
119 
120  const view &getView() const { return view_; }
121 
122  // protected:
123  view &view_;
124 };
125 
126 // Regex for JSON tokens
127 constexpr auto jsonTokenRegex = ctll::fixed_string{
128  "(\\s+)|(\\u007b)|(\\u007d)|(\\u005b)|(\\u005d)|(:)|(,)|(\")|"
129  "(true)|(false)|(null)"};
130 
131 enum class jsonTokenType {
132  WS,
133  OPEN_BRACE,
134  CLOSE_BRACE,
135  OPEN_BRACKET,
136  CLOSE_BRACKET,
137  COLON,
138  COMMA,
139  DOUBLE_QUOTE,
140  TRUE,
141  FALSE,
142  NULL_
143 };
144 
146 
147 // Regex for String tokens
148 constexpr auto stringTokenRegex =
149  ctll::fixed_string{"(\")"
150  "|(\\u005Cu[0-9a-fA-F]{4})"
151  "|([^\"\\u005C\\u0000-\\u001f\\u007F]+)"
152  "|(\\u005C[bfnrt/\\\"])"};
153 
154 enum class stringTokenType { DOUBLE_QUOTE, HEX, CHARS, ESCAPE };
155 
157 
158 constexpr auto numberTokenRegex = ctll::fixed_string{"([+\\-])"
159  "|([0-9])"
160  "|(\\.)"
161  "|([eE])"};
162 
163 enum class numberTokenType { SIGN, DIGIT, DOT, EXP };
164 
166 
167 } // namespace escad::json
Definition: tokenizer.h:18
Definition: tokenizer.h:42
Definition: simple_fsm.cpp:15