crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
RegEx.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * RegEx.hpp
24  *
25  * Using the PCRE2 library to implement a Perl-Compatible Regular Expressions
26  * query with boolean, single and/or multiple results. An expression is only
27  * created when needed.
28  *
29  * Created on: Oct 17, 2018
30  * Author: ans
31  */
32 
33 #ifndef QUERY_REGEX_HPP_
34 #define QUERY_REGEX_HPP_
35 
36 #include "../Main/Exception.hpp"
37 #include "../Wrapper/PCRE.hpp"
38 #include "../Wrapper/PCREMatch.hpp"
39 
40 #include <pcre2.h>
41 
42 #include <array> // std::array
43 #include <cstdint> // std::int32_t, std::uint32_t
44 #include <string> // std::string, std::to_string
45 #include <vector> // std::vector
46 
47 namespace crawlservpp::Query {
48 
49  /*
50  * CONSTANTS
51  */
52 
55 
//! The length of the error buffer handed to the PCRE2 library, in bytes.
inline constexpr int pcre2ErrorBufferLength = 1024;

//! Bit mask selecting the top bit of a byte (value of a UTF-8 continuation byte after masking with bitmaskTopTwoBits).
inline constexpr int bitmaskTopBit = 0x80;

//! Bit mask selecting the top two bits of a byte of a multibyte character.
inline constexpr int bitmaskTopTwoBits = 0xc0;
64 
66 
67  /*
68  * DECLARATION
69  */
70 
72 
76  class RegEx {
77  public:
80 
81  RegEx(const std::string& expression, bool single, bool multi);
82 
86 
87  [[nodiscard]] bool getBool(const std::string& text) const;
88  void getFirst(const std::string& text, std::string& resultTo) const;
89  void getAll(const std::string& text, std::vector<std::string>& resultTo) const;
90  [[nodiscard]] bool valid() const noexcept;
91 
93 
95 
109 
110  private:
111  Wrapper::PCRE expressionSingle;
112  Wrapper::PCRE expressionMulti;
113  };
114 
115  /*
116  * IMPLEMENTATION
117  */
118 
120 
140  inline RegEx::RegEx(const std::string& expression, bool single, bool multi) {
141  std::string queryString(expression);
142  std::int32_t errorNumber{};
143  PCRE2_SIZE errorOffset{};
144 
145  // remove newlines at the end of the expression
146  while(queryString.back() == '\n') {
147  queryString.pop_back();
148  }
149 
150  // check arguments
151  if(queryString.empty()) {
152  throw RegEx::Exception("Expression is empty");
153  }
154 
155  if(!single && !multi) {
156  throw RegEx::Exception("No result type for expression specified");
157  }
158 
159  // compile expression(s)
160  if(single) {
161  this->expressionSingle.set(
162  pcre2_compile(
163  static_cast<PCRE2_SPTR>(
164  static_cast<const void *>(
165  queryString.c_str()
166  )
167  ),
168  PCRE2_ZERO_TERMINATED,
169  PCRE2_UTF | PCRE2_UCP,
170  &errorNumber,
171  &errorOffset,
172  nullptr
173  )
174  );
175 
176  if(!(this->expressionSingle.valid())) {
177  // RegEx error
178  std::array<char, pcre2ErrorBufferLength> errorBuffer{};
179 
180  pcre2_get_error_message(
181  errorNumber,
182  static_cast<PCRE2_UCHAR8 *>(
183  static_cast<void *>(
184  errorBuffer.data()
185  )
186  ),
188  );
189 
190  std::string exceptionString{"Compilation error at "};
191 
192  exceptionString += std::to_string(errorOffset);
193  exceptionString += ": ";
194  exceptionString += errorBuffer.data();
195 
196  throw RegEx::Exception(exceptionString);
197  }
198  }
199 
200  if(multi) {
201  this->expressionMulti.set(
202  pcre2_compile(
203  static_cast<PCRE2_SPTR>(
204  static_cast<const void *>(
205  expression.c_str()
206  )
207  ),
208  PCRE2_ZERO_TERMINATED,
209  PCRE2_UTF | PCRE2_UCP | PCRE2_MULTILINE,
210  &errorNumber,
211  &errorOffset,
212  nullptr
213  )
214  );
215 
216  if(!(this->expressionMulti.valid())) {
217  // RegEx error
218  std::array<char, pcre2ErrorBufferLength> errorBuffer{};
219 
220  pcre2_get_error_message(
221  errorNumber,
222  static_cast<PCRE2_UCHAR8 *>(
223  static_cast<void *>(
224  errorBuffer.data()
225  )
226  ),
228  );
229 
230  std::string exceptionString{"Compilation error at "};
231 
232  exceptionString += std::to_string(errorOffset);
233  exceptionString += ": ";
234  exceptionString += errorBuffer.data();
235 
236  throw RegEx::Exception(exceptionString);
237  }
238  }
239  }
240 
242 
261  inline bool RegEx::getBool(const std::string& text) const {
262  // check compiled expression
263  if(!(this->expressionSingle.valid())) {
264  throw RegEx::Exception("No single result expression compiled");
265  }
266 
267  // get first match
268  Wrapper::PCREMatch pcreMatch(
269  pcre2_match_data_create_from_pattern(
270  this->expressionSingle.getc(),
271  nullptr
272  )
273  );
274 
275  auto result{
276  pcre2_match(
277  this->expressionSingle.getc(),
278  static_cast<PCRE2_SPTR>(
279  static_cast<const void *>(
280  text.c_str()
281  )
282  ),
283  text.length(),
284  0,
285  0,
286  pcreMatch.get(),
287  nullptr
288  )
289  };
290 
291  // check result
292  if(result <= 0) {
293  switch(result) {
294  case PCRE2_ERROR_NOMATCH:
295  // no match found -> result is false
296  return false;
297 
298  case 0:
299  // output vector was too small (should not happen when using pcre2_match_data_create_from_pattern(...))
300  throw RegEx::Exception("Result vector unexpectedly too small");
301 
302  default:
303  // match error: set error message and delete match
304  std::array<char, pcre2ErrorBufferLength> errorBuffer{};
305 
306  pcre2_get_error_message(
307  result,
308  static_cast<PCRE2_UCHAR8 *>(
309  static_cast<void *>(
310  errorBuffer.data()
311  )
312  ),
314  );
315 
316  throw RegEx::Exception(errorBuffer.data());
317  }
318  }
319 
320  // at least one match found -> result is true
321  return true;
322  }
323 
325 
345  inline void RegEx::getFirst(const std::string& text, std::string& resultTo) const {
346  // empty target
347  resultTo.clear();
348 
349  // check compiled expression
350  if(!(this->expressionSingle.valid())) {
351  throw RegEx::Exception("No single result expression compiled");
352  }
353 
354  // get first match
355  Wrapper::PCREMatch pcreMatch(
356  pcre2_match_data_create_from_pattern(
357  this->expressionSingle.getc(),
358  nullptr
359  )
360  );
361 
362  int result{
363  pcre2_match(
364  this->expressionSingle.getc(),
365  static_cast<PCRE2_SPTR>(
366  static_cast<const void *>(
367  text.c_str()
368  )
369  ),
370  text.length(),
371  0,
372  0,
373  pcreMatch.get(),
374  nullptr
375  )
376  };
377 
378  // check result
379  if(result <= 0) {
380  switch(result) {
381  case PCRE2_ERROR_NOMATCH:
382  // no match found -> result is empty string
383  return;
384 
385  case 0:
386  // output vector was too small (should not happen when using pcre2_match_data_create_from_pattern(...))
387  throw RegEx::Exception("Result vector unexpectedly too small");
388 
389  default:
390  // matching error
391  std::array<char, pcre2ErrorBufferLength> errorBuffer{};
392 
393  pcre2_get_error_message(
394  result,
395  static_cast<PCRE2_UCHAR8 *>(
396  static_cast<void *>(
397  errorBuffer.data()
398  )
399  ),
401  );
402 
403  throw RegEx::Exception(errorBuffer.data());
404  }
405  }
406 
407  // at least one match found -> get resulting match
408  auto * pcreOVector{pcre2_get_ovector_pointer(pcreMatch.get())};
409 
410  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
411  resultTo = text.substr(pcreOVector[0], pcreOVector[1] - pcreOVector[0]);
412  }
413 
415 
435  inline void RegEx::getAll(const std::string& text, std::vector<std::string>& resultTo) const {
436  // empty target
437  resultTo.clear();
438 
439  // check compiled expression
440  if(!(this->expressionMulti.valid())) {
441  throw RegEx::Exception("No multi result expression compiled");
442  }
443 
444  // get first match
445  Wrapper::PCREMatch pcreMatch(
446  pcre2_match_data_create_from_pattern(
447  this->expressionMulti.getc(),
448  nullptr
449  )
450  );
451 
452  auto result{
453  pcre2_match(
454  this->expressionMulti.getc(),
455  static_cast<PCRE2_SPTR>(
456  static_cast<const void *>(
457  text.c_str()
458  )
459  ),
460  text.length(),
461  0,
462  0,
463  pcreMatch.get(),
464  nullptr
465  )
466  };
467 
468  // check result
469  if(result <= 0) {
470  switch(result) {
471  case PCRE2_ERROR_NOMATCH:
472  // no match found -> result is empty array
473  return;
474 
475  case 0:
476  // output vector was too small (should not happen when using pcre2_match_data_create_from_pattern(...))
477  throw RegEx::Exception("Result vector unexpectedly too small");
478 
479  default:
480  // matching error
481  std::array<char, pcre2ErrorBufferLength> errorBuffer{};
482 
483  pcre2_get_error_message(
484  result,
485  static_cast<PCRE2_UCHAR8 *>(
486  static_cast<void *>(
487  errorBuffer.data()
488  )
489  ),
491  );
492 
493  throw RegEx::Exception(errorBuffer.data());
494  }
495  }
496 
497  // at least one match found -> save first match
498  auto * pcreOVector{pcre2_get_ovector_pointer(pcreMatch.get())};
499 
500  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
501  resultTo.emplace_back(text, pcreOVector[0], pcreOVector[1] - pcreOVector[0]);
502 
503  // get RegEx options
504  std::uint32_t pcreOptions{};
505  std::uint32_t pcreNewLineOption{};
506 
507  pcre2_pattern_info(this->expressionMulti.getc(), PCRE2_INFO_ALLOPTIONS, &pcreOptions);
508  pcre2_pattern_info(this->expressionMulti.getc(), PCRE2_INFO_NEWLINE, &pcreNewLineOption);
509 
510  const auto pcreUTF8{(pcreOptions & PCRE2_UTF) != 0};
511  const auto pcreNewLine{
512  pcreNewLineOption == PCRE2_NEWLINE_ANY
513  || pcreNewLineOption == PCRE2_NEWLINE_CRLF
514  || pcreNewLineOption == PCRE2_NEWLINE_ANYCRLF
515  };
516 
517  // get more matches
518  while(true) {
519  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
520  auto pcreOffset{pcreOVector[1]};
521  pcreOptions = 0;
522 
523  // check for empty string (end of matches)
524  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
525  if(pcreOVector[0] == pcreOVector[1]) {
526  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
527  if(pcreOVector[0] == text.length()) {
528  break;
529  }
530 
531  pcreOptions = PCRE2_NOTEMPTY_ATSTART | PCRE2_ANCHORED;
532  }
533 
534  // get next match
535  result = pcre2_match(
536  this->expressionMulti.getc(),
537  static_cast<PCRE2_SPTR>(
538  static_cast<const void *>(
539  text.c_str()
540  )
541  ),
542  text.length(),
543  pcreOffset,
544  pcreOptions,
545  pcreMatch.get(),
546  nullptr
547  );
548 
549  // check result
550  if(result == PCRE2_ERROR_NOMATCH) {
551  if(pcreOptions == 0) {
552  break;
553  }
554 
555  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
556  pcreOVector[1] = pcreOffset + 1;
557 
558  if(
559  pcreNewLine
560  && pcreOffset < text.length() - 1
561  && text.at(pcreOffset) == '\r'
562  && text.at(pcreOffset + 1) == '\n'
563  ) {
564  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
565  pcreOVector[1] += 1;
566  }
567  else if(pcreUTF8) {
568  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
569  while(pcreOVector[1] < text.length()) {
570  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic, hicpp-signed-bitwise)
571  if((text.at(pcreOVector[1]) & bitmaskTopTwoBits) != bitmaskTopBit) {
572  break;
573  }
574 
575  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
576  pcreOVector[1] += 1;
577  }
578  }
579 
580  continue;
581  }
582 
583  if(result < 0) {
584  // matching error
585  std::array<char, pcre2ErrorBufferLength> errorBuffer{};
586 
587  pcre2_get_error_message(
588  result,
589  static_cast<PCRE2_UCHAR8 *>(
590  static_cast<void *>(
591  errorBuffer.data()
592  )
593  ),
595  );
596 
597  throw RegEx::Exception(errorBuffer.data());
598  }
599 
600  if(result == 0) {
601  throw RegEx::Exception("Result vector unexpectedly too small");
602  }
603 
604  // get resulting match
605  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
606  resultTo.emplace_back(text, pcreOVector[0], pcreOVector[1] - pcreOVector[0]);
607  }
608  }
609 
611 
615  inline bool RegEx::valid() const noexcept {
616  return this->expressionSingle.valid() || this->expressionMulti.valid();
617  }
618 
619 } /* namespace crawlservpp::Query */
620 
621 #endif /* QUERY_REGEX_HPP_ */
constexpr auto bitmaskTopBit
Bit mask to extract the first bit of a multibyte character.
Definition: RegEx.hpp:60
constexpr auto bitmaskTopTwoBits
Bit mask to extract the top two bits of a multibyte character.
Definition: RegEx.hpp:63
void getFirst(const std::string &text, std::string &resultTo) const
Gets the first match from performing the query on the given text.
Definition: RegEx.hpp:345
const pcre2_code * getc() const noexcept
Gets a const pointer to the underlying regular expression.
Definition: PCRE.hpp:160
RAII wrapper for Perl-compatible regular expressions.
Definition: PCRE.hpp:62
Class for RegEx exceptions.
Definition: RegEx.hpp:108
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
bool valid() const noexcept
Checks whether the underlying regular expression is valid.
Definition: PCRE.hpp:173
void getAll(const std::string &text, std::vector< std::string > &resultTo) const
Gets all matches from performing the query on the given text.
Definition: RegEx.hpp:435
constexpr auto pcre2ErrorBufferLength
The length of the error buffer used by the PCRE2 library, in bytes.
Definition: RegEx.hpp:57
Implements a RegEx query using the PCRE2 library.
Definition: RegEx.hpp:76
void set(pcre2_code *regExPtr)
Sets a Perl-compatible regular expression.
Definition: PCRE.hpp:195
bool valid() const noexcept
Gets whether the query is valid.
Definition: RegEx.hpp:615
RAII wrapper for Perl-compatible regular expression matches.
Definition: PCREMatch.hpp:61
RegEx(const std::string &expression, bool single, bool multi)
Constructor setting a RegEx string and whether the query will return single and/or multiple results...
Definition: RegEx.hpp:140
Namespace for classes handling queries.
Definition: XML.hpp:51
bool getBool(const std::string &text) const
Gets a boolean result from performing the query on the given text.
Definition: RegEx.hpp:261
pcre2_match_data * get() noexcept
Gets a pointer to the underlying regular expression match.
Definition: PCREMatch.hpp:140