crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
XML.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021–2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * XML.hpp
24  *
25  * Parse HTML markup into clean XML.
26  *
27  * Created on: Oct 18, 2018
28  * Author: ans
29  */
30 
31 #ifndef PARSING_XML_HPP_
32 #define PARSING_XML_HPP_
33 
34 #define PARSING_XML_
35 
36 #include "HTML.hpp"
37 
38 #include "../Helper/Memory.hpp"
39 #include "../Main/Exception.hpp"
40 
41 #include <pugixml.hpp>
42 
43 #include <cstddef> // std::size_t
44 #include <cstdint> // std::uint32_t
45 #include <memory> // std::make_unique, std::unique_ptr
46 #include <queue> // std::queue
47 #include <sstream> // std::ostringstream
48 #include <string> // std::string, std::to_string
49 #include <string_view> // std::string_view, , std::string_view_literals
50 
// Forward declaration of the XPath query class, so that it can be
//  named in this header without including its definition.
namespace crawlservpp::Query { class XPath; } /* namespace crawlservpp::Query */
56 
57 namespace crawlservpp::Parsing {
58 
59  /*
60  * CONSTANTS
61  */
62 
// Make the 'sv' literal suffix available inside this namespace.
using std::string_view_literals::operator""sv;

//! The beginning of XML markup.
inline constexpr std::string_view xmlBegin{"<?xml "};

//! Array containing additional XML markup tags to be removed.
inline constexpr std::array<std::string_view, 1> xmlTags{std::string_view{"<?i>"}};

//! The beginning of a CDATA element.
inline constexpr std::string_view cDataBegin{"<![CDATA["};

//! The end of a CDATA element.
inline constexpr std::string_view cDataEnd{"]]>"};

//! The beginning of a conditional comment.
inline constexpr std::string_view conditionalBegin{"<![if "};

//! The end of a conditional comment.
inline constexpr std::string_view conditionalEnd{"<![endif]>"};

//! Characters to be inserted/replaced to make conditional comments valid.
inline constexpr std::string_view conditionalInsert{"--"};

//! Offset at which to insert at the beginning to make conditional comments valid.
inline constexpr int conditionalInsertOffsetBegin{2};

//! Offset at which to insert at the end to make conditional comments valid.
inline constexpr int conditionalInsertOffsetEnd{9};

//! Offset at which to insert into stray end tag left from conditional comment.
inline constexpr int conditionalInsertOffsetStrayEnd{2};

//! Characters to be replaced inside comments.
inline constexpr std::string_view commentCharsToReplace{"--"};

//! Characters used as replacement inside comments.
inline constexpr std::string_view commentCharsReplaceBy{"=="};

//! The beginning of an invalid comment.
inline constexpr std::string_view invalidBegin{"<? "};

//! The end of an invalid comment.
inline constexpr std::string_view invalidEnd{" ?>"};

//! Characters to be inserted at the beginning to make invalid comments valid.
inline constexpr std::string_view invalidInsertBegin{"!--"};

//! Characters to be inserted at the end to make invalid comments valid.
inline constexpr std::string_view invalidInsertEnd{"--"};

//! Offset at which to insert at the beginning to make invalid comments valid.
inline constexpr int invalidInsertOffsetBegin{1};

//! Offset at which to insert at the end to make invalid comments valid.
inline constexpr int invalidInsertOffsetEnd{2};

//! The maximum number of characters to be shown in error messages.
inline constexpr int numDebugCharacters{50};

//! The beginning of an XML processing instruction.
inline constexpr std::string_view xmlInstructionBegin{"<?xml:"};

//! The end of an XML processing instruction.
inline constexpr std::string_view xmlInstructionEnd{">"};
130 
132 
133  /*
134  * DECLARATION
135  */
136 
138 
149  class XML {
150  friend class Query::XPath;
151 
152  public:
155 
157  XML() = default;
158 
159  explicit XML(const pugi::xml_node& node);
160  virtual ~XML();
161 
165 
166  [[nodiscard]] bool valid() const;
167  void getContent(std::string& resultTo) const;
168 
172 
173  void setOptions(bool showWarnings, std::uint32_t numOfErrors) noexcept;
174 
178 
179  void parse(
180  std::string_view content,
181  bool repairCData,
182  bool repairComments,
183  bool removeXmlInstructions,
184  std::queue<std::string>& warningsTo
185  );
186 
190 
191  void clear();
192 
194 
196 
208 
212 
215  XML(const XML&) = delete;
216 
218  XML(XML&&) noexcept = default;
219 
221  XML& operator=(const XML&) = delete;
222 
224  XML& operator=(XML&&) noexcept = default;
225 
227 
228  private:
229  // unique pointer to (pugi)XML document
230  std::unique_ptr<pugi::xml_document> doc;
231 
232  // options
233  bool warnings{false};
234  std::uint32_t errors{};
235 
236  // internal static helper functions
237  static void cDataRepair(std::string& content);
238  static void replaceInvalidConditionalComments(std::string& content);
239  static void replaceInvalidComments(std::string& content);
240  static void removeXmlProcessingInstructions(std::string& content);
241  static void checkResult(pugi::xml_parse_result result, const std::string& content);
242  };
243 
244  /*
245  * IMPLEMENTATION
246  */
247 
248  /*
249  * CONSTRUCTION AND DESTRUCTION
250  */
251 
253 
257  inline XML::XML(const pugi::xml_node& node) {
258  // create XML document
259  this->doc = std::make_unique<pugi::xml_document>();
260 
261  this->doc->append_copy(node);
262  }
263 
265  inline XML::~XML() {
266  this->clear();
267  }
268 
269  /*
270  * GETTERS
271  */
272 
274 
279  inline bool XML::valid() const {
280  return this->doc.operator bool();
281  }
282 
284 
300  inline void XML::getContent(std::string& resultTo) const {
301  if(!(this->doc)) {
302  throw XML::Exception("No content has been parsed.");
303  }
304 
305  std::ostringstream out;
306 
307  Helper::Memory::freeIf(!resultTo.empty(), resultTo);
308 
309  this->doc->print(out);
310 
311  resultTo += out.str();
312  }
313 
314  /*
315  * SETTER
316  */
317 
319 
330  inline void XML::setOptions(bool showWarnings, std::uint32_t numOfErrors) noexcept {
331  this->warnings = showWarnings;
332  this->errors = numOfErrors;
333  }
334 
335  /*
336  * PARSING
337  */
338 
340 
365  inline void XML::parse(
366  std::string_view content,
367  bool repairCData,
368  bool repairComments,
369  bool removeXmlInstructions,
370  std::queue<std::string>& warningsTo
371  ) {
372  // remove whitespaces at the beginning and null characters
373  std::size_t begin{};
374 
375  while(content.length() > begin && std::isspace(content.at(begin)) != 0) {
376  ++begin;
377  }
378 
379  std::string xml;
380 
381  xml.reserve(content.size() - begin);
382 
383  for(std::size_t i{begin}; i < content.length(); ++i) {
384  if(content[i] != '\0') {
385  xml.push_back(content[i]);
386  }
387  }
388 
389  // if necessary, try to tidy HTML and convert it to XML
390  if(
391  xml.size() < xmlBegin.length()
392  || xml.substr(0, xmlBegin.length()) != xmlBegin
393  ) {
394  // remove XML processing instructions
395  if(removeXmlInstructions) {
396  removeXmlProcessingInstructions(xml);
397  }
398 
399  HTML tidy;
400 
401  try {
402  tidy.tidyAndConvert(xml, this->warnings, this->errors, warningsTo);
403  }
404  catch(const HTML::Exception& e) {
405  throw XML::Exception(
406  "tidy-html5 error: "
407  + std::string(e.view())
408  );
409  }
410  }
411 
412  // try to repair CDATA
413  if(repairCData) {
414  cDataRepair(xml);
415  }
416 
417  // replace invalid comments
418  if(repairComments) {
419  replaceInvalidConditionalComments(xml);
420  replaceInvalidComments(xml);
421  }
422 
423  // create XML document
424  this->doc = std::make_unique<pugi::xml_document>();
425 
426  // parse XHTML with pugixml
427  XML::checkResult(this->doc->load_buffer(xml.c_str(), xml.size(), pugi::parse_full), xml);
428  }
429 
430  /*
431  * CLEANUP
432  */
433 
435 
439  inline void XML::clear() {
440  if(this->doc) {
441  this->doc.reset();
442  }
443  }
444 
445  /*
446  * INTERNAL STATIC HELPER FUNCTIONS (private)
447  */
448 
449  // internal static helper function: try to fix CDATA error (invalid ']]>' inside CDATA tag)
450  inline void XML::cDataRepair(std::string& content) {
451  auto pos{content.find(cDataBegin)};
452 
453  if(pos == std::string::npos) {
454  return;
455  }
456 
457  pos += cDataBegin.length();
458 
459  while(pos < content.size()) {
460  const auto next{content.find(cDataBegin, pos)};
461 
462  if(next == std::string::npos) {
463  break;
464  }
465 
466  auto last{content.rfind(cDataEnd, next - cDataEnd.length())};
467 
468  if(last != std::string::npos && last > pos) {
469  while(true) {
470  pos = content.find(cDataEnd, pos);
471 
472  if(pos < last) {
473  content.insert(pos + cDataEnd.length() - 1, 1, ' ');
474 
475  pos += cDataEnd.length() + 1;
476  }
477  else {
478  break;
479  }
480  }
481  }
482 
483  pos = next + cDataBegin.length();
484  }
485  }
486 
487  // internal static helper function: replace invalid conditional comments (e.g. created by MS Excel)
488  inline void XML::replaceInvalidConditionalComments(std::string& content) {
489  std::size_t pos{};
490 
491  while(pos < content.length()) {
492  // find next invalid conditional comment
493  pos = content.find(conditionalBegin, pos);
494 
495  if(pos == std::string::npos) {
496  break;
497  }
498 
499  // find end of invalid conditional comment
500  const auto end{
501  content.find(
503  pos + conditionalBegin.length()
504  )
505  };
506 
507  if(end == std::string::npos) {
508  break;
509  }
510 
511  // insert commenting to make conditional comment valid (X)HTML
512  content.insert(pos + conditionalInsertOffsetBegin, conditionalInsert);
513  content.insert(
514  // (consider that "--" has already been added)
517  );
518 
519  // replace "--" inside new comment with "=="
520  auto subPos{pos + conditionalBegin.length() + conditionalInsert.length()};
521 
522  while(subPos < end) {
523  subPos = content.find(commentCharsToReplace, subPos);
524 
525  if(subPos > end) {
526  break;
527  }
528 
529  content.replace(subPos, commentCharsToReplace.length(), commentCharsReplaceBy);
530 
531  subPos += commentCharsReplaceBy.length();
532  }
533 
534  // jump to the end of the changed conditional comment
535  pos = end + conditionalEnd.length() + 2 * conditionalInsert.length();
536  }
537 
538  // replace remaining invalid end tags
539  pos = 0;
540 
541  while(pos < content.length()) {
542  pos = content.find(conditionalEnd, pos);
543 
544  if(pos == std::string::npos) {
545  break;
546  }
547 
548  content.insert(pos + conditionalInsertOffsetStrayEnd, conditionalInsert);
549  content.insert(
552  );
553 
554  pos += conditionalEnd.length() + 2 * conditionalInsert.length();
555  }
556  }
557 
558  // internal static helper function: replace invalid comments (<? ... ?>)
559  inline void XML::replaceInvalidComments(std::string& content) {
560  std::size_t pos{};
561 
562  while(pos < content.length()) {
563  // find next invalid comment
564  pos = content.find(invalidBegin, pos);
565 
566  if(pos == std::string::npos) {
567  break;
568  }
569 
570  // find end of invalid comment
571  const auto end{content.find(invalidEnd, pos + invalidBegin.length())};
572 
573  if(end == std::string::npos) {
574  break;
575  }
576 
577  // insert commenting to make comment valid (X)HTML
578  content.insert(pos + invalidInsertOffsetBegin, invalidInsertBegin);
579  content.insert(
580  // consider that "!--" has already been added
583  );
584 
585  // replace "--" inside new comment with "=="
586  auto subPos{pos + invalidBegin.length() + invalidInsertBegin.length()};
587 
588  while(subPos < end) {
589  subPos = content.find(commentCharsToReplace, subPos);
590 
591  if(subPos > end) {
592  break;
593  }
594 
595  content.replace(
596  subPos,
597  commentCharsToReplace.length(),
599  );
600 
601  subPos += commentCharsToReplace.length();
602  }
603 
604  // jump to the end of the changed comment
605  pos = end
606  + invalidEnd.length()
607  + invalidInsertBegin.length()
608  + invalidInsertEnd.length();
609  }
610  }
611 
612  // internal static helper function: remove XML processing instructions (<?xml:...>)
613  inline void XML::removeXmlProcessingInstructions(std::string& content) {
614  std::size_t pos{};
615 
616  while(pos < content.length()) {
617  pos = content.find(xmlInstructionBegin, pos);
618 
619  if(pos == std::string::npos) {
620  break;
621  }
622 
623  const auto end{
624  content.find(
626  pos + xmlInstructionBegin.length()
627  )
628  };
629 
630  if(end == std::string::npos) {
631  return;
632  }
633 
634  content.erase(pos, end - pos + xmlInstructionEnd.length());
635  }
636 
637  for(const auto& tag : xmlTags) {
638  pos = 0;
639 
640  while(pos < content.length()) {
641  pos = content.find(tag, pos);
642 
643  if(pos == std::string::npos) {
644  break;
645  }
646 
647  content.erase(pos, tag.length());
648  }
649  }
650  }
651 
652  // internal static helper function: check parsing result
653  inline void XML::checkResult(pugi::xml_parse_result result, const std::string& content) {
654  if(result) {
655  return;
656  }
657 
658  // parsing error
659  std::string errorString{"XML parsing error: "};
660 
661  errorString += result.description();
662  errorString += " at #";
663  errorString += std::to_string(result.offset);
664  errorString += " (";
665 
666  if(result.offset > 0) {
667  errorString += "'[...]";
668 
669  if(result.offset > numDebugCharacters) {
670  errorString += content.substr(
671  result.offset - numDebugCharacters,
673  );
674  }
675  else {
676  errorString += content.substr(0, result.offset);
677  }
678 
679  errorString += "[!!!]";
680 
681  if(content.size() > static_cast<std::size_t>(result.offset + numDebugCharacters)) {
682  errorString += "'[...]";
683  errorString += content.substr(result.offset, numDebugCharacters);
684  errorString += "[...]";
685  }
686  else {
687  errorString += "'[...]";
688  errorString += content.substr(result.offset);
689  }
690 
691  errorString += "').";
692  }
693 
694  throw XML::Exception(errorString);
695  }
696 
697 } /* namespace crawlservpp::Parsing */
698 
699 #endif /* PARSING_XML_HPP_ */
constexpr auto commentCharsReplaceBy
Characters used as replacement inside comments.
Definition: XML.hpp:102
constexpr auto commentCharsToReplace
Characters to be replaced inside comments.
Definition: XML.hpp:99
constexpr auto conditionalInsertOffsetBegin
Offset at which to insert at the beginning to make conditional comments valid.
Definition: XML.hpp:90
constexpr auto numDebugCharacters
The maximum number of characters to be shown in error messages.
Definition: XML.hpp:123
constexpr auto conditionalInsertOffsetEnd
Offset at which to insert at the end to make conditional comments valid.
Definition: XML.hpp:93
void tidyAndConvert(std::string &inOut, bool warnings, ulong numOfErrors, std::queue< std::string > &warningsTo)
Parse and tidy the given HTML markup and convert the result to XML.
Definition: HTML.hpp:171
constexpr auto xmlInstructionBegin
The beginning of an XML processing instruction.
Definition: XML.hpp:126
constexpr std::array xmlTags
Array containing additional XML markup tags to be removed.
Definition: XML.hpp:72
constexpr auto cDataBegin
The beginning of a CDATA element.
Definition: XML.hpp:75
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr auto invalidBegin
The beginning of an invalid comment.
Definition: XML.hpp:105
constexpr auto xmlBegin
The beginning of XML markup.
Definition: XML.hpp:69
constexpr auto invalidInsertOffsetEnd
Offset at which to insert at the end to make invalid comments valid.
Definition: XML.hpp:120
static void freeIf(bool isFree, T &target)
Frees memory early by swapping, if necessary.
Definition: Memory.hpp:52
constexpr auto invalidInsertEnd
Characters to be inserted at the end to make invalid comments valid.
Definition: XML.hpp:114
constexpr auto conditionalEnd
The end of a conditional comment.
Definition: XML.hpp:84
Implements an XPath query using the pugixml library.
Definition: XPath.hpp:74
Class for XML exceptions.
Definition: XML.hpp:207
std::string_view view() const noexcept
Gets the description of the exception as a view to the underlying string.
Definition: Exception.hpp:158
constexpr auto invalidInsertBegin
Characters to be inserted at the beginning to make invalid comments valid.
Definition: XML.hpp:111
constexpr auto conditionalBegin
The beginning of a conditional comment.
Definition: XML.hpp:81
Namespace for classes handling queries.
Definition: XML.hpp:51
Parses and cleans HTML markup.
Definition: HTML.hpp:71
constexpr auto conditionalInsertOffsetStrayEnd
Offset at which to insert into stray end tag left from conditional comment.
Definition: XML.hpp:96
Parses HTML markup into clean XML.
Definition: XML.hpp:149
constexpr auto invalidEnd
The end of an invalid comment.
Definition: XML.hpp:108
Namespace for classes parsing HTML, URIs, and XML.
Definition: HTML.hpp:42
constexpr auto invalidInsertOffsetBegin
Offset at which to insert at the beginning to make invalid comments valid.
Definition: XML.hpp:117
constexpr auto cDataEnd
The end of a CDATA element.
Definition: XML.hpp:78
constexpr auto xmlInstructionEnd
The end of an XML processing instruction.
Definition: XML.hpp:129
Class for HTML exceptions.
Definition: HTML.hpp:107
constexpr auto conditionalInsert
Characters to be inserted/replaced to make conditional comments valid.
Definition: XML.hpp:87