crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TidyDoc.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020–2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TidyDoc.hpp
24  *
25  * RAII wrapper for tidy-html5 documents.
26  *
27  * Created on: Feb 7, 2019
28  * Author: ans
29  */
30 
31 #ifndef WRAPPER_TIDYDOC_HPP_
32 #define WRAPPER_TIDYDOC_HPP_
33 
34 #include "TidyBuffer.hpp"
35 
36 #include "../Helper/SilentInclude/tidy.h"
37 #include "../Helper/Strings.hpp"
38 #include "../Main/Exception.hpp"
39 
40 #include <queue> // std::queue
41 #include <string> // std::string, std::to_string
42 
43 namespace crawlservpp::Wrapper {
44 
45  /*
46  * DECLARATION
47  */
48 
50 
70  class TidyDoc {
71  public:
74 
75  TidyDoc();
76  virtual ~TidyDoc();
77 
81 
82  [[nodiscard]] std::string getOutput(std::queue<std::string>& warningsTo);
83 
87 
88  void setOption(TidyOptionId option, bool value);
89  void setOption(TidyOptionId option, int value);
90  void setOption(TidyOptionId option, ulong value);
91  void setOption(TidyOptionId option, const std::string& value);
92 
96 
97  void parse(const std::string& in, std::queue<std::string>& warningsTo);
98  void cleanAndRepair(std::queue<std::string>& warningsTo);
99 
101 
103 
119 
123 
126  TidyDoc(TidyDoc&) = delete;
127 
129  TidyDoc& operator=(TidyDoc&) = delete;
130 
132  TidyDoc(TidyDoc&&) = delete;
133 
135  TidyDoc& operator=(TidyDoc&&) = delete;
136 
138 
139  private:
140  bool created{false};
141 
142  ::TidyDoc doc;
143 
144  TidyBuffer errors;
145 
146  // static helper function
147  [[nodiscard]] static bool isVersionBelow5_7_18();
148  };
149 
150  /*
151  * IMPLEMENTATION
152  */
153 
154  /*
155  * CONSTRUCTION AND DESTRUCTION
156  */
157 
159 
166  inline TidyDoc::TidyDoc() {
167  // set language manually to avoid locale bug (https://github.com/crawlserv/crawlservpp/issues/164)
168  if(TidyDoc::isVersionBelow5_7_18()) {
169  tidySetLanguage(tidyGetLanguage());
170  }
171 
172  // create document
173  this->doc = tidyCreate();
174  this->created = true;
175 
176  // set error buffer
177  if(tidySetErrorBuffer(this->doc, this->errors.get()) != 0) {
178  throw Exception("Could not set error buffer");
179  }
180  }
181 
183  inline TidyDoc::~TidyDoc() {
184  if(this->created) {
185  tidyRelease(this->doc);
186  }
187 
188  this->created = false;
189  }
190 
191  /*
192  * GETTER
193  */
194 
196 
221  inline std::string TidyDoc::getOutput(std::queue<std::string>& warningsTo) {
222  TidyBuffer buffer;
223 
224  switch(tidySaveBuffer(this->doc, buffer.get())) {
225  case 0:
226  // everything went fine
227  break;
228 
229  case 1:
230  case 2:
231  // warnings or errors occured
232  if(this->errors.valid()) {
233  std::queue<std::string> warnings(
235  this->errors.getString(),
236  '\n',
237  true
238  )
239  );
240 
241  while(!warnings.empty()) {
242  warningsTo.emplace(warnings.front());
243 
244  warnings.pop();
245  }
246 
247  this->errors.clear();
248  }
249 
250  break;
251 
252  default:
253  // fatal errors occured
254  if(this->errors.valid() && !(this->errors.empty())) {
255  throw Exception("Could not write to buffer: " + errors.getString());
256  }
257 
258  throw Exception("Could not write to buffer");
259  }
260 
261  if(buffer.valid() && !buffer.empty()) {
262  return buffer.getString();
263  }
264 
265  return std::string();
266  }
267 
268  /*
269  * SETTERS
270  */
271 
273 
296  inline void TidyDoc::setOption(TidyOptionId option, bool value) {
297  if(tidyOptSetBool(this->doc, option, value ? yes : no) == 0) {
298  throw Exception(
299  "Could not set tidy option #"
300  + std::to_string(option)
301  + " to boolean "
302  + (value ? "yes" : "no")
303  );
304  }
305  }
306 
308 
329  inline void TidyDoc::setOption(TidyOptionId option, int value) {
330  if(tidyOptSetInt(this->doc, option, value) == 0) {
331  throw Exception(
332  "Could not set tidy option #"
333  + std::to_string(option)
334  + " to integer "
335  + std::to_string(value)
336  );
337  }
338  }
339 
341 
362  inline void TidyDoc::setOption(TidyOptionId option, ulong value) {
363  if(tidyOptSetInt(this->doc, option, value) == 0) {
364  throw Exception(
365  "Could not set tidy option #"
366  + std::to_string(option)
367  + " to unsigned integer "
368  + std::to_string(value)
369  );
370  }
371  }
372 
374 
395  inline void TidyDoc::setOption(TidyOptionId option, const std::string& value) {
396  if(tidyOptSetValue(this->doc, option, value.c_str()) == 0) {
397  throw Exception(
398  "Could not set tidy option #"
399  + std::to_string(option)
400  + " to string \""
401  + value
402  + "\""
403  );
404  }
405  }
406 
407  /*
408  * PARSING AND CLEANUP
409  */
410 
412 
441  inline void TidyDoc::parse(const std::string& in, std::queue<std::string>& warningsTo) {
442  switch(tidyParseString(this->doc, in.c_str())) {
443  case 0:
444  // everything went fine
445  break;
446 
447  case 1:
448  case 2:
449  // warnings or errors occured
450  if(this->errors.valid()) {
451  std::queue<std::string> warnings(
453  this->errors.getString(),
454  '\n',
455  true
456  )
457  );
458 
459  while(!warnings.empty()) {
460  warningsTo.emplace(warnings.front());
461 
462  warnings.pop();
463  }
464 
465  this->errors.clear();
466  }
467 
468  break;
469 
470  default:
471  // errors occured
472  if(this->errors.valid() && !(this->errors.empty())) {
473  throw Exception("Could not parse HTML: " + errors.getString());
474  }
475 
476  throw Exception("Could not parse HTML");
477  }
478  }
479 
481 
505  inline void TidyDoc::cleanAndRepair(std::queue<std::string>& warningsTo) {
506  switch(tidyCleanAndRepair(this->doc)) {
507  case 0:
508  // everything went fine
509  break;
510 
511  case 1:
512  case 2:
513  // warnings or errors occured
514  if(this->errors.valid()) {
515  std::queue<std::string> warnings(
517  this->errors.getString(),
518  '\n',
519  true
520  )
521  );
522 
523  while(!warnings.empty()) {
524  warningsTo.emplace(warnings.front());
525 
526  warnings.pop();
527  }
528 
529  this->errors.clear();
530  }
531 
532  break;
533 
534  default:
535  // fatal errors occured
536  if(this->errors.valid() && !(this->errors.empty())) {
537  throw Exception("Could not clean and repair HTML: " + errors.getString());
538  }
539 
540  throw Exception("Could not clean and repair HTML");
541  }
542  }
543 
544  // static helper function checking whether the library version is below 5.7.18
545  inline bool TidyDoc::isVersionBelow5_7_18() {
546  constexpr auto firstDotPosition{1};
547  constexpr auto secondDotPosition{3};
548  constexpr auto length{6};
549 
550  const std::string version(tidyLibraryVersion());
551 
552  if(version.substr(0, firstDotPosition) > "5") {
553  return false; // version 6.0 or higher
554  }
555 
556  if(version.size() > firstDotPosition && version[firstDotPosition] != '.') {
557  return false; // version 10.0 or higher
558  }
559 
560  if(version.substr(0, secondDotPosition) > "5.7") {
561  return false; // version 5.8 or higher
562  }
563 
564  if(version.substr(0, firstDotPosition) < "5") {
565  return true; // version 4.x or lower
566  }
567 
568  if(version.size() > secondDotPosition && version[secondDotPosition] != '.') {
569  return false; // version 5.10 or higher
570  }
571 
572  if(version >= "5.7.18" && version.size() >= length) {
573  return false; // version 5.7.18 or higher
574  }
575 
576  if(version.substr(0, secondDotPosition) < "5.7") {
577  return true; // version 5.6 or lower
578  }
579 
580  if(version.size() > length) {
581  return false; // version 5.7.100 or higher
582  }
583 
584  return true; // version 5.7.17 or lower
585  }
586 
587 } /* namespace crawlservpp::Wrapper */
588 
589 #endif /* WRAPPER_TIDYDOC_HPP_ */
::TidyBuffer * get() noexcept
Gets a pointer to the underlying buffer.
Definition: TidyBuffer.hpp:119
bool valid() const noexcept
Checks whether the underlying buffer is valid.
Definition: TidyBuffer.hpp:155
bool empty() const noexcept
Checks whether the underlying buffer is empty.
Definition: TidyBuffer.hpp:195
virtual ~TidyDoc()
Destructor releasing the underlying tidy-html5 document.
Definition: TidyDoc.hpp:183
void setOption(TidyOptionId option, bool value)
Sets a boolean option.
Definition: TidyDoc.hpp:296
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
RAII wrapper for documents used by the tidy-html5 API.
Definition: TidyDoc.hpp:70
RAII wrapper for buffers used by the tidy-html5 API.
Definition: TidyBuffer.hpp:53
std::queue< std::string > splitToQueue(std::string_view str, char delimiter, bool removeEmpty)
Splits a string into a queue of strings using the given delimiter.
Definition: Strings.hpp:794
std::string getString() const noexcept
Copies the content of the underlying buffer into a string.
Definition: TidyBuffer.hpp:136
void clear() noexcept
Frees the underlying buffer.
Definition: TidyBuffer.hpp:209
Namespace for RAII wrappers and Wrapper::Database.
Definition: Database.hpp:109
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
TidyDoc()
Constructor creating an empty tidy-html5 document.
Definition: TidyDoc.hpp:166
void parse(const std::string &in, std::queue< std::string > &warningsTo)
Parses the given markup.
Definition: TidyDoc.hpp:441
TidyDoc & operator=(TidyDoc &)=delete
Deleted copy assignment operator.
std::string getOutput(std::queue< std::string > &warningsTo)
Gets the processed text from the tidy-html5 document.
Definition: TidyDoc.hpp:221
Class for tidy-html5 document exceptions.
Definition: TidyDoc.hpp:118
void cleanAndRepair(std::queue< std::string > &warningsTo)
Cleans and repairs the previously parsed content of the underlying tidy-html5 document.
Definition: TidyDoc.hpp:505