crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
HTML.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * HTML.hpp
24  *
25  * Parse HTML, tidy it up and convert it to %XML using tidy-html5.
26  *
27  * Created on: Feb 1, 2019
28  * Author: ans
29  */
30 
31 #ifndef PARSING_HTML_HPP_
32 #define PARSING_HTML_HPP_
33 
34 #include "../Main/Exception.hpp"
35 #include "../Wrapper/TidyDoc.hpp"
36 
37 #include <queue> // std::queue
38 #include <string> // std::string
39 #include <string_view> // std::string_view
40 
43 
44  /*
45  * CONSTANTS
46  */
47 
50 
//! The character encoding used by the tidy-html5 API.
// (passed as the value of the TidyOutCharEncoding option in HTML::tidyAndConvert)
52  inline constexpr std::string_view tidyEncoding{"utf8"};
53 
55 
56  /*
57  * DECLARATION
58  */
59 
61 
//! Parses and cleans HTML markup.
/*
 * Owns one tidy-html5 document (Wrapper::TidyDoc) and uses it to parse,
 * tidy and convert markup via tidyAndConvert().
 *
 * The class can be default-constructed, but neither copied nor moved
 * (all four copy/move operations are deleted below).
 *
 * NOTE(review): this listing skips several lines of the original header,
 * among them line 107, where the cross-reference index places the class
 * for HTML exceptions (HTML::Exception, presumably declared via
 * MAIN_EXCEPTION_CLASS()) -- confirm against the original file.
 */
71  class HTML {
72  private:
73  // for convenience
74  using TidyDoc = Wrapper::TidyDoc;
75 
76  public:
79 
// Default constructor.
81  HTML() = default;
82 
// Default destructor (virtual).
84  virtual ~HTML() = default;
85 
89 
// Parse and tidy the given HTML markup and convert the result to XML.
//  inOut       markup to process; overwritten with the result unless the result is empty
//  warnings    value for the TidyShowWarnings option
//  numOfErrors value for the TidyShowErrors option
//  warningsTo  queue handed to the parse, clean-up and output steps of the document
90  void tidyAndConvert(
91  std::string& inOut,
92  bool warnings,
93  ulong numOfErrors,
94  std::queue<std::string>& warningsTo
95  );
96 
98 
100 
108 
112 
// Deleted copy constructor.
115  HTML(HTML&) = delete;
116 
// Deleted move constructor.
118  HTML(HTML&&) = delete;
119 
// Deleted copy assignment operator.
121  HTML& operator=(HTML&) = delete;
122 
// Deleted move assignment operator.
124  HTML& operator=(HTML&&) = delete;
125 
127 
128  private:
// The tidy-html5 document that tidyAndConvert() configures and runs.
129  TidyDoc doc;
130  };
131 
132  /*
133  * IMPLEMENTATION
134  */
135 
137 
171  inline void HTML::tidyAndConvert(
172  std::string& inOut,
173  bool warnings,
174  ulong numOfErrors,
175  std::queue<std::string>& warningsTo
176  ) {
177  // set options
178  try {
179  this->doc.setOption(TidyXmlOut, true);
180  this->doc.setOption(TidyQuiet, true);
181  this->doc.setOption(TidyNumEntities, true);
182  this->doc.setOption(TidyMark, false);
183  this->doc.setOption(TidyShowWarnings, warnings);
184  this->doc.setOption(TidyForceOutput, true);
185  this->doc.setOption(TidyDropEmptyElems, false);
186  this->doc.setOption(TidyShowErrors, numOfErrors);
187  this->doc.setOption(TidyOutCharEncoding, std::string(tidyEncoding));
188 
189  this->doc.parse(inOut, warningsTo);
190  this->doc.cleanAndRepair(warningsTo);
191 
192  const auto output(this->doc.getOutput(warningsTo));
193 
194  if(output.empty()) {
195  return;
196  }
197 
198  inOut = output;
199  }
200  catch(const TidyDoc::Exception& e) {
201  // re-throw as HTML::Exception
202  throw HTML::Exception(e.what());
203  }
204  }
205 
206 } /* namespace crawlservpp::Parsing */
207 
208 #endif /* PARSING_HTML_HPP_ */
void setOption(TidyOptionId option, bool value)
Sets a boolean option.
Definition: TidyDoc.hpp:296
HTML & operator=(HTML &)=delete
Deleted copy assignment operator.
void tidyAndConvert(std::string &inOut, bool warnings, ulong numOfErrors, std::queue< std::string > &warningsTo)
Parse and tidy the given HTML markup and convert the result to XML.
Definition: HTML.hpp:171
constexpr std::string_view tidyEncoding
The character encoding used by the tidy-html5 API.
Definition: HTML.hpp:52
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
RAII wrapper for documents used by the tidy-html5 API.
Definition: TidyDoc.hpp:70
void parse(const std::string &in, std::queue< std::string > &warningsTo)
Parses the given markup.
Definition: TidyDoc.hpp:441
Parses and cleans HTML markup.
Definition: HTML.hpp:71
Namespace for classes parsing HTML, URIs, and XML.
Definition: HTML.hpp:42
std::string getOutput(std::queue< std::string > &warningsTo)
Gets the processed text from the tidy-html5 document.
Definition: TidyDoc.hpp:221
virtual ~HTML()=default
Default destructor.
Class for tidy-html5 document exceptions.
Definition: TidyDoc.hpp:118
void cleanAndRepair(std::queue< std::string > &warningsTo)
Cleans and repairs the previously parsed content of the underlying tidy-html5 document.
Definition: TidyDoc.hpp:505
Class for HTML exceptions.
Definition: HTML.hpp:107
HTML()=default
Default constructor.