crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
XPath.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021–2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * XPath.hpp
24  *
25  * Using the pugixml parser library to implement a XPath query with boolean,
26  * single and/or multiple results. A query is only created when needed.
27  *
28  * Created on: Oct 17, 2018
29  * Author: ans
30  */
31 
32 #ifndef QUERY_XPATH_HPP_
33 #define QUERY_XPATH_HPP_
34 
35 #include "../Helper/Strings.hpp"
36 #include "../Main/Exception.hpp"
37 #include "../Parsing/XML.hpp"
38 
39 #include <pugixml.hpp>
40 
41 #include <exception> // std::exception
42 #include <sstream> // std::ostringstream
43 #include <string> // std::string
44 #include <string_view> // std::string_view
45 #include <vector> // std::vector
46 
47 namespace crawlservpp::Query {
48 
49  /*
50  * CONSTANTS
51  */
52 
55 
//! The beginning of a CDATA tag.
inline constexpr std::string_view cDataHead = "<![CDATA[";

//! The end of a CDATA tag.
inline constexpr std::string_view cDataTail = "]]>";
61 
63 
64  /*
65  * DECLARATION
66  */
67 
69 
74  class XPath {
75  public:
78 
79  XPath(const std::string& xpath, bool textOnly);
80 
84 
85  [[nodiscard]] bool getBool(const Parsing::XML& doc) const;
86  void getFirst(const Parsing::XML& doc, std::string& resultTo) const;
87  void getAll(const Parsing::XML& doc, std::vector<std::string>& resultTo) const;
88  void getSubSets(const Parsing::XML& doc, std::vector<Parsing::XML>& resultTo) const;
89 
91 
93 
105 
106  private:
107  pugi::xpath_query query;
108  bool compiled;
109  bool isTextOnly;
110 
111  // static helper functions
112  [[nodiscard]] static std::string nodeToString(const pugi::xpath_node& node, bool textOnly);
113 
114  // sub-class for text-only conversion walker
115  class TextOnlyWalker : public pugi::xml_tree_walker {
116  public:
117  bool for_each(pugi::xml_node& node) override;
118  [[nodiscard]] std::string getResult() const;
119 
120  protected:
121  std::string result;
122  };
123  };
124 
125  /*
126  * IMPLEMENTATION
127  */
128 
130 
140  inline XPath::XPath(const std::string& xpath, bool textOnly) : compiled(false), isTextOnly(false) {
141  // create new XPath object
142  try {
143  this->query = pugi::xpath_query(xpath.c_str());
144  this->compiled = true;
145  }
146  catch(const pugi::xpath_exception& e) {
147  throw XPath::Exception(e.what());
148  }
149 
150  // save XPath option
151  this->isTextOnly = textOnly;
152  }
153 
155 
168  inline bool XPath::getBool(const Parsing::XML& doc) const {
169  // check query and content
170  if(!(this->compiled)) {
171  throw XPath::Exception("No query compiled");
172  }
173 
174  if(!(doc.doc)) {
175  throw XPath::Exception("No content parsed");
176  }
177 
178  // evaluate query with boolean result
179  try {
180  return this->query.evaluate_boolean(*(doc.doc));
181  }
182  catch(const std::exception& e) {
183  throw XPath::Exception(e.what());
184  }
185  }
186 
188 
202  inline void XPath::getFirst(const Parsing::XML& doc, std::string& resultTo) const {
203  // empty result
204  resultTo.clear();
205 
206  // check query and content
207  if(!(this->compiled)) {
208  throw XPath::Exception("No query compiled");
209  }
210 
211  if(!(doc.doc)) {
212  throw XPath::Exception("No content parsed");
213  }
214 
215  // evaluate query with string result
216  try {
217  if(this->query.return_type() == pugi::xpath_type_node_set) {
218  const auto nodeSet{this->query.evaluate_node_set(*(doc.doc))};
219 
220  if(nodeSet.empty()) {
221  resultTo = "";
222  }
223  else {
224  resultTo = XPath::nodeToString(nodeSet[0], this->isTextOnly);
225  }
226  }
227  else {
228  resultTo = this->query.evaluate_string(*(doc.doc));
229  }
230  }
231  catch(const std::exception& e) {
232  throw XPath::Exception(e.what());
233  }
234  }
235 
237 
251  inline void XPath::getAll(const Parsing::XML& doc, std::vector<std::string>& resultTo) const {
252  // empty result
253  resultTo.clear();
254 
255  // check query and content
256  if(!(this->compiled)) {
257  throw XPath::Exception("No query compiled");
258  }
259 
260  if(!(doc.doc)) {
261  throw XPath::Exception("No content parsed");
262  }
263 
264  // evaluate query with multiple string results
265  try {
266  if(this->query.return_type() == pugi::xpath_type_node_set) {
267  const auto nodeSet{this->query.evaluate_node_set(*(doc.doc))};
268 
269  resultTo.reserve(nodeSet.size());
270 
271  for(const auto& node : nodeSet) {
272  const auto result{XPath::nodeToString(node, this->isTextOnly)};
273 
274  if(!result.empty()) {
275  resultTo.emplace_back(result);
276  }
277  }
278  }
279  else {
280  const auto result{this->query.evaluate_string(*(doc.doc))};
281 
282  if(!result.empty()) {
283  resultTo.emplace_back(result);
284  }
285  }
286  }
287  catch(const std::exception& e) {
288  throw XPath::Exception(e.what());
289  }
290  }
291 
293 
310  inline void XPath::getSubSets(const Parsing::XML& doc, std::vector<Parsing::XML>& resultTo) const {
311  // empty result
312  resultTo.clear();
313 
314  // check query and content
315  if(!(this->compiled)) {
316  throw XPath::Exception("No query compiled");
317  }
318 
319  if(!(doc.doc)) {
320  throw XPath::Exception("No content parsed");
321  }
322 
323  // evaluate query with multiple results
324  try {
325  if(this->query.return_type() == pugi::xpath_type_node_set) {
326  const auto nodeSet{this->query.evaluate_node_set(*(doc.doc))};
327 
328  resultTo.reserve(nodeSet.size());
329 
330  for(const auto& node : nodeSet) {
331  if(node != nullptr) {
332  resultTo.emplace_back(node.node());
333  }
334  }
335  }
336  else {
337  throw XPath::Exception(
338  "Could not create subset, because the result of the query is no node set"
339  );
340  }
341  }
342  catch(const std::exception& e) {
343  throw XPath::Exception(e.what());
344  }
345  }
346 
347  // static helper function: convert node to string
348  inline std::string XPath::nodeToString(const pugi::xpath_node& node, bool textOnly) {
349  std::string result;
350 
351  if(node.attribute() != nullptr) {
352  result = node.attribute().as_string();
353  }
354  else if(node.node() != nullptr) {
355  if(textOnly) {
356  XPath::TextOnlyWalker walker;
357 
358  node.node().traverse(walker);
359 
360  result = walker.getResult();
361 
362  if(!result.empty()) {
363  result.pop_back();
364  }
365  }
366  else {
367  for(const auto& child : node.node().children()) {
368  std::ostringstream outStrStr;
369  std::string out;
370 
371  child.print(outStrStr, "", 0);
372 
373  out = outStrStr.str();
374 
375  // parse CDATA
376  if(
377  out.length() > cDataHead.length() + cDataTail.length()
378  && out.substr(0, cDataHead.length()) == cDataHead
379  && out.substr(out.length() - cDataTail.length()) == cDataTail
380  ) {
381  out = out.substr(
382  cDataHead.length(),
383  out.length() - cDataHead.length() - cDataTail.length()
384  );
385  }
386 
387  result += out;
388  }
389  }
390  }
391 
392  return result;
393  }
394 
395  // XML walker for text-only conversion helper functions
396  inline bool XPath::TextOnlyWalker::for_each(pugi::xml_node& node) {
397  if(node.type() == pugi::node_pcdata) {
398  std::string nodeText(node.text().as_string());
399 
400  Helper::Strings::trim(nodeText);
401 
402  this->result += nodeText;
403 
404  this->result.push_back(' ');
405  }
406 
407  return true;
408  }
409 
410  // get result from XML walker
411  inline std::string XPath::TextOnlyWalker::getResult() const {
412  return this->result;
413  }
414 
415 } /* namespace crawlservpp::Query */
416 
417 #endif /* QUERY_XPATH_HPP_ */
Class for XPath exceptions.
Definition: XPath.hpp:104
void getAll(const Parsing::XML &doc, std::vector< std::string > &resultTo) const
Gets all matches from performing the query on a parsed XML document.
Definition: XPath.hpp:251
bool getBool(const Parsing::XML &doc) const
Gets a boolean result from performing the query on a parsed XML document.
Definition: XPath.hpp:168
XPath(const std::string &xpath, bool textOnly)
Constructor setting a XPath string and whether the result should be text-only.
Definition: XPath.hpp:140
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
void trim(std::string &stringToTrim)
Removes whitespaces around a string.
Definition: Strings.hpp:360
void getFirst(const Parsing::XML &doc, std::string &resultTo) const
Gets the first match from performing the query on a parsed XML document.
Definition: XPath.hpp:202
Implements a XPath query using the pugixml library.
Definition: XPath.hpp:74
constexpr std::string_view cDataTail
The end of a CDATA tag.
Definition: XPath.hpp:60
constexpr std::string_view cDataHead
The beginning of a CDATA tag.
Definition: XPath.hpp:57
Namespace for classes handling queries.
Definition: XML.hpp:51
Parses HTML markup into clean XML.
Definition: XML.hpp:149
void getSubSets(const Parsing::XML &doc, std::vector< Parsing::XML > &resultTo) const
Gets all matching subsets from performing the query on a parsed XML document.
Definition: XPath.hpp:310