crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
URI.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * URI.hpp
24  *
25  * Parser for RFC 3986 URIs that can also analyze their relationships with each other.
26  *
27  * Created on: Oct 18, 2018
28  * Author: ans
29  */
30 
31 #ifndef PARSING_URI_HPP_
32 #define PARSING_URI_HPP_
33 
34 #include "../Helper/Strings.hpp"
35 #include "../Main/Exception.hpp"
36 #include "../Wrapper/URI.hpp"
37 #include "../Wrapper/URIQueryList.hpp"
38 
39 #include <uriparser/Uri.h>
40 
41 #if URI_VER_MINOR < 9
42 #error "uriparser version 0.9.0 or higher required!"
43 #endif
44 
45 #include <algorithm> // std::find
46 #include <cstddef> // std::size_t
47 #include <memory> // std::make_unique
48 #include <string> // std::string, std::to_string
49 #include <string_view> // std::string_view
50 #include <vector> // std::vector
51 
52 namespace crawlservpp::Parsing {
53 
54  /*
55  * CONSTANTS
56  */
57 
60 
// Maximum length of a URL-escaped character; URI::escape() multiplies the input size by it to size its output buffer.
62  inline constexpr auto maxEscapedCharLength{6};
63 
65 
66  /*
67  * DECLARATION
68  */
69 
71 
// Parser for RFC 3986 URIs that can also analyze their relationships with each other.
// NOTE(review): this listing elides some original lines (e.g. the declaration of URI::Exception,
//  which is thrown by the member functions below) -- confirm against the full header.
75  class URI {
76  public:
79 
// getters: inspect the most recently parsed link
80  [[nodiscard]] bool isSameDomain() const;
81  [[nodiscard]] std::string getSubUri() const;
82  [[nodiscard]] std::string getSubUri(const std::vector<std::string>& args, bool whiteList) const;
83 
87 
// setters: define the domain and the origin against which links are resolved
88  void setCurrentDomain(std::string_view currentDomain);
89  void setCurrentOrigin(std::string_view baseUri);
90 
94 
// parsing: returns false if the link is empty after removing its anchor and trimming it
95  bool parseLink(std::string_view uriToParse);
96 
100 
// public static helper functions for (un-)escaping and for making URIs absolute
101  static std::string escape(std::string_view string, bool plusSpace);
102  static std::string unescape(std::string_view string, bool plusSpace);
103  static std::string escapeUri(std::string_view uriToEscape);
104  static void makeAbsolute(std::string_view uriBase, std::vector<std::string>& uris);
105 
107 
109 
131 
132  private:
133  // internal strings
// (the parsed URI structures refer to the memory of 'current' and 'link', which therefore need to be kept in-class)
134  std::string domain;
135  std::string subUri;
136  std::string current;
137  std::string link;
138 
139  // specifies whether the current website is cross-domain
// (set by setCurrentDomain() when called with an empty domain)
140  bool crossDomain{false};
141 
142  // base and current URI
143  Wrapper::URI base;
144  Wrapper::URI uri;
145 
146  // private static helper functions
147  static std::string textRangeToString(const UriTextRangeA& range);
148  static std::string toString(const Wrapper::URI& src);
149  };
150 
151  /*
152  * IMPLEMENTATION
153  */
154 
156 
164  inline bool URI::isSameDomain() const {
165  if(this->crossDomain) {
166  return true;
167  }
168 
169  if(!(this->uri.valid())) {
170  throw URI::Exception(
171  "Parsing::URI::isSameDomain():"
172  " No URI has been parsed"
173  );
174  }
175 
176  return URI::textRangeToString(this->uri.getc()->hostText) == this->domain;
177  }
178 
180 
196  inline std::string URI::getSubUri() const {
197  return this->getSubUri(std::vector<std::string>(), false);
198  }
199 
201 
225  inline std::string URI::getSubUri(const std::vector<std::string>& args, bool whiteList) const {
226  if(this->domain.empty()) {
227  throw URI::Exception(
228  "Parsing::URI::getSubUri():"
229  " No domain has been specified or parsed"
230  );
231  }
232 
233  if(!(this->uri.valid())) {
234  throw URI::Exception(
235  "Parsing::URI::getSubUri():"
236  " No URI has been parsed"
237  );
238  }
239 
240  Wrapper::URIQueryList queryList;
241  const UriQueryListA * queryNext{nullptr};
242  int queryCount{};
243  std::string queries;
244 
245  // get query string
246  if(this->uri.getc()->query.first != this->uri.getc()->query.afterLast) {
247  if(
248  uriDissectQueryMallocA(
249  queryList.getPtr(),
250  &queryCount,
251  this->uri.getc()->query.first,
252  this->uri.getc()->query.afterLast
253  )
254  == URI_SUCCESS
255  ) {
256  queryNext = queryList.getc();
257 
258  while(queryNext != nullptr) {
259  if(
260  (
261  whiteList
262  && std::find(
263  args.cbegin(),
264  args.cend(),
265  queryNext->key
266  ) != args.cend()
267  ) || (
268  !whiteList
269  && std::find(
270  args.cbegin(),
271  args.cend(),
272  queryNext->key
273  ) == args.cend()
274  )
275  ) {
276  queries += queryNext->key;
277 
278  if(queryNext->value != nullptr) {
279  queries += "=" + std::string(queryNext->value);
280  }
281 
282  queries += "&";
283  }
284 
285  queryNext = queryNext->next;
286  }
287  }
288  }
289 
290  if(!queries.empty()) {
291  queries.pop_back();
292  }
293 
294  // construct URI (starting with slash if sub-URI or with domain if website is cross-domain)
295  UriPathSegmentStructA * nextSegment = this->uri.getc()->pathHead;
296 
297  std::string result;
298 
299  if(this->crossDomain) {
300  result = URI::textRangeToString(this->uri.getc()->hostText);
301 
302  if(result.empty()) {
303  return "";
304  }
305  }
306 
307  while(nextSegment != nullptr) {
308  result += "/" + URI::unescape(URI::textRangeToString(nextSegment->text), false);
309 
310  nextSegment = nextSegment->next;
311  }
312 
313  // add queries
314  if(!queries.empty()) {
315  result += "?" + queries;
316  }
317 
318  return result;
319  }
320 
322 
328  inline void URI::setCurrentDomain(std::string_view currentDomain) {
329  if(currentDomain.empty()) {
330  this->domain.clear();
331 
332  this->crossDomain = true; // domain needs to be parsed from current URI
333  }
334  else {
335  this->domain = URI::escapeUri(currentDomain);
336  }
337  }
338 
340 
360  inline void URI::setCurrentOrigin(std::string_view baseUri) {
361  std::string parsedSubUri;
362 
363  // if website is cross-domain, get the current domain from the URI
364  if(this->crossDomain) {
365  const auto domainEnd{baseUri.find('/')};
366 
367  if(domainEnd == std::string::npos) {
368  this->setCurrentDomain(baseUri);
369 
370  parsedSubUri = "/";
371  }
372  else {
373  this->setCurrentDomain(baseUri.substr(0, domainEnd));
374 
375  parsedSubUri = baseUri.substr(domainEnd);
376  }
377  }
378  else {
379  parsedSubUri = baseUri;
380  }
381 
382  // error checking
383  if(this->domain.empty()) {
384  throw URI::Exception(
385  "Parsing::URI::setCurrentOrigin():"
386  " No domain has been specified or parsed"
387  );
388  }
389 
390  if(parsedSubUri.empty()) {
391  throw URI::Exception(
392  "Parsing::URI::setCurrentOrigin():"
393  " Parsed sub-URI is empty"
394  );
395  }
396 
397  if(parsedSubUri.at(0) != '/') {
398  throw URI::Exception(
399  "Parsing::URI::setCurrentOrigin():"
400  " Parsed sub-URI does not start with slash ('/')"
401  );
402  }
403 
404  // escape and set current sub-URI
405  this->subUri = URI::escapeUri(parsedSubUri);
406 
407  // create current URI string
408  this->current = "https://" + this->domain + this->subUri;
409 
410  // create new base URI
411  this->base.create();
412 
413  // parse (current) base URI
414  const char * errorPos{nullptr};
415  const auto errorCode{
416  uriParseSingleUriA(
417  this->base.get(),
418  this->current.c_str(),
419  &errorPos
420  )
421  };
422 
423  if(errorCode != URI_SUCCESS) {
424  const std::string end(
425  errorPos,
426  baseUri.size() - (errorPos - baseUri.data())
427  );
428 
429  std::string errorString{
430  "Parsing::URI::setCurrentOrigin():"
431  " URI Parser error #"
432  };
433 
434  errorString += std::to_string(errorCode);
435  errorString += ": '";
436 
437  if(end.size() < this->current.size()) {
438  errorString += this->current.substr(0, this->current.size() - end.size());
439  errorString += "[!!!]";
440  errorString += end;
441  }
442  else {
443  errorString += this->current + "[!!!]";
444  }
445 
446  errorString += "'";
447 
448  throw URI::Exception(errorString);
449  }
450  }
451 
453 
479  inline bool URI::parseLink(std::string_view uriToParse) {
480  // reset old URI if necessary
481  this->uri.clear();
482 
483  // error checking
484  if(this->domain.empty()) {
485  throw URI::Exception(
486  "Parsing::URI::parseLink():"
487  " No domain has been specified or parsed"
488  );
489  }
490 
491  if(this->subUri.empty()) {
492  throw URI::Exception(
493  "Parsing::URI::parseLink():"
494  " No sub-URI has been parsed"
495  );
496  }
497 
498  // copy URI
499  std::string linkCopy(uriToParse);
500 
501  // remove anchor if necessary
502  const auto end{linkCopy.find('#')};
503 
504  if(end != std::string::npos && linkCopy.size() > end) {
505  if(end > 0) {
506  linkCopy = linkCopy.substr(0, end);
507  }
508  else {
509  linkCopy = "";
510  }
511  }
512 
513  // trim and escape URI
514  Helper::Strings::trim(linkCopy);
515 
516  linkCopy = URI::escapeUri(linkCopy);
517 
518  // check for empty link
519  if(linkCopy.empty()) {
520  return false;
521  }
522 
523  // create new URI
524  this->uri.create();
525 
526  // create temporary URI for relative link
527  Wrapper::URI relativeSource;
528 
529  relativeSource.create();
530 
531  // NOTE: URI needs to be stored in-class BEFORE PARSING to provide long-term access by the parsing library
532  // (otherwise the URI would be out of scope for the library after leaving this member function)
533  this->link.swap(linkCopy);
534 
535  // parse relative link
536  const char * errorPos{nullptr};
537  const auto errorCode{
538  uriParseSingleUriA(
539  relativeSource.get(),
540  this->link.c_str(),
541  &errorPos
542  )
543  };
544 
545  if(errorCode != URI_SUCCESS) {
546  const std::string rest(errorPos);
547 
548  std::string errorString{
549  "Parsing::URI::parseLink():"
550  " URI Parser error #"
551  };
552 
553  errorString += std::to_string(errorCode);
554  errorString += ": '";
555 
556  if(rest.size() < this->link.size()) {
557  errorString += this->link.substr(0, this->link.size() - rest.size());
558  errorString += "[!!!]";
559  errorString += rest;
560  }
561  else {
562  errorString += this->link;
563  errorString += "[!!!]";
564  }
565 
566  errorString += "'";
567 
568  throw URI::Exception(errorString);
569  }
570 
571  // resolve reference
572  if(
573  uriAddBaseUriExA(
574  this->uri.get(),
575  relativeSource.getc(),
576  this->base.getc(),
577  URI_RESOLVE_IDENTICAL_SCHEME_COMPAT
578  ) != URI_SUCCESS
579  ) {
580  throw URI::Exception(
581  "Parsing::URI::parseLink():"
582  " Reference resolving failed for '"
583  + URI::toString(relativeSource)
584  + "'"
585  );
586  }
587 
588  // normalize URI
589  const auto dirtyParts{
590  uriNormalizeSyntaxMaskRequiredA(
591  this->uri.getc()
592  )
593  };
594 
595  if(
596  dirtyParts != URI_NORMALIZED
597  && uriNormalizeSyntaxExA(
598  this->uri.get(),
599  dirtyParts
600  ) != URI_SUCCESS
601  ) {
602  throw URI::Exception(
603  "Parsing::URI::parseLink():"
604  " Normalizing failed for '"
605  + URI::toString(this->uri)
606  + "'"
607  );
608  }
609 
610  return true;
611  }
612 
614 
622  inline std::string URI::escape(std::string_view string, bool plusSpace) {
623  auto cString{
624  //NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, hicpp-avoid-c-arrays, modernize-avoid-c-arrays)
625  std::make_unique<char[]>(
626  string.size() * maxEscapedCharLength + 1
627  )
628  };
629 
630  uriEscapeExA(
631  string.data(),
632  string.data() + string.size(),
633  cString.get(),
634  static_cast<UriBool>(plusSpace),
635  0
636  );
637 
638  return std::string(cString.get());
639  }
640 
642 
651  inline std::string URI::unescape(std::string_view string, bool plusSpace) {
652  if(string.empty()) {
653  return std::string();
654  }
655 
656  auto cString{
657  //NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, hicpp-avoid-c-arrays, modernize-avoid-c-arrays)
658  std::make_unique<char[]>(
659  string.size() + 1
660  )
661  };
662 
663  for(std::size_t n{}; n < string.size(); ++n) {
664  cString[n] = string.at(n);
665  }
666 
667  cString[string.size()] = '\0';
668 
669  uriUnescapeInPlaceExA(
670  cString.get(),
671  static_cast<UriBool>(plusSpace),
672  URI_BR_DONT_TOUCH
673  );
674 
675  return std::string(cString.get());
676  }
677 
679 
689  inline std::string URI::escapeUri(std::string_view uriToEscape) {
690  std::string result;
691  std::size_t pos{};
692 
693  while(pos < uriToEscape.size()) {
694  auto end{uriToEscape.find_first_of(";/?:@=&#%", pos)};
695 
696  if(end == std::string::npos) {
697  end = uriToEscape.size();
698  }
699 
700  if(end - pos > 0) {
701  const std::string part(uriToEscape, pos, end - pos);
702 
703  result += URI::escape(part, false);
704  }
705 
706  if(end < uriToEscape.size()) {
707  result += uriToEscape.at(end);
708  }
709 
710  pos = end + 1;
711  }
712 
713  // replace % with %25 if not followed by a two-digit hexadecimal number
715 
716  return result;
717  }
718 
720 
734  inline void URI::makeAbsolute(std::string_view uriBase, std::vector<std::string>& uris) {
735  // create base URI
736  Wrapper::URI baseUri;
737 
738  baseUri.create();
739 
740  // parse base URI
741  const char * errorPos{nullptr};
742  const auto errorCode{
743  uriParseSingleUriExA(
744  baseUri.get(),
745  uriBase.data(),
746  uriBase.data() + uriBase.size(),
747  &errorPos
748  )
749  };
750 
751  if(errorCode != URI_SUCCESS) {
752  const std::string end(
753  errorPos,
754  uriBase.size() - (errorPos - uriBase.data())
755  );
756 
757  std::string errorString{
758  "Parsing::URI::makeAbsolute():"
759  " URI Parser error #"
760  };
761 
762  errorString += std::to_string(errorCode);
763  errorString += ": '";
764 
765  if(end.size() < uriBase.size()) {
766  errorString += uriBase.substr(0, uriBase.size() - end.size());
767  errorString += "[!!!]";
768  errorString += end;
769  }
770  else {
771  errorString += uriBase;
772  errorString += "[!!!]";
773  }
774 
775  errorString += "'";
776 
777  throw URI::Exception(errorString);
778  }
779 
780  // go through (possibly) relative URIs
781  std::vector<std::string> result;
782 
783  for(const auto& relUri : uris) {
784  // skip empty URIs
785  if(relUri.empty()) {
786  continue;
787  }
788 
789  // create relative URI
790  Wrapper::URI relativeSource;
791 
792  relativeSource.create();
793 
794  // create absolute URI
795  Wrapper::URI absoluteDest;
796 
797  absoluteDest.create();
798 
799  // parse relative URI
800  if(uriParseSingleUriA(
801  relativeSource.get(),
802  relUri.c_str(),
803  nullptr
804  ) != URI_SUCCESS) {
805  // ignore single URIs that cannot be parsed
806  continue;
807  }
808 
809  // resolve reference
810  if(uriAddBaseUriExA(
811  absoluteDest.get(),
812  relativeSource.getc(),
813  baseUri.getc(),
814  URI_RESOLVE_IDENTICAL_SCHEME_COMPAT
815  ) != URI_SUCCESS) {
816  // ignore single URIs that cannot be resolved
817  continue;
818  }
819 
820  // normalize absolute URI
821  const auto dirtyParts{
822  uriNormalizeSyntaxMaskRequiredA(
823  absoluteDest.getc()
824  )
825  };
826 
827  if(
828  dirtyParts != URI_NORMALIZED
829  && uriNormalizeSyntaxExA(
830  absoluteDest.get(),
831  dirtyParts
832  ) != URI_SUCCESS
833  ) {
834  // ignore single URIs that cannot be normalized
835  continue;
836  }
837 
838  // add normalized absolute URI
839  result.emplace_back(URI::toString(absoluteDest));
840  }
841 
842  // swap (possibly) relative with absolute URIs
843  result.swap(uris);
844  }
845 
846  // private static helper function: convert URITextRangeA to std::string
847  inline std::string URI::textRangeToString(const UriTextRangeA& range) {
848  if(
849  range.first == nullptr
850  || *(range.first) == 0
851  || range.afterLast == nullptr
852  || range.afterLast <= range.first
853  ) {
854  return std::string();
855  }
856 
857  return std::string(range.first, range.afterLast - range.first);
858  }
859 
860  // private static helper function: convert URI to string
861  inline std::string URI::toString(const Wrapper::URI& src) {
862  if(!src.valid()) {
863  return std::string();
864  }
865 
866  int charsRequired{};
867 
868  if(
869  uriToStringCharsRequiredA(
870  src.getc(),
871  &charsRequired
872  ) != URI_SUCCESS
873  ) {
874  throw URI::Exception(
875  "Parsing::URI::toString():"
876  " Could not convert URI to string,"
877  " because uriToStringCharsRequiredA(...)"
878  " failed"
879  );
880  }
881 
882  auto uriCString{
883  //NOLINTNEXTLINE(cppcoreguidelines-avoid-c-arrays, hicpp-avoid-c-arrays, modernize-avoid-c-arrays)
884  std::make_unique<char[]>(
885  charsRequired + 1
886  )
887  };
888 
889  if(
890  uriToStringA(
891  uriCString.get(),
892  src.getc(),
893  charsRequired + 1,
894  nullptr
895  ) != URI_SUCCESS
896  ) {
897  throw URI::Exception(
898  "Parsing::URI::toString():"
899  " Could not convert URI to string,"
900  " because uriToStringA(...) failed"
901  );
902  }
903 
904  return std::string(uriCString.get());
905  }
906 
907 } /* namespace crawlservpp::Parsing */
908 
909 #endif /* PARSING_URI_HPP_ */
void create()
Creates a new and empty URI.
Definition: URI.hpp:160
static void makeAbsolute(std::string_view uriBase, std::vector< std::string > &uris)
Public static helper function making a set of (possibly) relative URIs absolute.
Definition: URI.hpp:734
const UriUriA * getc() const noexcept
Gets a const pointer to the underlying URI structure.
Definition: URI.hpp:139
static std::string unescape(std::string_view string, bool plusSpace)
Public static helper function URI-unescaping a string.
Definition: URI.hpp:651
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
void encodePercentage(std::string &stringToEncode)
Encodes percentage signs that are not followed by a two-digit hexadecimal number with %25...
Definition: Strings.hpp:977
void trim(std::string &stringToTrim)
Removes whitespaces around a string.
Definition: Strings.hpp:360
void setCurrentDomain(std::string_view currentDomain)
Sets the current domain.
Definition: URI.hpp:328
UriQueryListA ** getPtr() noexcept
Gets a pointer to the pointer containing the address of the underlying query list.
Definition: URIQueryList.hpp:147
const UriQueryListA * getc() const noexcept
Gets a const pointer to the underlying query list.
Definition: URIQueryList.hpp:136
Class for URI exceptions.
Definition: URI.hpp:130
void setCurrentOrigin(std::string_view baseUri)
Sets the current origin.
Definition: URI.hpp:360
bool isSameDomain() const
Checks whether the parsed URI links to the current domain.
Definition: URI.hpp:164
static std::string escape(std::string_view string, bool plusSpace)
Public static helper function URI-escaping a string.
Definition: URI.hpp:622
bool valid() const noexcept
Checks whether the URI is valid.
Definition: URI.hpp:148
Namespace for classes parsing HTML, URIs, and XML.
Definition: HTML.hpp:42
static std::string escapeUri(std::string_view uriToEscape)
Public static helper function escaping a URI, but leaving reserved characters intact.
Definition: URI.hpp:689
bool parseLink(std::string_view uriToParse)
Parses a link, either absolute or into a sub-URI.
Definition: URI.hpp:479
std::string getSubUri() const
Gets the sub-URI for the current URI.
Definition: URI.hpp:196
RAII wrapper for the RFC 3986 URI structure used by uriparser.
Definition: URI.hpp:57
UriUriA * get() noexcept
Gets a pointer to the underlying URI structure.
Definition: URI.hpp:131
Parser for RFC 3986 URIs that can also analyze their relationships with each other.
Definition: URI.hpp:75
void clear()
Frees the current URI.
Definition: URI.hpp:172
constexpr auto maxEscapedCharLength
Maximum length of a URL-escaped character.
Definition: URI.hpp:62
RAII wrapper for the URI query list used by uriparser.
Definition: URIQueryList.hpp:56