crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Json.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022–2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Json.hpp
24  *
25  * Namespace for global JSON helper functions.
26  *
27  * Created on: Jan 9, 2019
28  * Author: ans
29  */
30 
31 #ifndef HELPER_JSON_HPP_
32 #define HELPER_JSON_HPP_
33 
34 #include "Strings.hpp"
35 
36 #include "../Main/Exception.hpp"
37 #include "../Struct/TextMap.hpp"
38 
39 #include "../_extern/jsoncons/include/jsoncons/json.hpp"
40 
41 #include <cstddef> // std::size_t
42 
43 namespace rapidjson { typedef ::std::size_t SizeType; }
44 
45 #include "../_extern/rapidjson/include/rapidjson/document.h"
46 #include "../_extern/rapidjson/include/rapidjson/error/en.h"
47 #include "../_extern/rapidjson/include/rapidjson/error/error.h"
48 #include "../_extern/rapidjson/include/rapidjson/stringbuffer.h"
49 #include "../_extern/rapidjson/include/rapidjson/writer.h"
50 
51 #include <cctype> // std::iscntrl, std::isxdigit, std::tolower
52 #include <string> // std::string
53 #include <utility> // std::pair
54 #include <vector> // std::vector
55 
58 
59  /*
60  * CONSTANTS
61  */
62 
65 
/// The length of an escaped Unicode character in JSON code (including the leading backslash and 'u').
inline constexpr int unicodeEscapeLength{6};

/// The offset of the first Unicode character digit in JSON code (counted from the backslash).
inline constexpr int unicodeEscapeDigit1{2};

/// The offset of the second Unicode character digit in JSON code (counted from the backslash).
inline constexpr int unicodeEscapeDigit2{3};

/// The offset of the third Unicode character digit in JSON code (counted from the backslash).
inline constexpr int unicodeEscapeDigit3{4};

/// The offset of the fourth Unicode character digit in JSON code (counted from the backslash).
inline constexpr int unicodeEscapeDigit4{5};

/// The number of characters to show before and after a JSON error.
inline constexpr int numDebugChars{25};
83 
85 
86  /*
87  * DECLARATION
88  */
89 
92 
// Forward declarations of the stringify() overloads. Each one turns its argument
// into a string of JSON code; the result is marked [[nodiscard]] because
// discarding it would make the call pointless.
// Overloads, in order: vector of strings, single string (std::string and C string),
// vector of vectors of string pairs, text map, RapidJSON value, jsoncons value.
93  [[nodiscard]] std::string stringify(const std::vector<std::string>& vectorToStringify);
94  [[nodiscard]] std::string stringify(const std::string& stringToStringify);
95  [[nodiscard]] std::string stringify(const char * stringToStringify);
96  [[nodiscard]] std::string stringify(
97  const std::vector<std::vector<std::pair<std::string, std::string>>>& vectorToStringify
98  );
99  [[nodiscard]] std::string stringify(const Struct::TextMap& textMapToStringify);
100  [[nodiscard]] std::string stringify(const rapidjson::Value& value);
101  [[nodiscard]] std::string stringify(const jsoncons::json& json);
102 
106 
// Forward declarations of the parsing helpers. All of them take the JSON code
// as a non-owning std::string_view and return the parsed result by value:
// a cleaned copy of the code, a RapidJSON document, a jsoncons value,
// a text map, or a vector of [pos;length] pairs.
107  [[nodiscard]] std::string cleanCopy(std::string_view json);
108  [[nodiscard]] rapidjson::Document parseRapid(std::string_view json);
109  [[nodiscard]] jsoncons::json parseCons(std::string_view json);
110  [[nodiscard]] Struct::TextMap parseTextMapJson(std::string_view json);
111  [[nodiscard]] std::vector<std::pair<std::size_t, std::size_t>> parsePosLenPairsJson(
112  std::string_view json
113  );
114 
118 
// Forward declaration of the helper that replaces the content of a RapidJSON
// document by swapping it with an empty object value.
// NOTE(review): this is the only function declared `static` here, while its
// definition further down is marked `inline` like all the others - confirm
// whether internal linkage is intended.
119  static void free(rapidjson::Document& target);
120 
122 
123  /*
124  * CLASS FOR JSON EXCEPTIONS
125  */
126 
128 
137 
138  /*
139  * IMPLEMENTATION
140  */
141 
143 
158  inline std::string stringify(const std::vector<std::string>& vectorToStringify) {
159  // create document as array and get reference to allocator
160  rapidjson::Document document;
161 
162  document.SetArray();
163 
164  rapidjson::Document::AllocatorType& allocator{document.GetAllocator()};
165 
166  // reserve memory for all array elements
167  document.Reserve(vectorToStringify.size(), allocator);
168 
169  // write vector elements as string values to array
170  for(const auto& element : vectorToStringify) {
171  rapidjson::Value stringValue;
172 
173  stringValue.SetString(element, allocator);
174 
175  document.PushBack(stringValue, allocator);
176  }
177 
178  // create string buffer and writer
179  rapidjson::StringBuffer buffer;
180  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
181 
182  // write array to string buffer
183  document.Accept(writer);
184 
185  // return string
186  return std::string(buffer.GetString(), buffer.GetSize());
187  }
188 
190 
210  inline std::string stringify(const std::string& stringToStringify) {
211  // create document as array and get reference to allocator
212  rapidjson::Document document;
213 
214  document.SetArray();
215 
216  rapidjson::Document::AllocatorType& allocator{document.GetAllocator()};
217 
218  // write string as string element to array
219  rapidjson::Value stringValue;
220 
221  stringValue.SetString(stringToStringify, allocator);
222 
223  document.PushBack(stringValue, allocator);
224 
225  // create string buffer and writer
226  rapidjson::StringBuffer buffer;
227  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
228 
229  // write array to string buffer
230  document.Accept(writer);
231 
232  // return string
233  return std::string(buffer.GetString(), buffer.GetSize());
234  }
235 
237 
257  inline std::string stringify(const char * stringToStringify) {
258  return stringify(std::string(stringToStringify));
259  }
260 
262 
279  inline std::string stringify(const std::vector<std::vector<std::pair<std::string, std::string>>>& vectorToStringify) {
280  // create document as array and get reference to allocator
281  rapidjson::Document document;
282 
283  document.SetArray();
284 
285  rapidjson::Document::AllocatorType& allocator{document.GetAllocator()};
286 
287  // reserve memory for all array elements
288  document.Reserve(vectorToStringify.size(), allocator);
289 
290  // go through the vector elements representing the objects in the array
291  for(const auto& element : vectorToStringify) {
292  // create object and reserve memory for all [key, value] pairs
293  rapidjson::Value objectValue;
294 
295  objectValue.SetObject();
296 
297  // go through the sub-vector elements representing the [key, value] pairs in the object
298  for(const auto& pair : element) {
299  // create key
300  rapidjson::Value key;
301 
302  key.SetString(pair.first, allocator);
303 
304  // create value
305  rapidjson::Value value;
306 
307  value.SetString(pair.second, allocator);
308 
309  // add [key, value] pair to object
310  objectValue.AddMember(key, value, allocator);
311  }
312 
313  // add object to array
314  document.PushBack(objectValue, allocator);
315  }
316 
317  // create string buffer and writer
318  rapidjson::StringBuffer buffer;
319  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
320 
321  // write array to string buffer
322  document.Accept(writer);
323 
324  // return string
325  return std::string(buffer.GetString(), buffer.GetSize());
326  }
327 
329 
347  inline std::string stringify(const Struct::TextMap& textMapToStringify) {
348  // create document as array and get refference to allocator
349  rapidjson::Document document;
350 
351  document.SetArray();
352 
353  rapidjson::Document::AllocatorType& allocator{document.GetAllocator()};
354 
355  // reserve memory for all array elements
356  document.Reserve(textMapToStringify.size(), allocator);
357 
358  // go through the vector elements representing the objects in the array
359  for(const auto& textMapEntry : textMapToStringify) {
360  // create object and reserve memory for all [key, value] pairs
361  rapidjson::Value objectValue;
362 
363  objectValue.SetObject();
364 
365  // create and add [key, value] pair for position
366  rapidjson::Value keyPos;
367 
368  keyPos.SetString("p", 1, allocator);
369 
370  rapidjson::Value valuePos;
371 
372  valuePos.SetUint64(Struct::TextMapEntry::pos(textMapEntry));
373 
374  objectValue.AddMember(keyPos, valuePos, allocator);
375 
376  // create and add [key, value] pair for length
377  rapidjson::Value keyLength;
378 
379  keyLength.SetString("l", 1, allocator);
380 
381  rapidjson::Value valueLength;
382 
383  valueLength.SetUint64(Struct::TextMapEntry::length(textMapEntry));
384 
385  objectValue.AddMember(keyLength, valueLength, allocator);
386 
387  // create and add [key, value] pair for describing string
388  rapidjson::Value keyValue;
389 
390  keyValue.SetString("v", 1, allocator);
391 
392  rapidjson::Value valueValue;
393 
394  valueValue.SetString(textMapEntry.value, allocator);
395 
396  objectValue.AddMember(keyValue, valueValue, allocator);
397 
398  // add object to array
399  document.PushBack(objectValue, allocator);
400  }
401 
402  // create string buffer and writer
403  rapidjson::StringBuffer buffer;
404  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
405 
406  // write array to string buffer
407  document.Accept(writer);
408 
409  // return string
410  return std::string(buffer.GetString(), buffer.GetSize());
411  }
412 
414 
427  inline std::string stringify(const rapidjson::Value& value) {
428  rapidjson::StringBuffer buffer;
429  rapidjson::Writer<rapidjson::StringBuffer> writer(buffer);
430 
431  value.Accept(writer);
432 
433  return std::string(buffer.GetString(), buffer.GetSize());
434  }
435 
437 
449  inline std::string stringify(const jsoncons::json& json) {
450  std::string result;
451 
452  json.dump(result);
453 
454  return result;
455  }
456 
458 
476  inline std::string cleanCopy(std::string_view json) {
477  if(json.empty()) {
478  return std::string();
479  }
480 
481  std::string result;
482 
483  for(std::size_t n{}; n < json.length(); ++n) {
484  // ignore control characters
485  if(std::iscntrl(json[n]) != 0) {
486  continue;
487  }
488 
489  // check escape sequences
490  if(json[n] == '\\') {
491  bool validEscapeSequence{false};
492 
493  if(n < json.length() - 1) {
494  switch(std::tolower(json[n + 1])) {
495  // check for escaped backslash
496  case '\\':
497  // do not check the following (escaped) backslash...
498  ++n;
499 
500  // ...but add the ignored backslash to the result
501  result.push_back('\\');
502 
503  validEscapeSequence = true;
504 
505  break;
506 
507  // check for single-digit escape sequence names
508  case 'b':
509  case 'f':
510  case 'n':
511  case 'r':
512  case 't':
513  case '\"':
514  case '\'': /* non-standard lenience */
515  case '/':
516  validEscapeSequence = true;
517 
518  break;
519 
520  // check for escaped Unicode character sequence
521  case 'u':
522  if(n < json.length() - unicodeEscapeLength) {
523  validEscapeSequence =
524  ::isxdigit(json[n + unicodeEscapeDigit1]) != 0
525  && ::isxdigit(json[n + unicodeEscapeDigit2]) != 0
526  && ::isxdigit(json[n + unicodeEscapeDigit3]) != 0
527  && ::isxdigit(json[n + unicodeEscapeDigit4]) != 0;
528  }
529 
530  break;
531 
532  default:
533  break;
534  }
535  }
536 
537  if(!validEscapeSequence) {
538  // simply escape the backslash of an invalid escape sequence
539  result.push_back('\\');
540  }
541 
542  result.push_back('\\');
543  }
544  else {
545  result.push_back(json[n]);
546  }
547  }
548 
549  return result;
550  }
551 
553 
575  inline rapidjson::Document parseRapid(std::string_view json) {
576  // clean input
577  std::string cleanJson(cleanCopy(json));
578 
579  rapidjson::Document doc;
580 
581  doc.Parse(cleanJson);
582 
583  if(doc.HasParseError() && cleanJson.find('\\') != std::string::npos) {
584  // try again with escaped backslashes
585  Strings::replaceAll(cleanJson, "\\", "\\\\");
586 
587  doc.Parse(cleanJson);
588  }
589 
590  if(doc.HasParseError()) {
591  std::string exceptionStr{
592  "Json::parseRapid(): "
593  };
594 
595  exceptionStr += rapidjson::GetParseError_En(doc.GetParseError());
596  exceptionStr += " at '";
597 
598  if(doc.GetErrorOffset() > numDebugChars) {
599  exceptionStr += cleanJson.substr(doc.GetErrorOffset() - numDebugChars, numDebugChars);
600  }
601  else if(doc.GetErrorOffset() > 0) {
602  exceptionStr += cleanJson.substr(0, doc.GetErrorOffset());
603  }
604 
605  exceptionStr += "[!]";
606 
607  if(cleanJson.size() > doc.GetErrorOffset() + numDebugChars) {
608  exceptionStr += cleanJson.substr(doc.GetErrorOffset(), numDebugChars);
609  }
610  else if(cleanJson.size() > doc.GetErrorOffset()) {
611  exceptionStr += cleanJson.substr(doc.GetErrorOffset());
612  }
613 
614  exceptionStr += "'";
615 
616  throw Exception(exceptionStr);
617  }
618 
619  return doc;
620  }
621 
623 
645  inline jsoncons::json parseCons(std::string_view json) {
646  // clean input
647  std::string cleanJson(cleanCopy(json));
648 
649  try {
650  jsoncons::json result{jsoncons::json::parse(cleanJson)};
651 
652  return result;
653  }
654  catch(const jsoncons::json_exception&) {
655  try {
656  if(cleanJson.find('\\') == std::string::npos) {
657  throw;
658  }
659 
660  // try again with replaced backslashes
661  Strings::replaceAll(cleanJson, "\\", "\\\\");
662 
663  jsoncons::json result{jsoncons::json::parse(cleanJson)};
664 
665  return result;
666  }
667  catch(const jsoncons::json_exception& e2) {
668  throw Exception(
669  "Json::parseCons(): "
670  + std::string(e2.what())
671  );
672  }
673  }
674  }
675 
677 
700  inline Struct::TextMap parseTextMapJson(std::string_view json) {
701  if(json.empty()) {
702  return {};
703  }
704 
705  // parse JSON
706  rapidjson::Document document{parseRapid(json)};
707 
708  if(!document.IsArray()) {
709  throw Exception(
710  "Json::parseTextMapJson():"
711  " Invalid text map"
712  " (is not an array)"
713  );
714  }
715 
716  Struct::TextMap result;
717 
718  for(const auto& element : document.GetArray()) {
719  if(!element.IsObject()) {
720  throw Exception(
721  "Json::parseTextMapJson():"
722  " Invalid text map"
723  " (an array element is not an object"
724  );
725  }
726 
727  const auto p{element.FindMember("p")};
728  const auto l{element.FindMember("l")};
729  const auto v{element.FindMember("v")};
730 
731  if(p == element.MemberEnd() || !(p->value.IsUint64())) {
732  throw Exception(
733  "Json::parseTextMapJson():"
734  " Invalid text map"
735  " (could not find valid position)"
736  );
737  }
738 
739  if(l == element.MemberEnd() || !(l->value.IsUint64())) {
740  throw Exception(
741  "Json::parseTextMapJson():"
742  " Invalid text map"
743  " (could not find valid length)"
744  );
745  }
746 
747  if(v == element.MemberEnd() || !(v->value.IsString())) {
748  throw Exception(
749  "Json::parseTextMapJson():"
750  " Invalid text map"
751  " (could not find valid value)"
752  );
753  }
754 
755  result.emplace_back(
756  p->value.GetUint64(),
757  l->value.GetUint64(),
758  std::string(
759  v->value.GetString(),
760  v->value.GetStringLength()
761  )
762  );
763  }
764 
765  return result;
766  }
767 
769 
793  inline std::vector<std::pair<std::size_t, std::size_t>> parsePosLenPairsJson(
794  std::string_view json
795  ) {
796  if(json.empty()) {
797  return {};
798  }
799 
800  // parse JSON
801  rapidjson::Document document{parseRapid(json)};
802 
803  if(!document.IsArray()) {
804  throw Exception(
805  "Json::parsePosLenPairsJson():"
806  " Invalid array of [pos;length] pairs"
807  " (is not an array)"
808  );
809  }
810 
811  std::vector<std::pair<std::size_t, std::size_t>> result;
812 
813  for(const auto& element : document.GetArray()) {
814  if(!element.IsArray()) {
815  throw Exception(
816  "Json::parsePosLenPairsJson():"
817  " Invalid array of [pos;length] pairs"
818  " (an array element is not an array)"
819  );
820  }
821 
822  if(element.Size() != 2) {
823  throw Exception(
824  "Json::parsePosLenPairsJson():"
825  " Invalid array of [pos;length] pairs"
826  " (a pair is not of size 2)"
827  );
828  }
829 
830  const auto a{element.GetArray()};
831 
832  if(!(a[0].IsUint64())) {
833  throw Exception(
834  "Json::parsePosLenPairsJson():"
835  " Invalid array of [pos;length] pairs"
836  " (could not find valid position)"
837  );
838  }
839 
840  if(!(a[1].IsUint64())) {
841  throw Exception(
842  "Json::parsePosLenPairsJson():"
843  " Invalid array of [pos;length] pairs"
844  " (could not find valid length)"
845  );
846  }
847 
848  result.emplace_back(
849  a[0].GetUint64(),
850  a[1].GetUint64()
851  );
852  }
853 
854  return result;
855  }
856 
858 
862  inline void free(rapidjson::Document& target) {
863  rapidjson::Value(rapidjson::kObjectType).Swap(target);
864  }
865 
866 } /* namespace crawlservpp::Helper::Json */
867 
868 #endif /* HELPER_JSON_HPP_ */
constexpr auto unicodeEscapeDigit4
The offset of the fourth Unicode character digit in JSON code (from the '\').
Definition: Json.hpp:79
constexpr auto unicodeEscapeDigit1
The offset of the first Unicode character digit in JSON code (from the '\').
Definition: Json.hpp:70
constexpr auto unicodeEscapeDigit3
The offset of the third Unicode character digit in JSON code (from the '\').
Definition: Json.hpp:76
std::vector< std::pair< std::size_t, std::size_t > > parsePosLenPairsJson(std::string_view json)
Parses JSON code using RapidJSON and converts it into [pos;length] pairs.
Definition: Json.hpp:793
Namespace for global JSON helper functions.
Definition: Json.hpp:57
constexpr auto numDebugChars
The number of characters to show before and after a JSON error.
Definition: Json.hpp:82
rapidjson::Document parseRapid(std::string_view json)
Parses JSON code using RapidJSON.
Definition: Json.hpp:575
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
Class for JSON exceptions.
Definition: Json.hpp:136
constexpr auto unicodeEscapeLength
The length of an escaped Unicode character in JSON code (including the '\u').
Definition: Json.hpp:67
Struct::TextMap parseTextMapJson(std::string_view json)
Parses JSON code using RapidJSON and converts it into a text map.
Definition: Json.hpp:700
void replaceAll(std::string &strInOut, std::string_view needle, std::string_view replacement)
Replaces all occurrences within a string with another string.
Definition: Strings.hpp:246
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
std::string cleanCopy(std::string_view json)
Copies and cleans the given JSON code to prepare it for parsing.
Definition: Json.hpp:476
Definition: Json.hpp:43
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
std::string stringify(const jsoncons::json &json)
Stringifies a JSON value using jsoncons.
Definition: Json.hpp:449
::std::size_t SizeType
Definition: Json.hpp:43
jsoncons::json parseCons(std::string_view json)
Parses JSON code using jsoncons.
Definition: Json.hpp:645
static void free(rapidjson::Document &target)
Frees memory by swapping.
Definition: Json.hpp:862
constexpr auto unicodeEscapeDigit2
The offset of the second Unicode character digit in JSON code (from the '\').
Definition: Json.hpp:73