crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Strings.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Strings.hpp
24  *
25  * Namespace for global string helper functions.
26  *
27  * Created on: Dec 10, 2018
28  * Author: ans
29  */
30 
31 #ifndef HELPER_STRINGS_HPP_
32 #define HELPER_STRINGS_HPP_
33 
34 #include <boost/algorithm/string.hpp>
35 
36 #include <algorithm> // std::equal, std::find_if, std::mismatch, std::remove_if,
37  // std::sort, std::transform, std::unique
38 #include <array> // std::array
39 #include <cctype> // std::isspace, std::tolower
40 #include <cstddef> // std::size_t
41 #include <ios> // std::boolalpha
42 #include <queue> // std::queue
43 #include <random> // std::default_random_engine, std::random_device, std::uniform_int_distribution
44 #include <sstream> // std::istringstream
45 #include <string> // std::string
46 #include <string_view> // std::string_view, std::string_view_literals
47 #include <utility> // std::pair
48 #include <vector> // std::vector
49 
52 
53  /*
54  * CONSTANTS
55  */
56 
57  using std::string_view_literals::operator""sv;
58 
61 
// Unicode whitespace characters, each written as a \u escape inside a narrow string literal.
// utfTidy() replaces every occurrence of each of these sequences with a single ASCII space.
// NOTE(review): U+2060 (word joiner) is a zero-width character; replacing it with a space
//  introduces visible whitespace — confirm that this is intended.
63  inline constexpr std::array utfWhitespaces {
64  "\u0085"sv, // next line (NEL)
65  "\u00a0"sv, // no-break space
66  "\u1680"sv, // Ogham space mark
67  "\u2000"sv, // en quad
68  "\u2001"sv, // em quad
69  "\u2002"sv, // en space
70  "\u2003"sv, // em space
71  "\u2004"sv, // three-per-em space
72  "\u2005"sv, // four-per-em space
73  "\u2006"sv, // six-per-em space
74  "\u2007"sv, // figure space
75  "\u2008"sv, // punctuation space
76  "\u2009"sv, // thin space
77  "\u200a"sv, // hair space
78  "\u2028"sv, // line separator
79  "\u2029"sv, // paragraph separator
80  "\u202f"sv, // narrow no-break space
81  "\u205f"sv, // medium mathematical space
82  "\u2060"sv, // word joiner
83  "\u3000"sv, // ideographic space
84  };
85 
// Length of a percent-encoded byte: the '%' sign plus two hexadecimal digits.
// encodePercentage() uses it for its bounds check and to skip past an inserted "%25".
87  inline constexpr auto checkHexLength{3};
88 
/**
 * Characters that generateRandom() chooses from.
 *
 * Every alphanumeric ASCII character is listed exactly once, so that a
 * uniformly distributed index yields uniformly distributed characters.
 * (The digit '0' used to be listed twice, which made it twice as likely
 * as any other character.)
 */
inline constexpr std::string_view randCharSet{
    "0123456789abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
};
93 
95 
96  /*
97  * DECLARATION
98  */
99 
102 
// Replaces every occurrence of needle inside strInOut with replacement (in place).
103  void replaceAll(
104  std::string& strInOut,
105  std::string_view needle,
106  std::string_view replacement
107  );
108 
112 
// Parses a textual boolean after converting the input to lower case.
113  bool stringToBool(std::string inputString);
114 
118 
// Character-class checks: decimal digits with at most one dot / hexadecimal digits only.
119  bool isDec(std::string_view inputString);
120  bool isHex(std::string_view inputString);
121 
125 
// Removes leading and trailing whitespace (as classified by std::isspace) in place.
126  void trim(std::string& stringToTrim);
127 
131 
// Joining: the overloads returning std::string build a new string,
//  the void overloads append to appendTo instead.
// The std::queue overloads pop every element, i.e. leave the queue empty.
132  std::string join(
133  const std::vector<std::string>& strings,
134  char delimiter,
135  bool ignoreEmpty
136  );
137  std::string join(
138  const std::vector<std::string>& strings,
139  std::string_view delimiter,
140  bool ignoreEmpty
141  );
142  std::string join(
143  std::queue<std::string>& strings,
144  char delimiter,
145  bool ignoreEmpty
146  );
147  std::string join(
148  std::queue<std::string>& strings,
149  std::string_view delimiter,
150  bool ignoreEmpty
151  );
152  void join(
153  const std::vector<std::string>& strings,
154  char delimiter,
155  bool ignoreEmpty,
156  std::string& appendTo
157  );
158  void join(
159  const std::vector<std::string>& strings,
160  std::string_view delimiter,
161  bool ignoreEmpty,
162  std::string& appendTo
163  );
164  void join(
165  std::queue<std::string>& strings,
166  char delimiter,
167  bool ignoreEmpty,
168  std::string& appendTo
169  );
170  void join(
171  std::queue<std::string>& strings,
172  std::string_view delimiter,
173  bool ignoreEmpty,
174  std::string& appendTo
175  );
176 
180 
// Splitting into a vector, by a single character or by a delimiter string.
181  std::vector<std::string> split(const std::string& str, char delimiter);
182  std::vector<std::string> split(std::string_view str, std::string_view delimiter);
183 
// Splitting into a queue; removeEmpty drops empty tokens.
184  std::queue<std::string> splitToQueue(
185  std::string_view str,
186  char delimiter,
187  bool removeEmpty
188  );
189  std::queue<std::string> splitToQueue(
190  std::string_view str,
191  std::string_view delimiter,
192  bool removeEmpty
193  );
194 
198 
// Sorts the vector and erases adjacent duplicates, optionally ignoring case.
199  void sortAndRemoveDuplicates(std::vector<std::string>& vectorOfStrings, bool caseSensitive);
200 
204 
// Returns the first character, or the character denoted by a leading \n or \t escape; 0 for empty input.
205  char getFirstOrEscapeChar(std::string_view from);
206 
210 
// Inserts "25" after every '%' that is not followed by two hexadecimal digits.
211  void encodePercentage(std::string& stringToEncode);
212 
216 
// Normalizes whitespace (including the sequences in utfWhitespaces) and trims the string.
217  void utfTidy(std::string& stringToTidy);
218 
222 
// Name checks: checkDomainName rejects '/' and '\''; checkSQLName accepts only [A-Za-z0-9$_].
223  bool checkDomainName(std::string_view name);
224  bool checkSQLName(std::string_view name);
225 
229 
// Builds a string of the given length from characters drawn out of randCharSet.
230  std::string generateRandom(std::size_t length);
231 
233 
234  /*
235  * IMPLEMENTATION
236  */
237 
239 
/**
 * Replaces every occurrence of a substring inside a string, in place.
 *
 * The search continues directly behind each inserted replacement, so
 * text produced by a replacement is never scanned again.
 *
 * @param strInOut    String that is searched and modified.
 * @param needle      Substring to look for; nothing happens if it is empty.
 * @param replacement Text written in place of each occurrence.
 */
inline void replaceAll(
        std::string& strInOut,
        std::string_view needle,
        std::string_view replacement
) {
    if(needle.empty()) {
        return;
    }

    for(
            auto hit{strInOut.find(needle)};
            hit != std::string::npos;
            hit = strInOut.find(needle, hit + replacement.length())
    ) {
        strInOut.replace(hit, needle.length(), replacement);
    }
}
263 
265 
/**
 * Converts a string into a boolean value.
 *
 * The input is lower-cased and then parsed with std::boolalpha, i.e. only
 * the textual representations of the stream's locale ("true"/"false" in
 * the default locale) are recognised; leading whitespace is skipped by
 * the stream extraction.
 *
 * @param inputString String to parse (taken by value, modified locally).
 * @return The parsed value, or false if the string could not be parsed.
 */
inline bool stringToBool(std::string inputString) {
    std::transform(
            inputString.begin(),
            inputString.end(),
            inputString.begin(),
            // std::tolower requires a value representable as unsigned char;
            //  passing a negative plain char (non-ASCII byte) is undefined
            [](unsigned char c) {
                return static_cast<char>(std::tolower(c));
            }
    );

    std::istringstream strStr(inputString);

    bool result{false};

    strStr >> std::boolalpha >> result;

    return result;
}
295 
297 
/**
 * Checks whether a string consists only of decimal digits and at most
 * one dot ('.').
 *
 * @param inputString String to check.
 * @return True if no other character and no second dot was found. An
 *   empty string and a lone dot are therefore accepted as well.
 */
inline bool isDec(std::string_view inputString) {
    bool dotSeen{false};

    for(const char current : inputString) {
        if(current >= '0' && current <= '9') {
            continue;
        }

        // anything but a first dot invalidates the string
        if(current != '.' || dotSeen) {
            return false;
        }

        dotSeen = true;
    }

    return true;
}
336 
338 
/**
 * Checks whether a string consists only of hexadecimal digits
 * (0-9, a-f, A-F).
 *
 * @param inputString String to check.
 * @return True if every character is a hexadecimal digit; an empty
 *   string is accepted.
 */
inline bool isHex(std::string_view inputString) {
    constexpr std::string_view hexDigits{"0123456789AaBbCcDdEeFf"};

    return std::all_of(
            inputString.cbegin(),
            inputString.cend(),
            [hexDigits](char current) {
                return hexDigits.find(current) != std::string_view::npos;
            }
    );
}
351 
353 
/**
 * Removes leading and trailing whitespace from a string, in place.
 *
 * Whitespace is whatever std::isspace reports for the current C locale.
 *
 * @param stringToTrim String to be trimmed.
 */
inline void trim(std::string& stringToTrim) {
    // std::isspace requires a value representable as unsigned char; a plain
    //  char converted to int is negative for non-ASCII (e.g. UTF-8) bytes,
    //  which is undefined behaviour
    const auto isNoSpace = [](unsigned char ch) {
        return std::isspace(ch) == 0;
    };

    // cut everything in front of the first non-whitespace character
    stringToTrim.erase(
            stringToTrim.begin(),
            std::find_if(stringToTrim.begin(), stringToTrim.end(), isNoSpace)
    );

    // cut everything behind the last non-whitespace character
    stringToTrim.erase(
            std::find_if(stringToTrim.rbegin(), stringToTrim.rend(), isNoSpace).base(),
            stringToTrim.end()
    );
}
381 
383 
/**
 * Concatenates all elements of a vector into a single string, separated
 * by a delimiter character.
 *
 * @param strings     Elements to concatenate.
 * @param delimiter   Character written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @return The resulting string.
 */
inline std::string join(
        const std::vector<std::string>& strings,
        char delimiter,
        bool ignoreEmpty
) {
    const auto isUsed = [ignoreEmpty](const std::string& element) {
        return !ignoreEmpty || !element.empty();
    };

    // determine the memory needed (each element plus one delimiter)
    std::size_t needed{};

    for(const auto& element : strings) {
        if(isUsed(element)) {
            needed += element.size() + 1;
        }
    }

    std::string joined;

    joined.reserve(needed);

    // write each element followed by the delimiter
    for(const auto& element : strings) {
        if(!isUsed(element)) {
            continue;
        }

        joined.append(element);
        joined.push_back(delimiter);
    }

    // drop the delimiter behind the last element
    if(!joined.empty()) {
        joined.pop_back();
    }

    return joined;
}
428 
430 
/**
 * Concatenates all elements of a vector into a single string, separated
 * by a delimiter string.
 *
 * The delimiter is written between elements only, so that delimiters of
 * any length — including the empty string — leave no remainder at the
 * end of the result and never cost a character of the last element.
 *
 * @param strings     Elements to concatenate.
 * @param delimiter   String written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @return The resulting string.
 */
inline std::string join(
        const std::vector<std::string>& strings,
        std::string_view delimiter,
        bool ignoreEmpty
) {
    // calculate and reserve needed memory (upper bound)
    std::string result;
    std::size_t size{};

    for(const auto& string : strings) {
        if(!ignoreEmpty || !string.empty()) {
            size += string.size() + delimiter.size();
        }
    }

    result.reserve(size);

    // create string
    bool first{true};

    for(const auto& string : strings) {
        if(ignoreEmpty && string.empty()) {
            continue;
        }

        if(!first) {
            result += delimiter;
        }

        result += string;

        first = false;
    }

    return result;
}
475 
477 
/**
 * Concatenates all elements of a queue into a single string, separated
 * by a delimiter character.
 *
 * Every element is popped while joining, i.e. the queue is empty afterwards.
 *
 * @param strings     Queue of elements to concatenate; it will be emptied.
 * @param delimiter   Character written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @return The resulting string.
 */
inline std::string join(
        std::queue<std::string>& strings,
        char delimiter,
        bool ignoreEmpty
) {
    std::string joined;

    for(; !strings.empty(); strings.pop()) {
        const auto& current{strings.front()};

        if(ignoreEmpty && current.empty()) {
            continue;
        }

        joined.append(current);
        joined.push_back(delimiter);
    }

    // drop the delimiter behind the last element
    if(!joined.empty()) {
        joined.pop_back();
    }

    return joined;
}
516 
518 
/**
 * Concatenates all elements of a queue into a single string, separated
 * by a delimiter string.
 *
 * Every element is popped while joining, i.e. the queue is empty
 * afterwards. The delimiter is written between elements only, so that
 * delimiters of any length — including the empty string — leave no
 * remainder at the end of the result.
 *
 * @param strings     Queue of elements to concatenate; it will be emptied.
 * @param delimiter   String written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @return The resulting string.
 */
inline std::string join(
        std::queue<std::string>& strings,
        std::string_view delimiter,
        bool ignoreEmpty
) {
    std::string result;
    bool first{true};

    for(; !strings.empty(); strings.pop()) {
        const auto& current{strings.front()};

        if(ignoreEmpty && current.empty()) {
            continue;
        }

        if(!first) {
            result += delimiter;
        }

        result += current;

        first = false;
    }

    return result;
}
558 
560 
/**
 * Appends all elements of a vector, separated by a delimiter character,
 * to an existing string.
 *
 * @param strings     Elements to concatenate.
 * @param delimiter   Character written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @param appendTo    String to which the joined elements are appended.
 */
inline void join(
        const std::vector<std::string>& strings,
        char delimiter,
        bool ignoreEmpty,
        std::string& appendTo
) {
    const auto isUsed = [ignoreEmpty](const std::string& element) {
        return !ignoreEmpty || !element.empty();
    };

    // remember where the appended part begins
    const auto sizeBefore{appendTo.size()};

    // determine the memory needed (each element plus one delimiter)
    auto needed{sizeBefore};

    for(const auto& element : strings) {
        if(isUsed(element)) {
            needed += element.size() + 1;
        }
    }

    appendTo.reserve(needed);

    // write each element followed by the delimiter
    for(const auto& element : strings) {
        if(!isUsed(element)) {
            continue;
        }

        appendTo.append(element);
        appendTo.push_back(delimiter);
    }

    // drop the delimiter behind the last element, if anything was added
    if(appendTo.size() > sizeBefore) {
        appendTo.pop_back();
    }
}
603 
605 
/**
 * Appends all elements of a vector, separated by a delimiter string,
 * to an existing string.
 *
 * The delimiter is written between elements only, so that delimiters of
 * any length — including the empty string — leave no remainder at the
 * end of the result and never cost a character of the last element.
 *
 * @param strings     Elements to concatenate.
 * @param delimiter   String written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @param appendTo    String to which the joined elements are appended.
 */
inline void join(
        const std::vector<std::string>& strings,
        std::string_view delimiter,
        bool ignoreEmpty,
        std::string& appendTo
) {
    // calculate and reserve needed memory (upper bound)
    auto size{appendTo.size()};

    for(const auto& string : strings) {
        if(!ignoreEmpty || !string.empty()) {
            size += string.size() + delimiter.size();
        }
    }

    appendTo.reserve(size);

    // append string
    bool first{true};

    for(const auto& string : strings) {
        if(ignoreEmpty && string.empty()) {
            continue;
        }

        if(!first) {
            appendTo += delimiter;
        }

        appendTo += string;

        first = false;
    }
}
649 
651 
/**
 * Appends all elements of a queue, separated by a delimiter character,
 * to an existing string.
 *
 * Every element is popped while joining, i.e. the queue is empty afterwards.
 *
 * @param strings     Queue of elements to concatenate; it will be emptied.
 * @param delimiter   Character written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @param appendTo    String to which the joined elements are appended.
 */
inline void join(
        std::queue<std::string>& strings,
        char delimiter,
        bool ignoreEmpty,
        std::string& appendTo
) {
    // remember where the appended part begins
    const auto sizeBefore{appendTo.size()};

    for(; !strings.empty(); strings.pop()) {
        const auto& current{strings.front()};

        if(ignoreEmpty && current.empty()) {
            continue;
        }

        appendTo.append(current);
        appendTo.push_back(delimiter);
    }

    // drop the delimiter behind the last element, if anything was added
    if(appendTo.size() > sizeBefore) {
        appendTo.pop_back();
    }
}
688 
690 
/**
 * Appends all elements of a queue, separated by a delimiter string,
 * to an existing string.
 *
 * Every element is popped while joining, i.e. the queue is empty
 * afterwards. The delimiter is written between elements only, so that
 * delimiters of any length — including the empty string — leave no
 * remainder at the end of the result.
 *
 * @param strings     Queue of elements to concatenate; it will be emptied.
 * @param delimiter   String written between the elements.
 * @param ignoreEmpty If true, empty elements are skipped.
 * @param appendTo    String to which the joined elements are appended.
 */
inline void join(
        std::queue<std::string>& strings,
        std::string_view delimiter,
        bool ignoreEmpty,
        std::string& appendTo
) {
    bool first{true};

    for(; !strings.empty(); strings.pop()) {
        const auto& current{strings.front()};

        if(ignoreEmpty && current.empty()) {
            continue;
        }

        if(!first) {
            appendTo += delimiter;
        }

        appendTo += current;

        first = false;
    }
}
728 
730 
739  inline std::vector<std::string> split(const std::string& str, char delimiter) {
740  std::vector<std::string> result;
741 
742  boost::split(result, str, [&delimiter](char c) {
743  return c == delimiter;
744  });
745 
746  return result;
747  }
748 
750 
/**
 * Splits a string into a vector of strings at every occurrence of the
 * given delimiter string.
 *
 * Empty tokens in front of and between delimiters are kept, while an
 * empty token behind a trailing delimiter is not added.
 *
 * @param str       String to split.
 * @param delimiter String at which the input is split. If it is empty,
 *   the input cannot be split and is returned as one single token
 *   (an empty delimiter used to cause an endless loop).
 * @return The resulting tokens; empty if the input is empty.
 */
inline std::vector<std::string> split(std::string_view str, std::string_view delimiter) {
    std::vector<std::string> result;

    if(str.empty()) {
        return result;
    }

    if(delimiter.empty()) {
        result.emplace_back(str);

        return result;
    }

    // scan the view by position instead of repeatedly copying the remainder
    std::size_t start{};

    while(start < str.size()) {
        const auto end{str.find(delimiter, start)};

        if(end == std::string_view::npos) {
            result.emplace_back(str.substr(start));

            break;
        }

        result.emplace_back(str.substr(start, end - start));

        start = end + delimiter.size();
    }

    return result;
}
780 
782 
794  inline std::queue<std::string> splitToQueue(
795  std::string_view str,
796  char delimiter,
797  bool removeEmpty
798  ) {
799  std::queue<std::string>::container_type result;
800 
801  boost::split(result, str, [&delimiter](char c) {
802  return c == delimiter;
803  });
804 
805  if(removeEmpty) {
806  result.erase(
807  std::remove_if(
808  result.begin(),
809  result.end(),
810  [](const auto& str) {
811  return str.empty();
812  }
813  ),
814  result.end()
815  );
816  }
817 
818  return std::queue<std::string>(result);
819  }
820 
822 
/**
 * Splits a string into a queue of strings at every occurrence of the
 * given delimiter string.
 *
 * An empty token behind a trailing delimiter is never added.
 *
 * @param str         String to split.
 * @param delimiter   String at which the input is split. If it is empty,
 *   the input cannot be split and is returned as one single token
 *   (an empty delimiter used to cause an endless loop).
 * @param removeEmpty If true, empty tokens in front of and between
 *   delimiters are not added to the queue.
 * @return Queue containing the resulting tokens in their original
 *   order; empty if the input is empty.
 */
inline std::queue<std::string> splitToQueue(
        std::string_view str,
        std::string_view delimiter,
        bool removeEmpty
) {
    std::queue<std::string> result;

    if(str.empty()) {
        return result;
    }

    if(delimiter.empty()) {
        result.emplace(str);

        return result;
    }

    // scan the view by position instead of repeatedly copying the remainder
    std::size_t start{};

    while(start < str.size()) {
        const auto end{str.find(delimiter, start)};

        if(end == std::string_view::npos) {
            result.emplace(str.substr(start));

            break;
        }

        if(!removeEmpty || end > start) {
            result.emplace(str.substr(start, end - start));
        }

        start = end + delimiter.size();
    }

    return result;
}
861 
863 
/**
 * Sorts the given vector of strings and removes duplicates.
 *
 * In case-insensitive mode, strings are ordered and compared by their
 * lower-cased characters (std::tolower on the byte values), and of
 * several strings differing only in case one is kept.
 *
 * @param vectorOfStrings Vector to be sorted and de-duplicated in place.
 * @param caseSensitive   If true, plain byte-wise comparison is used.
 */
inline void sortAndRemoveDuplicates(
        std::vector<std::string>& vectorOfStrings,
        bool caseSensitive
) {
    if(caseSensitive) {
        // case-sensitive sort
        std::sort(vectorOfStrings.begin(), vectorOfStrings.end());

        // case-sensitive removal of co-occurring duplicates
        vectorOfStrings.erase(
                std::unique(
                        vectorOfStrings.begin(),
                        vectorOfStrings.end()
                ),
                vectorOfStrings.end()
        );

        return;
    }

    // std::tolower requires a value representable as unsigned char;
    //  passing a negative plain char (non-ASCII byte) is undefined
    const auto lower = [](char c) {
        return std::tolower(static_cast<unsigned char>(c));
    };

    const auto sameChar = [&lower](char c1, char c2) {
        return lower(c1) == lower(c2);
    };

    // case-insensitive sort (lexicographical, shorter prefix first)
    std::sort(
            vectorOfStrings.begin(),
            vectorOfStrings.end(),
            [&lower, &sameChar](const std::string& s1, const std::string& s2) {
                const auto diff{
                    std::mismatch(
                            s1.cbegin(),
                            s1.cend(),
                            s2.cbegin(),
                            s2.cend(),
                            sameChar
                    )
                };

                return diff.second != s2.cend()
                        && (
                                diff.first == s1.cend()
                                || lower(*diff.first) < lower(*diff.second)
                        );
            }
    );

    // case-insensitive removal of co-occurring duplicates
    vectorOfStrings.erase(
            std::unique(
                    vectorOfStrings.begin(),
                    vectorOfStrings.end(),
                    [&sameChar](const std::string& s1, const std::string& s2) {
                        return s1.size() == s2.size()
                                && std::equal(s1.cbegin(), s1.cend(), s2.cbegin(), sameChar);
                    }
            ),
            vectorOfStrings.end()
    );
}
928 
930 
/**
 * Gets the first character, or the character described by an escape
 * sequence, from the beginning of the given string.
 *
 * Only the sequences \n and \t are translated. A backslash followed by
 * any other character, as well as a lone backslash, yields the
 * backslash itself.
 *
 * @param from String to read from.
 * @return The (unescaped) first character, or 0 if the string is empty.
 */
inline char getFirstOrEscapeChar(std::string_view from) {
    if(from.empty()) {
        return 0;
    }

    const char first{from.front()};

    // no escape sequence at the beginning of the string
    if(first != '\\' || from.length() < 2) {
        return first;
    }

    switch(from[1]) {
    case 'n':
        return '\n';

    case 't':
        return '\t';

    default:
        // escaped backslash or invalid escape sequence
        return '\\';
    }
}
971 
973 
977  inline void encodePercentage(std::string& stringToEncode) {
978  std::size_t pos{};
979 
980  do {
981  pos = stringToEncode.find('%', pos);
982 
983  if(pos == std::string::npos) {
984  break;
985  }
986 
987  if(
988  pos + checkHexLength > stringToEncode.length()
989  || !isHex(stringToEncode.substr(pos + 1, 2))
990  ) {
991  stringToEncode.insert(pos + 1, "25");
992 
993  pos += checkHexLength;
994  }
995  else {
996  ++pos;
997  }
998  } while(pos < stringToEncode.length());
999  }
1000 
1002 
1007  inline void utfTidy(std::string& stringToTidy) {
1008  // replace Unicode white spaces with spaces
1009  for(const auto whitespace : utfWhitespaces) {
1010  replaceAll(stringToTidy, whitespace, " ");
1011  }
1012 
1013  // remove zero-width no-break spaces
1014  replaceAll(stringToTidy, "\ufeff"sv, "");
1015 
1016  // replace special ASCII characters with spaces
1017  std::transform(
1018  stringToTidy.begin(),
1019  stringToTidy.end(),
1020  stringToTidy.begin(),
1021  [](const auto c) {
1022  switch(c) {
1023  case '\t':
1024  case '\n':
1025  case '\v':
1026  case '\f':
1027  case '\r':
1028  case '\b':
1029  case '\0':
1030  return ' ';
1031 
1032  default:
1033  return c;
1034  }
1035  }
1036  );
1037 
1038  // replace double spaces
1039  while(stringToTidy.find(" ") != std::string::npos) {
1040  replaceAll(stringToTidy, " ", " ");
1041  }
1042 
1043  // replace unnecessary spaces around punctuation
1044  replaceAll(stringToTidy, " .", ".");
1045  replaceAll(stringToTidy, " ,", ",");
1046  replaceAll(stringToTidy, " :", ":");
1047  replaceAll(stringToTidy, " ;", ";");
1048  replaceAll(stringToTidy, "( ", "(");
1049  replaceAll(stringToTidy, " )", ")");
1050 
1051  // trim result
1052  trim(stringToTidy);
1053  }
1054 
1055  // check whether the name is valid as domain (checking only for characters that interfere with internal SQL statements)
1057 
/**
 * Checks whether the given string can be used as a domain name.
 *
 * Only the characters '/' and '\'' are rejected; no further validation
 * of the name is performed.
 *
 * @param name String to check.
 * @return True if the name contains none of the rejected characters.
 */
inline bool checkDomainName(std::string_view name) {
    constexpr std::string_view forbidden{"/'"};

    return std::none_of(
            name.cbegin(),
            name.cend(),
            [forbidden](char current) {
                return forbidden.find(current) != std::string_view::npos;
            }
    );
}
1070 
1072 
/**
 * Checks whether the given string only contains characters that are
 * accepted in names of SQL tables and fields: ASCII letters, digits,
 * '$' and '_'.
 *
 * @param name String to check.
 * @return True if no other character was found; an empty string is accepted.
 */
inline bool checkSQLName(std::string_view name) {
    constexpr std::string_view allowed{
        "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789$_"
    };

    return std::all_of(
            name.cbegin(),
            name.cend(),
            [allowed](char current) {
                return allowed.find(current) != std::string_view::npos;
            }
    );
}
1084 
1086 
1092  inline std::string generateRandom(std::size_t length) {
1093  thread_local static std::default_random_engine rengine(
1094  std::random_device{}()
1095  );
1096 
1097  thread_local static std::uniform_int_distribution<std::size_t> distribution(
1098  0,
1099  randCharSet.length() - 1
1100  );
1101 
1102  std::string result;
1103 
1104  result.reserve(length);
1105 
1106  while(length-- > 0) {
1107  result += randCharSet[distribution(rengine)];
1108  }
1109 
1110  return result;
1111  }
1112 
1113 } /* namespace crawlservpp::Helper::Strings */
1114 
1115 #endif /* HELPER_STRINGS_HPP_ */
constexpr std::array utfWhitespaces
UTF-8 whitespaces used by utfTidy().
Definition: Strings.hpp:63
Namespace for global string helper functions.
Definition: Strings.hpp:51
std::string join(const std::vector< std::string > &strings, char delimiter, bool ignoreEmpty)
Concatenates all elements of a vector into a single string.
Definition: Strings.hpp:396
void encodePercentage(std::string &stringToEncode)
Encodes percentage signs that are not followed by a two-digit hexadecimal number with %25...
Definition: Strings.hpp:977
void trim(std::string &stringToTrim)
Removes whitespaces around a string.
Definition: Strings.hpp:360
std::queue< std::string > splitToQueue(std::string_view str, char delimiter, bool removeEmpty)
Splits a string into a queue of strings using the given delimiter.
Definition: Strings.hpp:794
void sortAndRemoveDuplicates(std::vector< std::string > &vectorOfStrings, bool caseSensitive)
Sorts the given vector of strings and removes duplicates.
Definition: Strings.hpp:873
void utfTidy(std::string &stringToTidy)
Removes new lines and unnecessary spaces, including UTF-8 whitespaces.
Definition: Strings.hpp:1007
void replaceAll(std::string &strInOut, std::string_view needle, std::string_view replacement)
Replaces all occurrences of a substring within a string with another string.
Definition: Strings.hpp:246
char getFirstOrEscapeChar(std::string_view from)
Gets the first character or an escaped character from the beginning of the given string.
Definition: Strings.hpp:948
bool isDec(std::string_view inputString)
Checks whether a string contains only decimal digits and max. one dot (.).
Definition: Strings.hpp:303
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
bool stringToBool(std::string inputString)
Converts a string into a boolean value.
Definition: Strings.hpp:277
bool checkSQLName(std::string_view name)
Checks whether the given string is a valid name for MySQL tables and fields.
Definition: Strings.hpp:1079
std::vector< std::string > split(std::string_view str, std::string_view delimiter)
Splits a string into a vector of strings using the given delimiter.
Definition: Strings.hpp:759
constexpr auto checkHexLength
Length of a two-digit hexadecimal number including the preceding percentage sign. ...
Definition: Strings.hpp:87
bool checkDomainName(std::string_view name)
Checks whether the given string is a valid domain name.
Definition: Strings.hpp:1067
constexpr auto randCharSet
Characters to be chosen from for random string generation performed by generateRandom().
Definition: Strings.hpp:90
bool isHex(std::string_view inputString)
Checks whether a string contains only hexadecimal digits.
Definition: Strings.hpp:346
std::string generateRandom(std::size_t length)
Generates a random alpha-numerical string of the given length.
Definition: Strings.hpp:1092
std::vector< std::string > split(const std::string &str, char delimiter)
Splits a string into a vector of strings using the given delimiter.
Definition: Strings.hpp:739