crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
PickleDict.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * PickleDict.hpp
24  *
25  * Simple dictionary that supports extracting data
26  * from and writing data to Python pickles.
27  *
28  * Only pickles with protocol version 4 or higher
29  * are supported.
30  *
31  * NOTE: Does not actually run Python pickle op-codes,
32  * only extracts data from, or writes its data to a
33  * simple Python pickle.
34  *
35  * Created on: Feb 3, 2021
36  * Author: ans
37  */
38 
39 #ifndef HELPER_PICKLEDICT_HPP_
40 #define HELPER_PICKLEDICT_HPP_
41 
42 #include "../Helper/Bytes.hpp"
43 #include "../Helper/Container.hpp"
44 #include "../Main/Exception.hpp"
45 
46 #include <array> // std::array
47 #include <cstddef> // std::size_t
48 #include <cstdint> // std::int8_t, std::int64_t, std::uint[8|16|32|64]_t
49 #include <cstdlib> // std::strtod, std::strtoll
50 #include <iterator> // std::make_move_iterator
51 #include <limits> // std::numeric_limits
52 #include <optional> // std::optional
53 #include <string> // std::to_string
54 #include <unordered_map> // std::unordered_map
55 #include <vector> // std::vector
56 
57 #include <iostream>
58 
59 namespace crawlservpp::Data {
60 
61  // for convenience
// for convenience: a sequence of raw bytes
using Bytes = std::vector<std::uint8_t>;

/*
 * CONSTANTS
 */

// One byte.
inline constexpr auto pickleOneByte = 1;

// Two bytes.
inline constexpr auto pickleTwoBytes = 2;

// Four bytes.
inline constexpr auto pickleFourBytes = 4;

// Eight bytes.
inline constexpr auto pickleEightBytes = 8;

// Nine bytes (eight bytes and an op-code).
inline constexpr auto pickleNineBytes = 9;

// The minimum number of bytes accepted as a Python pickle.
inline constexpr auto pickleMinSize = 11;

// The protocol version written, and the minimum version accepted when reading.
inline constexpr auto pickleProtocolVersion = 4;

// The position of the PROTO op-code in a Python pickle.
inline constexpr auto pickleProtoByte = 0;

// The position of the version byte in a Python pickle.
inline constexpr auto pickleVersionByte = 1;

// The size of the Python pickle header, in bytes.
inline constexpr auto pickleHeadSize = 2;

// The minimum size of a Python pickle frame.
inline constexpr auto pickleMinFrameSize = 9;

// Maximum number in unsigned one-byte number.
inline constexpr std::uint8_t pickleMaxUOneByteNumber = std::numeric_limits<std::uint8_t>::max();

// Maximum number in unsigned two-byte number.
inline constexpr std::uint16_t pickleMaxUTwoByteNumber = std::numeric_limits<std::uint16_t>::max();

// Maximum number in unsigned four-byte number.
inline constexpr std::uint32_t pickleMaxUFourByteNumber = std::numeric_limits<std::uint32_t>::max();

// The base used for converting strings to numbers.
inline constexpr auto pickleBase = 10;
115 
117  /*
118  * CLASS DECLARATION
119  */
120 
122 
136  class PickleDict {
137  public:
140 
142  PickleDict() = default;
143 
144  explicit PickleDict(const Bytes& data);
145 
149 
150  [[nodiscard]] std::optional<std::int64_t> getNumber(const std::string& key) const;
151  [[nodiscard]] std::optional<double> getFloat(const std::string& key) const;
152  [[nodiscard]] std::optional<std::string> getString(const std::string& key) const;
153 
157 
158  void setNumber(
159  const std::string& key,
160  std::int64_t value
161  );
162  void setFloat(
163  const std::string& key,
164  double value
165  );
166  void setString(
167  const std::string& key,
168  const std::string& value
169  );
170 
174 
175  void readFrom(const Bytes& data);
176  void writeTo(Bytes& dataTo) const;
177 
179 
182 
183  private:
184  // dictionary
185  std::unordered_map<std::string, std::string> strings;
186  std::unordered_map<std::string, std::int64_t> numbers;
187  std::unordered_map<std::string, double> floats;
188 
189  /*
190  * INTERNAL DATA STRUCTURES
191  */
192 
193  // Python pickle op-codes
194  // Source: https://github.com/python/cpython/blob/master/Modules/_pickle.c
195  enum OpCode {
196  MARK = '(',
197  STOP = '.',
198  POP = '0',
199  POP_MARK = '1',
200  DUP = '2',
201  FLOAT = 'F',
202  INT = 'I',
203  BININT = 'J',
204  BININT1 = 'K',
205  LONG = 'L',
206  BININT2 = 'M',
207  NONE = 'N',
208  PERSID = 'P',
209  BINPERSID = 'Q',
210  REDUCE = 'R',
211  STRING = 'S',
212  BINSTRING = 'T',
213  SHORT_BINSTRING = 'U',
214  UNICODE = 'V',
215  BINUNICODE = 'X',
216  APPEND = 'a',
217  BUILD = 'b',
218  GLOBAL = 'c',
219  DICT = 'd',
220  EMPTY_DICT = '}',
221  APPENDS = 'e',
222  GET = 'g',
223  BINGET = 'h',
224  INST = 'i',
225  LONG_BINGET = 'j',
226  LIST = 'l',
227  EMPTY_LIST = ']',
228  OBJ = 'o',
229  PUT = 'p',
230  BINPUT = 'q',
231  LONG_BINPUT = 'r',
232  SETITEM = 's',
233  TUPLE = 't',
234  EMPTY_TUPLE = ')',
235  SETITEMS = 'u',
236  BINFLOAT = 'G',
237 
238  /* Protocol 2. */
239  PROTO = '\x80',
240  NEWOBJ = '\x81',
241  EXT1 = '\x82',
242  EXT2 = '\x83',
243  EXT4 = '\x84',
244  TUPLE1 = '\x85',
245  TUPLE2 = '\x86',
246  TUPLE3 = '\x87',
247  NEWTRUE = '\x88',
248  NEWFALSE = '\x89',
249  LONG1 = '\x8a',
250  LONG4 = '\x8b',
251 
252  /* Protocol 3 (Python 3.x) */
253  BINBYTES = 'B',
254  SHORT_BINBYTES = 'C',
255 
256  /* Protocol 4 */
257  SHORT_BINUNICODE = '\x8c',
258  BINUNICODE8 = '\x8d',
259  BINBYTES8 = '\x8e',
260  EMPTY_SET = '\x8f',
261  ADDITEMS = '\x90',
262  FROZENSET = '\x91',
263  NEWOBJ_EX = '\x92',
264  STACK_GLOBAL = '\x93',
265  MEMOIZE = '\x94',
266  FRAME = '\x95',
267 
268  /* Protocol 5 */
269  BYTEARRAY8 = '\x96',
270  NEXT_BUFFER = '\x97',
271  READONLY_BUFFER = '\x98',
272 
273  /* force signedness */
274  SIGNED = -1
275  };
276 
277  // Python pickle frame
278  struct Frame {
279  std::uint8_t opCode{};
280 
281  Bytes data;
282  };
283 
284  /*
285  * PRIVATE MEMBER FUNCTIONS
286  */
287 
288  // internal helper functions
289  void readValue(
290  const Bytes& data,
291  std::size_t& pos,
292  const std::string& key
293  );
294 
295  // internal static helper functions
296  static bool readKey(
297  const Bytes& data,
298  std::size_t& pos,
299  std::string& keyTo
300  );
301  static Bytes unpack(const Bytes& data);
302  static bool extractNextFrame(
303  const Bytes& bytes,
304  std::size_t& pos,
305  Frame& frameTo
306  );
307  static Bytes unpackFrame(const Frame& frame);
308 
309  static bool skipMemoize(
310  const Bytes& data,
311  std::size_t& pos
312  );
313 
314  static void checkLength(
315  std::size_t dataLength,
316  std::size_t currentEnd
317  );
318  static std::size_t readValueLength(
319  const Bytes& data,
320  std::size_t& pos,
321  std::size_t numBytes
322  );
323  static std::size_t getLengthByTermination(
324  const Bytes& data,
325  std::size_t pos,
326  char terminatingCharacter
327  );
328  static std::string getString(
329  const Bytes& data,
330  std::size_t& pos,
331  std::size_t length
332  );
333 
334  static void writeHead(Bytes& to);
335  static void writeFrame(const Bytes& frameBytes, Bytes& to, bool isLast);
336  static void writeDictHead(Bytes& to);
337  static void writeDictTail(Bytes& to);
338 
339  static void writeNumberEntry(
340  const std::pair<std::string, std::int64_t>& entry,
341  Bytes& to
342  );
343  static void writeFloatEntry(
344  const std::pair<std::string, double>& entry,
345  Bytes& to
346  );
347  static void writeStringEntry(
348  const std::pair<std::string, std::string>& entry,
349  Bytes& to
350  );
351 
352  static void writeKey(const std::string& key, Bytes& to);
353 
354  static void writeBinInt1(std::uint8_t value, Bytes& to);
355  static void writeBinInt2(std::uint16_t value, Bytes& to);
356  static void writeLong1(std::int64_t value, Bytes& to);
357  static void writeBinFloat(double value, Bytes& to);
358  static void writeShortBinUnicode(const std::string& value, Bytes& to);
359  static void writeBinUnicode(const std::string& value, Bytes& to);
360  static void writeBinUnicode8(const std::string& value, Bytes& to);
361 
362  // add bytes from any (iterable) container
363  template<typename T> static void writeBytes(const T& bytes, Bytes& to) {
364  to.reserve(to.size() + bytes.size());
365 
366  for(const auto byte : bytes) {
367  to.emplace_back(byte);
368  }
369  }
370 
371  // check whether number is in range
372  template<typename T> [[nodiscard]] static bool inRange(std::int64_t number) {
373  return number >= std::numeric_limits<T>::min()
374  && number <= std::numeric_limits<T>::max();
375  }
376  };
377 
378  /*
379  * IMPLEMENTATION
380  */
381 
382  /*
383  * CONSTRUCTION
384  */
385 
387 
390  inline PickleDict::PickleDict(const Bytes& data) {
391  this->readFrom(data);
392  }
393 
394  /*
395  * GETTERS
396  */
397 
399 
406  inline std::optional<std::int64_t> PickleDict::getNumber(const std::string& key) const {
407  std::optional<std::int64_t> result;
408 
409  const auto it{this->numbers.find(key)};
410 
411  if(it != this->numbers.end()) {
412  result = it->second;
413  }
414 
415  return result;
416  }
417 
419 
426  inline std::optional<double> PickleDict::getFloat(const std::string& key) const {
427  std::optional<double> result;
428 
429  const auto it{this->floats.find(key)};
430 
431  if(it != this->floats.end()) {
432  result = it->second;
433  }
434 
435  return result;
436  }
437 
439 
446  inline std::optional<std::string> PickleDict::getString(const std::string& key) const {
447  std::optional<std::string> result;
448 
449  const auto it{this->strings.find(key)};
450 
451  if(it != this->strings.end()) {
452  result = it->second;
453  }
454 
455  return result;
456  }
457 
458  /*
459  * SETTERS
460  */
461 
463 
467  inline void PickleDict::setNumber(const std::string& key, std::int64_t value) {
468  if(key.empty()) {
469  return;
470  }
471 
472  this->numbers[key] = value;
473  }
474 
476 
480  inline void PickleDict::setFloat(const std::string& key, double value) {
481  if(key.empty()) {
482  return;
483  }
484 
485  this->floats[key] = value;
486  }
487 
489 
493  inline void PickleDict::setString(const std::string& key, const std::string& value) {
494  if(key.empty()) {
495  return;
496  }
497 
498  this->strings[key] = value;
499  }
500 
501  /*
502  * READER
503  */
504 
506 
526  inline void PickleDict::readFrom(const Bytes& data) {
527  // unpack frames
528  const auto unpackedData{
529  PickleDict::unpack(data)
530  };
531 
532  std::size_t pos{};
533 
534  while(pos < unpackedData.size()) {
535  // extract keys and values, skip everything else
536  std::string key;
537 
538  if(
539  PickleDict::readKey(unpackedData, pos, key)
540  && PickleDict::skipMemoize(unpackedData, pos)
541  ) {
542  this->readValue(unpackedData, pos, key);
543  }
544  else {
545  /*
546  * skip other values so that they are
547  * not mistaken for op-codes
548  *
549  */
550  this->readValue(unpackedData, pos, "");
551  }
552  }
553  }
554 
556 
565  inline void PickleDict::writeTo(Bytes& dataTo) const {
566  dataTo.clear();
567 
568  // write frame
569  std::vector<std::uint8_t> frame;
570 
571  PickleDict::writeDictHead(frame);
572 
573  for(const auto& entry : this->numbers) {
574  PickleDict::writeNumberEntry(entry, frame);
575  }
576 
577  for(const auto& entry : this->floats) {
578  PickleDict::writeFloatEntry(entry, frame);
579  }
580 
581  for(const auto& entry : this->strings) {
582  PickleDict::writeStringEntry(entry, frame);
583  }
584 
585  PickleDict::writeDictTail(frame);
586 
587  // write whole Python pickle
588  PickleDict::writeHead(dataTo);
589  PickleDict::writeFrame(frame, dataTo, true);
590  }
591 
592  /*
593  * INTERNAL HELPER FUNCTIONS (private)
594  */
595 
596  // read a value from the current position in the data, or none at all
597  inline void PickleDict::readValue(
598  const Bytes& data,
599  std::size_t& pos,
600  const std::string& key
601  ) {
602  std::size_t valueLength{};
603  std::string s1;
604  std::string s2;
605 
606  // check for end of data
607  if(pos >= data.size()) {
608  throw Exception(
609  "SimpleDict::readValue():"
610  " Unexpected end of data"
611  " (invalid position)"
612  );
613  }
614  if(pos == data.size() - 1) {
615  if(data[pos] == static_cast<std::uint8_t>(OpCode::STOP)) {
616  /* reached valid end of pickle */
617  ++pos;
618 
619  return;
620  }
621 
622  throw Exception(
623  "SimpleDict::readValue():"
624  " Unexpected end of data"
625  " (no STOP at the end)"
626  );
627  }
628 
629  // seek ahead
630  ++pos;
631 
632  switch(static_cast<int8_t>(data[pos - 1])) {
633  /*
634  * SKIP
635  */
636  case OpCode::ADDITEMS:
637  case OpCode::APPEND:
638  case OpCode::APPENDS:
639  case OpCode::BINPERSID:
640  case OpCode::BUILD:
641  case OpCode::DICT:
642  case OpCode::DUP:
643  case OpCode::EMPTY_DICT:
644  case OpCode::EMPTY_LIST:
645  case OpCode::EMPTY_SET:
646  case OpCode::EMPTY_TUPLE:
647  case OpCode::FROZENSET:
648  case OpCode::LIST:
649  case OpCode::MARK:
650  case OpCode::MEMOIZE:
651  case OpCode::NEWOBJ:
652  case OpCode::NEWOBJ_EX:
653  case OpCode::NEXT_BUFFER:
654  case OpCode::OBJ:
655  case OpCode::POP:
656  case OpCode::POP_MARK:
657  case OpCode::READONLY_BUFFER:
658  case OpCode::REDUCE:
659  case OpCode::SETITEM:
660  case OpCode::SETITEMS:
661  case OpCode::STACK_GLOBAL:
662  case OpCode::TUPLE:
663  case OpCode::TUPLE1:
664  case OpCode::TUPLE2:
665  case OpCode::TUPLE3:
666  // skip without argument
667  break;
668 
669  case OpCode::EXT1:
670  // skip one-byte argument
671  pos += pickleOneByte;
672 
673  break;
674 
675  case OpCode::EXT2:
676  // skip two-byte argument
677  pos += pickleTwoBytes;
678 
679  break;
680 
681  case OpCode::EXT4:
682  // skip four-byte argument
683  pos += pickleFourBytes;
684 
685  break;
686 
687  /*
688  * GET NUMBER
689  */
690  case OpCode::NEWFALSE:
691  case OpCode::NONE:
692  // add zero
693  this->setNumber(key, 0);
694 
695  ++pos;
696 
697  break;
698 
699  case OpCode::NEWTRUE:
700  // add one
701  this->setNumber(key, 1);
702 
703  ++pos;
704 
705  break;
706 
707  case OpCode::BINGET:
708  case OpCode::BININT1:
709  case OpCode::BINPUT:
710  // get one-byte unsigned integer
711  this->setNumber(key, static_cast<std::uint8_t>(data[pos]));
712 
713  ++pos;
714 
715  break;
716 
717  case OpCode::BININT2:
718  // get two-byte unsigned integer
719  PickleDict::checkLength(data.size(), pos + pickleTwoBytes);
720 
721  this->setNumber(key, Helper::Bytes::bytesToUInt16(data, pos));
722 
723  break;
724 
725  case OpCode::LONG_BINGET:
726  case OpCode::LONG_BINPUT:
727  // get four-byte unsigned integer
728  PickleDict::checkLength(data.size(), pos + pickleFourBytes);
729 
730  this->setNumber(key, Helper::Bytes::bytesToUInt32(data, pos));
731 
732  break;
733 
734  case OpCode::LONG1:
735  // read one-byte length and corresponding integer
736  valueLength = PickleDict::readValueLength(data, pos, pickleOneByte);
737 
738  switch(valueLength) {
739  case pickleOneByte:
740  this->setNumber(key, static_cast<std::int8_t>(data[pos]));
741 
742  ++pos;
743 
744  break;
745 
746  case pickleTwoBytes:
747  this->setNumber(key, Helper::Bytes::bytesToInt16(data, pos));
748 
749  break;
750 
751  case pickleFourBytes:
752  this->setNumber(key, Helper::Bytes::bytesToInt32(data, pos));
753 
754  break;
755 
756  case pickleEightBytes:
757  this->setNumber(key, Helper::Bytes::bytesToInt64(data, pos));
758 
759  break;
760 
761  default:
762  if(valueLength > pickleEightBytes) {
763  throw Exception(
764  "Pickle::readValue(): Value lengths consisting of "
765  + std::to_string(valueLength)
766  + " bytes are not supported"
767  );
768  }
769 
770  this->setNumber(key, Helper::Bytes::bytesToInt64(data, pos, valueLength));
771  }
772 
773  break;
774 
775  case OpCode::BININT:
776  case OpCode::LONG4:
777  // get four-byte signed integer
778  PickleDict::checkLength(data.size(), pos + pickleFourBytes);
779 
780  this->setNumber(key, Helper::Bytes::bytesToInt32(data, pos));
781 
782  break;
783 
784  case OpCode::INT:
785  case OpCode::LONG:
786  // get number from newline-terminated string
787  valueLength = PickleDict::getLengthByTermination(data, pos, '\n');
788 
789  s1 = PickleDict::getString(data, pos, valueLength);
790 
791  ++pos; // jump past newline
792 
793  this->setNumber(key, std::strtoll(s1.c_str(), nullptr, pickleBase));
794 
795  break;
796 
797  /*
798  * GET FLOATING-POINT NUMBER
799  */
800  case OpCode::BINFLOAT:
801  // get eight-byte floating-point number
802  PickleDict::checkLength(data.size(), pos + pickleEightBytes);
803 
804  this->setFloat(key, Helper::Bytes::bytesToDouble(data, pos));
805 
806  break;
807 
808  case OpCode::FLOAT:
809  // get floating-point number from from newline-terminated string
810  valueLength = PickleDict::getLengthByTermination(data, pos, '\n');
811 
812  s1 = PickleDict::getString(data, pos, valueLength);
813 
814  ++pos; // jump past newline
815 
816  this->setFloat(key, std::strtod(s1.c_str(), nullptr));
817 
818  break;
819 
820  /*
821  * GET STRING (OR BYTES)
822  */
823  case OpCode::SHORT_BINBYTES:
824  case OpCode::SHORT_BINSTRING:
825  case OpCode::SHORT_BINUNICODE:
826  // read one-byte length and corresponding string
827  valueLength = PickleDict::readValueLength(data, pos, pickleOneByte);
828 
829  this->setString(key, PickleDict::getString(data, pos, valueLength));
830 
831  break;
832 
833  case OpCode::BINBYTES:
834  case OpCode::BINSTRING:
835  case OpCode::BINUNICODE:
836  // read four-bytes length and corresponding string
837  valueLength = PickleDict::readValueLength(data, pos, pickleFourBytes);
838 
839  this->setString(key, PickleDict::getString(data, pos, valueLength));
840 
841  break;
842 
843  case OpCode::BINBYTES8:
844  case OpCode::BINUNICODE8:
845  case OpCode::BYTEARRAY8:
846  // read eight-bytes length and corresponding string
847  valueLength = PickleDict::readValueLength(data, pos, pickleEightBytes);
848 
849  this->setString(key, PickleDict::getString(data, pos, valueLength));
850 
851  break;
852 
853  case OpCode::GET:
854  case OpCode::PERSID:
855  case OpCode::PUT:
856  case OpCode::STRING:
857  case OpCode::UNICODE:
858  // read string terminated by newline
859  valueLength = PickleDict::getLengthByTermination(data, pos, '\n');
860 
861  this->setString(key, PickleDict::getString(data, pos, valueLength));
862 
863  ++pos; // jump past newline
864 
865  break;
866 
867  case OpCode::GLOBAL:
868  case OpCode::INST:
869  // read two strings terminated by newlines
870  valueLength = PickleDict::getLengthByTermination(data, pos, '\n');
871 
872  s1 = PickleDict::getString(data, pos, valueLength);
873 
874  ++pos; // jump past first newline
875 
876  valueLength = PickleDict::getLengthByTermination(data, pos, '\n');
877 
878  s2 = PickleDict::getString(data, pos, valueLength);
879 
880  ++pos; // jump past second newline
881 
882  // combine strings
883  this->setString(key, s1 + "." + s2);
884 
885  break;
886 
887  /*
888  * ERRORS
889  */
890  case OpCode::FRAME:
891  throw Exception(
892  "SimpleDict::ReadValue():"
893  " Unexpected frame still found after unpacking"
894  );
895 
896  case OpCode::STOP:
897  throw Exception(
898  "SimpleDict::ReadValue():"
899  " Unexpected 'STOP' before the end of the data"
900  );
901 
902  case OpCode::PROTO:
903  throw Exception(
904  "SimpleDict::ReadValue():"
905  " Unexpected 'PROTO' after the beginning of the data"
906  );
907 
908  default:
909  throw Exception(
910  "SimpleDict::ReadValue():"
911  " Unknown Python pickle op-code encountered"
912  );
913  }
914  }
915 
916  /*
917  * INTERNAL STATIC HELPER FUNCTIONS (private)
918  */
919 
920  // read a key at the current position in the data, return whether a key was read
921  inline bool PickleDict::readKey(
922  const Bytes& data,
923  std::size_t& pos,
924  std::string& keyTo
925  ) {
926  // check current op-code for pushing a short string
927  if(
928  data[pos] == static_cast<std::uint8_t>(OpCode::SHORT_BINSTRING)
929  || data[pos] == static_cast<std::uint8_t>(OpCode::SHORT_BINUNICODE)
930  ) {
931  // (= 1 byte)
932  ++pos;
933 
934  // read key length
935  const auto keyLength{data[pos]};
936 
937  // (= 1 byte)
938  ++pos;
939 
940  // check key length
941  const auto keyEnd{pos + keyLength};
942 
943  if(keyEnd > data.size()) {
944  throw Exception(
945  "SimpleDict::readKey():"
946  " Unexpected end of data (expected >"
947  + std::to_string(keyEnd - data.size())
948  + " bytes more)"
949  );
950  }
951 
952  // clear target and reserve memory
953  keyTo.clear();
954  keyTo.reserve(keyLength);
955 
956  // read key
957  for(; pos < keyEnd; ++pos) {
958  keyTo.push_back(static_cast<char>(data[pos]));
959  }
960 
961  return true;
962  }
963 
964  return false;
965  }
966 
967  // unpack all frames from a Python pickle with protocol version 4
968  inline Bytes PickleDict::unpack(const Bytes& data) {
969  Bytes unpacked;
970  std::size_t pos{};
971  Frame frame;
972 
973  while(PickleDict::extractNextFrame(data, pos, frame)) {
974  auto frameData{PickleDict::unpackFrame(frame)};
975 
976  Helper::Container::moveInto(unpacked, frameData);
977  }
978 
979  return unpacked;
980  }
981 
982  // extract the next frame from a Python pickle with protocol version 4
983  inline bool PickleDict::extractNextFrame(
984  const Bytes& bytes,
985  std::size_t& pos,
986  Frame& frameTo
987  ) {
988  if(pos == 0) {
989  // check format and version of the Python pickle
990  if(bytes.size() < pickleMinSize) {
991  throw Exception(
992  "Pickle::extractFirstFrame():"
993  " No Python pickle found (only "
994  + std::to_string(bytes.size())
995  + " bytes)"
996  );
997  }
998 
999  if(bytes[pickleProtoByte] != static_cast<std::uint8_t>(OpCode::PROTO)) {
1000  throw Exception(
1001  "Pickle::extractFirstFrame():"
1002  " No Python pickle found (invalid first byte: "
1004  + " != "
1005  + Helper::Bytes::byteToHexString(OpCode::PROTO)
1006  + ")"
1007  );
1008  }
1009 
1011  throw Exception(
1012  "Pickle::extractFirstFrame():"
1013  " Python pickle of unsupported version ("
1014  + std::to_string(bytes[pickleVersionByte])
1015  + " < "
1016  + std::to_string(pickleProtocolVersion)
1017  + ")"
1018  );
1019  }
1020 
1021  pos += pickleHeadSize;
1022  }
1023 
1024  if(pos == bytes.size()) {
1025  return false;
1026  }
1027 
1028  // check number of remaining bytes
1029  const auto remaining{bytes.size() - pos};
1030 
1031  if(remaining < pickleMinFrameSize) {
1032  throw Exception(
1033  "Pickle::extractFirstFrame():"
1034  " No frame found in Python pickle (only "
1035  + std::to_string(remaining)
1036  + " bytes left)"
1037  );
1038  }
1039 
1040  // get opcode and size
1041  frameTo.opCode = bytes[pos];
1042 
1043  ++pos;
1044 
1045  auto size{Helper::Bytes::bytesToUInt64(bytes, pos)};
1046  const auto it{bytes.cbegin() + pos};
1047 
1048  pos += size;
1049 
1050  frameTo.data = Bytes(it, it + size);
1051 
1052  return true;
1053  }
1054 
1055  // unpack a frame
1056  inline Bytes PickleDict::unpackFrame(const Frame& frame) {
1057  if(frame.opCode == static_cast<std::uint8_t>(OpCode::FRAME)) {
1058  return frame.data;
1059  }
1060 
1061  Bytes complete;
1062 
1063  complete.reserve(frame.data.size() + 1);
1064  complete.push_back(frame.opCode);
1065 
1066  Helper::Container::append(complete, frame.data);
1067 
1068  return complete;
1069  }
1070 
1071  // optionally skip MEMOIZE command and return whether
1072  // such a command was found at the given position
1073  inline bool PickleDict::skipMemoize(
1074  const Bytes& data,
1075  std::size_t& pos
1076  ) {
1077  if(
1078  pos < data.size()
1079  && data[pos] == static_cast<std::uint8_t>(OpCode::MEMOIZE)
1080  ) {
1081  ++pos;
1082 
1083  return true;
1084  }
1085 
1086  return false;
1087  }
1088 
1089  // check data length
1090  inline void PickleDict::checkLength(
1091  std::size_t dataLength,
1092  std::size_t currentEnd
1093  ) {
1094  if(currentEnd > dataLength) {
1095  throw Exception(
1096  "Pickle::readValue(): Unexpected end of data (expected >"
1097  + std::to_string(currentEnd - dataLength)
1098  + " bytes more)"
1099  );
1100  }
1101  }
1102 
1103  // read length of succeeding value
1104  inline std::size_t PickleDict::readValueLength(
1105  const Bytes& data,
1106  std::size_t& pos,
1107  std::size_t numBytes
1108  ) {
1109  PickleDict::checkLength(data.size(), pos + numBytes);
1110 
1111  std::size_t result{};
1112 
1113  switch(numBytes) {
1114  case pickleOneByte:
1115  result = data[pos];
1116 
1117  break;
1118 
1119  case pickleTwoBytes:
1120  result = Helper::Bytes::bytesToUInt16(data, pos);
1121 
1122  break;
1123 
1124  case pickleFourBytes:
1125  result = Helper::Bytes::bytesToUInt32(data, pos);
1126 
1127  break;
1128 
1129  case pickleEightBytes:
1130  result = Helper::Bytes::bytesToUInt64(data, pos);
1131 
1132  break;
1133 
1134  default:
1135  if(numBytes > pickleEightBytes) {
1136  throw Exception(
1137  "Pickle::readValue(): Value lengths consisting of "
1138  + std::to_string(numBytes)
1139  + " bytes are not supported"
1140  );
1141  }
1142 
1143  result = Helper::Bytes::bytesToUInt64(data, pos, numBytes);
1144 
1145  break;
1146  }
1147 
1148  pos += numBytes;
1149 
1150  PickleDict::checkLength(data.size(), pos + result);
1151 
1152  return result;
1153  }
1154 
1155  // determine the length of a string by its terminating character (does NOT change the current position)
1156  inline std::size_t PickleDict::getLengthByTermination(
1157  const Bytes& data,
1158  std::size_t pos,
1159  char terminatingCharacter
1160  ) {
1161  for(std::size_t end{pos}; end < data.size(); ++end) {
1162  if(data[end] == static_cast<std::uint8_t>(terminatingCharacter)) {
1163  return end - pos;
1164  }
1165  }
1166 
1167  // no terminating character found
1168  throw Exception(
1169  "SimpleDict::getLengthByTermination():"
1170  " Could not find terminating character '"
1171  + Helper::Bytes::charToString(terminatingCharacter)
1172  + "' after position #"
1173  + std::to_string(pos)
1174  );
1175  }
1176 
1177  // extract a string from the data
1178  inline std::string PickleDict::getString(
1179  const Bytes& data,
1180  std::size_t& pos,
1181  std::size_t length
1182  ) {
1183  std::string result;
1184  const auto end{pos + length};
1185 
1186  result.reserve(length);
1187 
1188  for(; pos < end; ++pos) {
1189  result.push_back(static_cast<char>(data[pos]));
1190  }
1191 
1192  return result;
1193  }
1194 
1195  // write Python pickle data head
1196  inline void PickleDict::writeHead(Bytes& to) {
1197  to.push_back(static_cast<std::uint8_t>(OpCode::PROTO));
1198  to.push_back(pickleProtocolVersion);
1199  }
1200 
1201  // write Python pickle frame
1202  inline void PickleDict::writeFrame(const Bytes& frameBytes, Bytes& to, bool isLast) {
1203  // calculate frame size
1204  std::uint64_t frameSize{frameBytes.size()};
1205 
1206  if(isLast) {
1207  ++frameSize;
1208  }
1209 
1210  // reserve memory
1211  to.reserve(to.size() + frameSize + pickleNineBytes);
1212 
1213  // write frame head (including its size)
1214  to.push_back(static_cast<std::uint8_t>(OpCode::FRAME));
1215 
1216  PickleDict::writeBytes(Helper::Bytes::uInt64ToBytes(frameSize), to);
1217 
1218  // write frame data
1219  Helper::Container::append(to, frameBytes);
1220 
1221  // finish frame
1222  if(isLast) {
1223  to.push_back(static_cast<std::uint8_t>(OpCode::STOP));
1224  }
1225  }
1226 
1227  // write dictionary head
1228  inline void PickleDict::writeDictHead(Bytes& to) {
1229  to.push_back(static_cast<std::uint8_t>(OpCode::EMPTY_DICT));
1230  to.push_back(static_cast<std::uint8_t>(OpCode::MEMOIZE));
1231  to.push_back(static_cast<std::uint8_t>(OpCode::MARK));
1232  }
1233 
1234  // write dictionary tail
1235  inline void PickleDict::writeDictTail(Bytes& to) {
1236  to.push_back(static_cast<std::uint8_t>(OpCode::MEMOIZE));
1237  to.push_back(static_cast<std::uint8_t>(OpCode::SETITEMS));
1238  }
1239 
1240  // write dictionary entry containing a number
1241  inline void PickleDict::writeNumberEntry(
1242  const std::pair<std::string, std::int64_t>& entry,
1243  Bytes& to
1244  ) {
1245  PickleDict::writeKey(entry.first, to);
1246 
1247  if(entry.second >= 0) {
1248  if(entry.second <= pickleMaxUOneByteNumber) {
1249  PickleDict::writeBinInt1(static_cast<std::uint8_t>(entry.second), to);
1250 
1251  return;
1252  }
1253 
1254  if(entry.second <= pickleMaxUTwoByteNumber) {
1255  PickleDict::writeBinInt2(static_cast<std::uint16_t>(entry.second), to);
1256 
1257  return;
1258  }
1259  }
1260 
1261  PickleDict::writeLong1(entry.second, to);
1262  }
1263 
1264  // write dictionary entry containing a floating-point number
1265  inline void PickleDict::writeFloatEntry(
1266  const std::pair<std::string, double>& entry,
1267  Bytes& to
1268  ) {
1269  PickleDict::writeKey(entry.first, to);
1270  PickleDict::writeBinFloat(entry.second, to);
1271  }
1272 
1273  // write dictionary entry containing a string
1274  inline void PickleDict::writeStringEntry(
1275  const std::pair<std::string, std::string>& entry,
1276  Bytes& to
1277  ) {
1278  PickleDict::writeKey(entry.first, to);
1279 
1280  if(entry.second.size() <= pickleMaxUOneByteNumber) {
1281  PickleDict::writeShortBinUnicode(entry.second, to);
1282  }
1283  else if(entry.second.size() <= pickleMaxUFourByteNumber) {
1284  PickleDict::writeBinUnicode(entry.second, to);
1285  }
1286  else {
1287  PickleDict::writeBinUnicode8(entry.second, to);
1288  }
1289  }
1290 
1291  // write dictionary key
1292  inline void PickleDict::writeKey(const std::string& key, Bytes& to) {
1293  PickleDict::writeShortBinUnicode(key, to);
1294 
1295  to.push_back(static_cast<std::uint8_t>(OpCode::MEMOIZE));
1296  }
1297 
1298  // write one-byte unsigned number
1299  inline void PickleDict::writeBinInt1(std::uint8_t value, Bytes& to) {
1300  to.push_back(static_cast<std::uint8_t>(OpCode::BININT1));
1301  to.push_back(value);
1302  }
1303 
1304  // write two-bytes unsigned number
1305  inline void PickleDict::writeBinInt2(std::uint16_t value, Bytes& to) {
1306  to.push_back(static_cast<std::uint8_t>(OpCode::BININT2));
1307 
1308  PickleDict::writeBytes(Helper::Bytes::uInt16ToBytes(value), to);
1309  }
1310 
1311  // write number of bytes and signed number
1312  inline void PickleDict::writeLong1(std::int64_t value, Bytes& to) {
1313  to.push_back(static_cast<std::uint8_t>(OpCode::LONG1));
1314 
1315  if(PickleDict::inRange<std::int8_t>(value)) {
1316  to.push_back(pickleOneByte);
1317  to.push_back(static_cast<std::int8_t>(value));
1318  }
1319  else if(PickleDict::inRange<std::int16_t>(value)) {
1320  to.push_back(pickleTwoBytes);
1321 
1322  PickleDict::writeBytes(
1324  static_cast<std::int16_t>(value)
1325  ),
1326  to
1327  );
1328  }
1329  else if(PickleDict::inRange<std::int32_t>(value)) {
1330  to.push_back(pickleFourBytes);
1331 
1332  PickleDict::writeBytes(
1334  static_cast<std::int32_t>(value)
1335  ),
1336  to
1337  );
1338  }
1339  else {
1340  to.push_back(pickleEightBytes);
1341 
1342  PickleDict::writeBytes(Helper::Bytes::int64ToBytes(value), to);
1343  }
1344  }
1345 
1346  // write floating-point number of eight bytes, i.e. with double precision
1347  inline void PickleDict::writeBinFloat(double value, Bytes& to) {
1348  to.push_back(static_cast<std::uint8_t>(OpCode::BINFLOAT));
1349 
1350  PickleDict::writeBytes(Helper::Bytes::doubleToBytes(value), to);
1351  }
1352 
1353  // write one-byte length and string
1354  inline void PickleDict::writeShortBinUnicode(const std::string& value, Bytes& to) {
1355  to.push_back(static_cast<std::uint8_t>(OpCode::SHORT_BINUNICODE));
1356 
1357  // use max. 255 bytes
1358  std::uint8_t length{};
1359 
1360  if(value.size() > pickleMaxUOneByteNumber) {
1361  length = pickleMaxUOneByteNumber;
1362  }
1363  else {
1364  length = static_cast<std::uint8_t>(value.size());
1365  }
1366 
1367  // reserve memory
1368  to.reserve(length + pickleOneByte);
1369 
1370  // write length
1371  to.push_back(length);
1372 
1373  // write string
1374  for(std::size_t index{}; index < length; ++index) {
1375  to.push_back(static_cast<std::uint8_t>(value[index]));
1376  }
1377  }
1378 
1379  // write four-byte length and string
1380  inline void PickleDict::writeBinUnicode(const std::string& value, Bytes& to) {
1381  to.push_back(static_cast<std::uint8_t>(OpCode::BINUNICODE));
1382 
1383  // use max. 4,294,967,295 bytes
1384  std::uint32_t length{};
1385 
1386  if(value.size() > pickleMaxUOneByteNumber) {
1387  length = pickleMaxUOneByteNumber;
1388  }
1389  else {
1390  length = static_cast<std::uint32_t>(value.size());
1391  }
1392 
1393  // reserve memory
1394  to.reserve(length + pickleFourBytes);
1395 
1396  // write length
1397  PickleDict::writeBytes(Helper::Bytes::uInt32ToBytes(length), to);
1398 
1399  // write string
1400  for(std::size_t index{}; index < length; ++index) {
1401  to.push_back(static_cast<std::uint8_t>(value[index]));
1402  }
1403  }
1404 
1405  // write eight-byte length and string
1406  inline void PickleDict::writeBinUnicode8(const std::string& value, Bytes& to) {
1407  to.push_back(static_cast<std::uint8_t>(OpCode::BINUNICODE8));
1408 
1409  // reserve memory
1410  to.reserve(value.size() + pickleEightBytes);
1411 
1412  // write length
1413  PickleDict::writeBytes(Helper::Bytes::uInt64ToBytes(value.size()), to);
1414 
1415  // write string
1416  for(std::size_t index{}; index < value.size(); ++index) {
1417  to.push_back(static_cast<std::uint8_t>(value[index]));
1418  }
1419  }
1420 
1421 } /* namespace crawlservpp::Data */
1422 
1423 #endif /* HELPER_PICKLEDICT_HPP_ */
constexpr auto pickleMinFrameSize
The minimum size of a Python pickle frame.
Definition: PickleDict.hpp:102
std::array< std::uint8_t, sizeFour > uInt32ToBytes(std::uint32_t number)
Converts an unsigned 32-bit number to an array of four bytes.
Definition: Bytes.hpp:470
std::vector< std::uint8_t > Bytes
Definition: PickleDict.hpp:62
constexpr auto pickleNineBytes
Nine bytes (eight bytes and an op-code).
Definition: PickleDict.hpp:84
PickleDict()=default
Default constructor.
std::int32_t bytesToInt32(const Bytes &bytes, std::size_t &pos)
Retrieve a signed 32-bit number from a vector of bytes.
Definition: Bytes.hpp:345
Class for Python pickle exceptions.
Definition: PickleDict.hpp:181
std::array< std::uint8_t, sizeFour > int32ToBytes(std::int32_t number)
Converts a signed 32-bit number to an array of four bytes.
Definition: Bytes.hpp:489
std::optional< double > getFloat(const std::string &key) const
Gets a floating-point number from the dictionary, if available.
Definition: PickleDict.hpp:426
static void moveInto(T &to, T &from)
Moves the elements of an iterable container into another iterable container.
Definition: Container.hpp:99
constexpr auto pickleOneByte
One byte.
Definition: PickleDict.hpp:72
double bytesToDouble(const Bytes &bytes, std::size_t &pos)
Retrieves an IEEE 754 double-precision binary floating-point number from a vector of bytes...
Definition: Bytes.hpp:406
Simple Python pickle dictionary.
Definition: PickleDict.hpp:136
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
std::uint16_t bytesToUInt16(const Bytes &bytes, std::size_t &pos)
Retrieve an unsigned 16-bit number from a vector of bytes.
Definition: Bytes.hpp:362
constexpr std::uint32_t pickleMaxUFourByteNumber
Maximum number in unsigned four-byte number.
Definition: PickleDict.hpp:111
void setNumber(const std::string &key, std::int64_t value)
Adds or overwrites a number in the dictionary.
Definition: PickleDict.hpp:467
constexpr auto pickleEightBytes
Eight bytes.
Definition: PickleDict.hpp:81
void setFloat(const std::string &key, double value)
Adds or overwrites a floating-point number in the dictionary.
Definition: PickleDict.hpp:480
std::string byteToHexString(std::uint8_t byte)
Converts a byte to a string containing the byte in hexadecimal format.
Definition: Bytes.hpp:572
std::array< std::uint8_t, sizeEight > int64ToBytes(std::int64_t number)
Converts a signed 64-bit number to an array of eight bytes.
Definition: Bytes.hpp:451
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
constexpr auto pickleHeadSize
The size of the Python pickle header, in bytes.
Definition: PickleDict.hpp:99
std::string charToString(char c)
Converts a character to a string.
Definition: Bytes.hpp:609
void writeTo(Bytes &dataTo) const
Writes dictionary to Python pickle data.
Definition: PickleDict.hpp:565
constexpr auto pickleVersionByte
The position of the version byte in a Python pickle.
Definition: PickleDict.hpp:96
std::optional< std::int64_t > getNumber(const std::string &key) const
Gets a number from the dictionary, if available.
Definition: PickleDict.hpp:406
constexpr auto pickleBase
The base used for converting strings to numbers.
Definition: PickleDict.hpp:114
constexpr auto pickleProtocolVersion
The protocol version of Python pickles used.
Definition: PickleDict.hpp:90
std::int64_t bytesToInt64(const Bytes &bytes, std::size_t &pos)
Retrieve a signed 64-bit number from a vector of bytes.
Definition: Bytes.hpp:300
static void append(T &to, const T &from, typename T::size_type startAt, typename T::size_type endAt)
Appends (part of) an iterable container to another container.
Definition: Container.hpp:51
constexpr auto pickleMinSize
The minimum size of a Python pickle to extract a frame.
Definition: PickleDict.hpp:87
std::optional< std::string > getString(const std::string &key) const
Gets a string from the dictionary, if available.
Definition: PickleDict.hpp:446
std::uint64_t bytesToUInt64(const Bytes &bytes, std::size_t &pos)
Retrieve an unsigned 64-bit number from a vector of bytes.
Definition: Bytes.hpp:203
std::array< std::uint8_t, sizeTwo > uInt16ToBytes(std::uint16_t number)
Converts an unsigned 16-bit number to an array of two bytes.
Definition: Bytes.hpp:508
std::array< std::uint8_t, sizeEight > doubleToBytes(double number)
Converts a floating-point number with double precision to an array of eight bytes. ...
Definition: Bytes.hpp:547
constexpr auto pickleFourBytes
Four bytes.
Definition: PickleDict.hpp:78
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
std::int16_t bytesToInt16(const Bytes &bytes, std::size_t &pos)
Retrieve a signed 16-bit number from a vector of bytes.
Definition: Bytes.hpp:388
constexpr std::uint8_t pickleMaxUOneByteNumber
Maximum number in unsigned one-byte number.
Definition: PickleDict.hpp:105
constexpr auto pickleTwoBytes
Two bytes.
Definition: PickleDict.hpp:75
std::array< std::uint8_t, sizeEight > uInt64ToBytes(std::uint64_t number)
Converts an unsigned 64-bit number to an array of eight bytes.
Definition: Bytes.hpp:432
void readFrom(const Bytes &data)
Creates a simple dictionary from Python pickle data.
Definition: PickleDict.hpp:526
std::array< std::uint8_t, sizeTwo > int16ToBytes(std::int16_t number)
Converts a signed 16-bit number to an array of two bytes.
Definition: Bytes.hpp:527
constexpr std::uint16_t pickleMaxUTwoByteNumber
Maximum number in unsigned two-byte number.
Definition: PickleDict.hpp:108
Namespace for different types of data.
constexpr auto pickleProtoByte
The position of the protocol byte in a Python pickle.
Definition: PickleDict.hpp:93
std::uint32_t bytesToUInt32(const Bytes &bytes, std::size_t &pos)
Retrieve an unsigned 32-bit number from a vector of bytes.
Definition: Bytes.hpp:317
void setString(const std::string &key, const std::string &value)
Adds or overwrites a string in the dictionary.
Definition: PickleDict.hpp:493