crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Corpus.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Corpus.hpp
24  *
25  * Class representing a text corpus with optional article and date maps
26  * that can be sliced into smaller chunks to fit into the database.
27  *
28  * NOTE: All input data needs to be sorted by date.
29  * Text without dates need to be added first.
30  *
31  * Created on: Mar 4, 2020
32  * Author: ans
33  */
34 
35 #ifndef DATA_CORPUS_HPP_
36 #define DATA_CORPUS_HPP_
37 
38 #include "Lemmatizer.hpp"
39 #include "Stemmer/English.hpp"
40 #include "Stemmer/German.hpp"
41 #include "Tagger.hpp"
42 #include "TokenCorrect.hpp"
43 #include "TokenRemover.hpp"
44 
45 #include "../Helper/CommaLocale.hpp"
46 #include "../Helper/Container.hpp"
47 #include "../Helper/DateTime.hpp"
48 #include "../Helper/Memory.hpp"
49 #include "../Helper/Utf8.hpp"
50 #include "../Main/Exception.hpp"
51 #include "../Struct/StatusSetter.hpp"
52 #include "../Struct/TextMap.hpp"
53 
54 #include <algorithm> // std::all_of, std::any_of, std::count_if, std::find_if, std::remove_if
55 #include <cctype> // std::toupper
56 #include <cstddef> // std::size_t
57 #include <cstdint> // std::int64_t, std::uint8_t, std::uint16_t
58 #include <functional> // std::function, std::reference_wrapper
59 #include <iterator> // std::distance
60 #include <map> // std::map
61 #include <memory> // std::make_unique, std::unique_ptr
62 #include <numeric> // std::accumulate
63 #include <optional> // std::optional, std::nullopt
64 #include <ostream> // std::ostream
65 #include <sstream> // std::ostringstream
66 #include <string> // std::string, std::to_string
67 #include <string_view> // std::string_view
68 #include <utility> // std::pair
69 #include <vector> // std::vector
70 
72 namespace crawlservpp::Data {
73 
74  /*
75  * CONSTANTS
76  */
77 
80 
82  inline constexpr auto dateLength{10};
83 
85  inline constexpr std::uint8_t utf8MaxBytes{4};
86 
88  inline constexpr auto mergeUpdateEvery{10000};
89 
91  inline constexpr auto tokenizeUpdateEvery{10000};
92 
94  inline constexpr auto filterUpdateEvery{10000};
95 
97  inline constexpr auto minSingleUtf8CharSize{2};
98 
100  inline constexpr auto maxSingleUtf8CharSize{4};
101 
105 
107  inline constexpr std::uint16_t corpusManipNone{0};
108 
110  inline constexpr std::uint16_t corpusManipTagger{1};
111 
113  inline constexpr std::uint16_t corpusManipTaggerPosterior{2};
114 
116  inline constexpr std::uint16_t corpusManipEnglishStemmer{3};
117 
119  inline constexpr std::uint16_t corpusManipGermanStemmer{4};
120 
122  inline constexpr std::uint16_t corpusManipLemmatizer{5};
123 
125  inline constexpr std::uint16_t corpusManipRemove{6};
126 
128  inline constexpr std::uint16_t corpusManipTrim{7};
129 
131  inline constexpr std::uint16_t corpusManipCorrect{8};
132 
133  //TODO(ans): add corpusManipReplace (by dictionary)
134 
136 
137  /*
138  * DECLARATION
139  */
140 
142 
165  class Corpus {
166  // for convenience
168  using TextMap = Struct::TextMap;
170 
171  public:
172  using Sizes = std::vector<std::size_t>;
173  using Tokens = std::vector<std::string>;
174 
175  using ArticleFunc = std::function<bool(const Tokens&, std::size_t, std::size_t)>;
176  using SentenceFunc = std::function<void(Tokens::iterator, Tokens::iterator)>;
177 
179  = std::map<std::string, std::map<std::string, std::vector<Tokens>>>;
180  using SentenceMap = std::vector<std::pair<std::size_t, std::size_t>>;
181  using PositionLength = std::pair<std::size_t, std::size_t>;
183 
186 
187  explicit Corpus(bool consistencyChecks);
188  Corpus(std::vector<Corpus>& others, bool consistencyChecks, StatusSetter& statusSetter);
189 
193 
194  [[nodiscard]] std::string& getCorpus();
195  [[nodiscard]] const std::string& getcCorpus() const;
196 
197  [[nodiscard]] bool isTokenized() const;
198  [[nodiscard]] Tokens& getTokens();
199  [[nodiscard]] const Tokens& getcTokens() const;
200  [[nodiscard]] std::size_t getNumTokens() const;
201 
202  [[nodiscard]] bool hasArticleMap() const;
203  [[nodiscard]] TextMap& getArticleMap();
204  [[nodiscard]] const TextMap& getcArticleMap() const;
205 
206  [[nodiscard]] bool hasDateMap() const;
207  [[nodiscard]] TextMap& getDateMap();
208  [[nodiscard]] const TextMap& getcDateMap() const;
209 
210  [[nodiscard]] bool hasSentenceMap() const;
211  [[nodiscard]] SentenceMap& getSentenceMap();
212  [[nodiscard]] const SentenceMap& getcSentenceMap() const;
213 
214  [[nodiscard]] std::string get(std::size_t index) const;
215  [[nodiscard]] std::string get(const std::string& id) const;
216  [[nodiscard]] std::string getDate(const std::string& date) const;
217  [[nodiscard]] Tokens getTokenized(std::size_t index) const;
218  [[nodiscard]] Tokens getTokenized(const std::string& id) const;
219  [[nodiscard]] Tokens getDateTokenized(const std::string& date) const;
220  [[nodiscard]] std::vector<Tokens> getArticles() const;
221 
222  [[nodiscard]] std::size_t size() const;
223  [[nodiscard]] bool empty() const;
224 
225  [[nodiscard]] std::string substr(std::size_t from, std::size_t len);
226 
230 
231  void create(
232  Tokens& texts,
233  bool deleteInputData
234  );
235  void create(
236  Tokens& texts,
237  std::vector<std::string>& articleIds,
238  std::vector<std::string>& dateTimes,
239  bool deleteInputData
240  );
241  void combineContinuous(
242  Tokens& chunks,
243  std::vector<TextMap>& articleMaps,
244  std::vector<TextMap>& dateMaps,
245  bool deleteInputData
246  );
247  void combineTokenized(
248  Tokens& chunks,
249  Sizes& tokenNums,
250  std::vector<TextMap>& articleMaps,
251  std::vector<TextMap>& dateMaps,
252  std::vector<SentenceMap>& sentenceMaps,
253  bool deleteInputData
254  );
255 
259 
260  void copyContinuous(std::string& to) const;
261  void copyContinuous(
262  std::string& to,
263  TextMap& articleMapTo,
264  TextMap& dateMapTo
265  ) const;
267  std::size_t chunkSize,
268  Tokens& to,
269  std::vector<TextMap>& articleMapsTo,
270  std::vector<TextMap>& dateMapsTo
271  ) const;
272  void copyChunksTokenized(
273  std::size_t chunkSize,
274  Tokens& to,
275  Sizes& tokenNumsTo,
276  std::vector<TextMap>& articleMapsTo,
277  std::vector<TextMap>& dateMapsTo,
278  std::vector<SentenceMap>& sentenceMapsTo
279  ) const;
280 
284 
285  bool filterByDate(const std::string& from, const std::string& to);
286  std::size_t filterArticles(const ArticleFunc& callbackArticle, StatusSetter& statusSetter);
287 
291 
292  [[nodiscard]] bool tokenize(
293  const std::vector<std::uint16_t>& manipulators,
294  const std::vector<std::string>& models,
295  const std::vector<std::string>& dictionaries,
296  const std::vector<std::string>& languages,
297  std::uint64_t freeMemoryEvery,
298  StatusSetter& statusSetter
299  );
300  [[nodiscard]] bool tokenizeCustom(
301  const std::optional<SentenceFunc>& callback,
302  std::uint64_t freeMemoryEvery,
303  StatusSetter& statusSetter
304  );
305 
309 
310  void clear();
311 
313 
316 
317  protected:
320 
322  std::string corpus;
323 
326 
328  TextMap articleMap;
329 
331  TextMap dateMap;
332 
335 
337 
338  private:
339  // internal state
340  bool tokenized{false};
341  bool checkConsistency{false};
342  std::size_t tokenBytes{};
343 
344  // internal helper functions
345  void moveCombinedIn(DateArticleSentenceMap& from);
346 
347  void checkThatNotTokenized(std::string_view function) const;
348  void checkThatTokenized(std::string_view function) const;
349 
350  void addArticle(
351  std::string& text,
352  std::string& id,
353  std::string& dateTime,
354  TextMapEntry& dateMapEntry,
355  bool deleteInputData
356  );
357  void addChunk(
358  const std::string& content,
359  const std::optional<std::reference_wrapper<const TextMap>>& articles,
360  const std::optional<std::reference_wrapper<const TextMap>>& dates,
361  const SentenceMap& sentences,
362  bool& continueToken
363  );
364 
365  void check(std::string_view function) const;
366  void checkTokenized(std::string_view function) const;
367 
368  void addAsOneChunk(
369  std::size_t size,
370  Tokens& to,
371  Sizes& tokenNumsTo,
372  std::vector<TextMap>& articleMapsTo,
373  std::vector<TextMap>& dateMapsTo,
374  std::vector<SentenceMap>& sentenceMapsTo
375  ) const;
376 
377  void reTokenize();
378 
379  [[nodiscard]] bool tokenizeTokenized(
380  const std::optional<SentenceFunc>& callback,
381  StatusSetter& statusSetter
382  );
383  [[nodiscard]] bool tokenizeContinuous(
384  const std::optional<SentenceFunc>& callback,
385  std::uint64_t freeMemoryEvery,
386  StatusSetter& statusSetter
387  );
388 
389  // internal static helper functions
390  static bool combineCorpora(
391  std::vector<Corpus>& from,
393  StatusSetter& statusSetter
394  );
395 
396  [[nodiscard]] static Tokens getTokensForEntry(
397  const TextMap& map,
398  const std::string& id,
399  const Tokens& tokens
400  );
401 
402  [[nodiscard]] static std::size_t getValidLengthOfChunk(
403  const std::string& source,
404  std::size_t pos,
405  std::size_t maxLength,
406  std::size_t maxChunkSize
407  );
408  [[nodiscard]] static std::size_t getValidLengthOfChunk(
409  const std::string& chunkContent,
410  std::size_t maxChunkSize
411  );
412 
413  static void checkTokensForChunking(const Tokens& tokens);
414 
415  static void reserveChunks(
416  std::size_t chunks,
417  Tokens& to,
418  Sizes& tokenNumsTo,
419  std::vector<TextMap>& articleMapsTo,
420  std::vector<TextMap>& dateMapsTo,
421  std::vector<SentenceMap>& sentenceMapsTo,
422  bool hasArticleMap,
423  bool hasDateMap
424  );
425 
426  static void checkForEntry(
427  std::string_view type,
428  const SentenceMapEntry& sentence,
429  std::size_t& nextIndex,
430  const TextMap& map,
431  std::size_t chunkOffset,
432  TextMap& chunkMap,
433  bool checkConsistency
434  );
435 
436  static void finishChunk(
437  std::string& contentFrom,
438  SentenceMap& sentencesFrom,
439  Tokens& contentTo,
440  Sizes& tokenNumTo,
441  std::vector<SentenceMap>& sentencesTo,
442  std::size_t chunkTokens,
443  std::size_t& chunkOffset,
444  bool splitToken,
445  std::size_t nextChunkSize
446  );
447  static void splitEntry(
448  TextMap& map,
449  std::size_t token,
450  bool splitToken,
451  TextMapEntry& remainingTo
452  );
453  static void finishMap(
454  TextMap& from,
455  std::vector<TextMap>& to,
456  TextMapEntry& remaining
457  );
458 
459  static void notUsed(
460  std::string_view type,
461  const std::vector<std::string>& values,
462  std::size_t index
463  );
464 
465  static std::size_t bytes(const Tokens& tokens);
466 
467  static void addChunkMap(
468  const std::optional<std::reference_wrapper<const TextMap>>& from,
469  TextMap& to,
470  std::size_t offset,
471  bool splitToken
472  );
473 
474  static void checkMap(
475  std::string_view function,
476  std::string_view name,
477  const TextMap& map,
478  std::size_t end,
479  bool isTokenized,
480  bool isDateMap
481  );
482  static void checkMap(
483  std::string_view function,
484  const SentenceMap& map,
485  std::size_t end,
486  bool isTokenized
487  );
488 
489  static void skipEntriesBefore(
490  const TextMap& map,
491  std::size_t& entryIndex,
492  std::size_t& entryEnd,
493  std::size_t pos,
494  bool& inEntryTo
495  );
496 
497  static void removeEmpty(Tokens& from);
498  static void removeToken(TextMap& map, std::size_t entryIndex, bool& emptiedTo);
499  static void removeToken(SentenceMapEntry& entry, bool& emptiedTo);
500 
501  static std::size_t getFirstEnd(const TextMap& map);
502  static std::size_t getEntryEnd(const TextMap& map, std::size_t entryIndex);
503 
504  static void processSentence(
505  Tokens& sentence,
506  const std::optional<SentenceFunc>& callback,
507  bool inArticle,
508  bool inDate,
509  std::size_t& currentToken,
510  std::size_t& sentenceFirstToken,
511  TextMap& articleMap,
512  TextMap& dateMap,
513  SentenceMap& sentenceMap,
514  Tokens& tokens,
515  std::size_t& tokenBytes
516  );
517 
518  static bool addCorpus(
519  Corpus& from,
521  std::size_t number,
522  std::size_t total,
523  StatusSetter& statusSetter
524  );
525  static bool addSentences(
526  Corpus& from,
528  StatusSetter& statusSetter
529  );
530 
531  static void finishArticle(
532  std::vector<Tokens>& from,
534  const std::string& date,
535  const std::string& article
536  );
537  static void nextEntry(
538  const TextMap& map,
539  std::size_t index,
540  std::string& nameTo,
541  std::size_t& endTo,
542  std::size_t corpusEnd
543  );
544 
545  static bool pushSentence(
546  const SentenceMapEntry& sentence,
547  std::size_t chunkSize,
548  std::size_t chunkOffset,
549  std::size_t& chunkTokens,
550  std::string& chunkContent,
551  SentenceMap& chunkSentences,
552  const Tokens& tokens,
553  std::size_t& tokensComplete,
554  std::size_t& additionalBytes
555  );
556 
557  static std::string mergingStatus(std::size_t number, std::size_t total);
558 
559  static void locale(std::ostream& os);
560 
561  // internal static helper functions for exception throwing
562  static void exceptionGetNoArticleMap(
563  std::string_view function,
564  std::size_t article
565  );
566  static void exceptionArticleOutOfBounds(
567  std::string_view function,
568  std::size_t article,
569  std::size_t size
570  );
571  static void exceptionDateLength(
572  std::string_view function,
573  std::size_t length
574  );
575  static void exceptionArticleMapStart(
576  std::string_view function,
577  std::string_view expected,
578  std::size_t chunkIndex,
579  std::size_t numberOfChunks,
580  std::size_t start
581  );
582  static void exceptionLastSentenceLength(
583  std::size_t pos,
584  std::size_t length,
585  std::size_t corpusSize
586  );
587  static void exceptionArticleBehindDate(
588  std::size_t articlePos,
589  std::size_t datePos,
590  std::size_t dateEnd
591  );
592  static void exceptionChunkSize(std::size_t size, std::size_t chunkSize);
593  static void exceptionArticleMapEnd(std::size_t pos, std::size_t size);
594  static void exceptionUnexpectedBeforeSentence(
595  std::string_view type,
596  std::string_view name,
597  std::size_t pos,
598  std::size_t sentencePos
599  );
600  static void exceptionMismatchWithDate(
601  std::string_view type,
602  std::size_t pos,
603  std::size_t datePos
604  );
605  static void exceptionDateBehindLast(
606  std::string_view type,
607  std::size_t datePos,
608  std::size_t lastPos
609  );
610  static void exceptionSentenceBehind(
611  std::string_view function,
612  std::string_view type,
613  const std::pair<std::size_t, std::size_t>& sentence,
614  const TextMapEntry& entry,
615  const TextMap& map,
616  const TextMap::const_iterator& next,
617  const Tokens& tokens
618  );
619  static void exceptionTokenBytes(
620  std::string_view function,
621  std::size_t size,
622  std::size_t actualSize
623  );
624  static void exceptionInvalidMaxChunkSize(std::size_t size, std::size_t max);
625  static void exceptionPositionTooSmall(
626  std::size_t pos,
627  std::size_t expectedMin,
628  std::string_view name
629  );
630  static void exceptionInvalidPosition(
631  std::string_view function,
632  std::size_t pos,
633  std::size_t expected,
634  std::string_view name
635  );
636  static void exceptionInvalidDate(
637  std::string_view function,
638  std::string_view value,
639  std::string_view name
640  );
641  static void exceptionInvalidEnd(
642  std::string_view function,
643  std::size_t pos,
644  std::size_t expected,
645  std::string_view name
646  );
647 
648  /*
649  * INTERNAL HELPER FUNCTION TEMPLATES (private)
650  */
651 
652  // reserve memory for combined maps
653  template<class T> static void reserveCombined(
654  const std::vector<T>& vec,
655  T& combined
656  ) {
657  combined.reserve(
658  std::accumulate(
659  vec.cbegin(),
660  vec.cend(),
661  std::size_t{},
662  [](const auto& a, const auto& b) {
663  return a + b.size();
664  }
665  )
666  );
667  }
668 
669  // check whether current map entry begins at current position
670  template<typename T> [[nodiscard]] static bool entryBeginsAt(
671  const T& map,
672  size_t entryIndex,
673  size_t pos
674  ) {
675  return entryIndex < map.size() && TextMapEntry::pos(map.at(entryIndex)) == pos;
676  }
677 
678  // remove empty entries from map (checking all of their tokens)
679  template<typename T> static void removeEmptyEntries(
680  T& map,
681  const Tokens& tokens
682  ) {
683  map.erase(
684  std::remove_if(
685  map.begin(),
686  map.end(),
687  [&tokens](const auto& entry) {
688  const auto end{TextMapEntry::end(entry)};
689 
690  for(
691  auto tokenIndex{TextMapEntry::pos(entry)};
692  tokenIndex < end;
693  ++tokenIndex
694  ) {
695  if(!(tokens.at(tokenIndex).empty())) {
696  return false;
697  }
698  }
699 
700  return true;
701  }
702  ),
703  map.end()
704  );
705  }
706 
707  // remove empty entries from map (checking just their length)
708  template<typename T> static void removeEmptyEntries(T& map) {
709  map.erase(
710  std::remove_if(
711  map.begin(),
712  map.end(),
713  [](const auto& entry) {
714  return TextMapEntry::length(entry) == 0;
715  }
716  ),
717  map.end()
718  );
719  }
720 
721  // skip map entries before current token
	// skip map entries before current token
	/*
	 * Advances entryIndex past every entry that ends at or before pos,
	 *  as well as past zero-length entries. 'origin' always holds the
	 *  position and end of the entry at entryIndex, so the caller can
	 *  tell which entry (if any) the token at pos belongs to.
	 *
	 * NOTE(review): when pos == 0, map.at(0) is accessed — this appears
	 *  to assume a non-empty map for the first token; confirm with callers.
	 */
	template<typename T> static void skipEntriesBefore(
			const T& map,
			std::size_t& entryIndex,
			PositionLength& origin,
			std::size_t pos
	) {
		if(pos == 0) {
			/* first token: set origin to first map entry */
			origin.first = TextMapEntry::pos(map.at(0));
			origin.second = TextMapEntry::end(map.at(0));
		}

		// skip entries that end before (or at) the current position,
		//  and entries that cover no tokens at all
		while(
				entryIndex < map.size()
				&& (
						origin.second <= pos
						|| TextMapEntry::length(map[entryIndex]) == 0
				)
		) {
			++entryIndex;

			// keep origin in sync with the (new) current entry, if any
			if(entryIndex < map.size()) {
				origin.first = TextMapEntry::pos(map[entryIndex]);
				origin.second = TextMapEntry::end(map[entryIndex]);
			}
		}
	}
749 
750  // update the position of the current map entry
	// update the position of the current map entry
	/*
	 * Subtracts 'removed' from the position of the entry at entryIndex,
	 *  but only if that entry starts exactly at the current position
	 *  (pos == entryPos); otherwise the call is a no-op.
	 *
	 * Reports an error via Corpus::exceptionPositionTooSmall if the
	 *  entry's position is smaller than 'removed', i.e. the subtraction
	 *  would underflow; 'type' names the map for that error message.
	 */
	template<typename T> static void updatePosition(
			std::string_view type,
			T& map,
			std::size_t entryIndex,
			std::size_t entryPos,
			std::size_t pos,
			std::size_t removed
	) {
		if(
				entryIndex >= map.size() /* end of map reached */
				|| pos != entryPos /* entry not yet or already reached */
		) {
			return;
		}

		// consistency check: the new position must not become negative
		if(removed > TextMapEntry::pos(map.at(entryIndex))) {
			Corpus::exceptionPositionTooSmall(
					TextMapEntry::pos(map.at(entryIndex)),
					removed,
					type
			);
		}

		TextMapEntry::pos(map.at(entryIndex)) -= removed;
	}
776 
777  // decrease the length of the current entry
778  template<typename T> static void removeTokenFromLength(
779  T& map,
780  std::size_t entryIndex,
781  const PositionLength& origin,
782  std::size_t tokenIndex
783  ) {
784  if(
785  entryIndex < map.size()
786  && tokenIndex >= origin.first
787  && tokenIndex < origin.second
788  ) {
789  --(TextMapEntry::length(map[entryIndex]));
790  }
791  }
792  };
793 
794  /*
795  * IMPLEMENTATION
796  */
797 
798  /*
799  * CONSTRUCTION
800  */
801 
803 
807  inline Corpus::Corpus(bool consistencyChecks) : checkConsistency(consistencyChecks) {}
808 
810 
829  std::vector<Corpus>& others,
830  bool consistencyChecks,
831  StatusSetter& statusSetter
832  ) : checkConsistency(consistencyChecks) {
833  // check arguments
834  if(others.empty()) {
835  return;
836  }
837 
838  if(others.size() == 1) {
839  std::swap(*this, others[0]);
840 
841  return;
842  }
843 
844  // combine corpora
845  DateArticleSentenceMap combined;
846 
847  if(!Corpus::combineCorpora(others, combined, statusSetter)) {
848  return;
849  }
850 
851  // move resulting corpus into class
852  statusSetter.change("Preparing combined corpus...");
853 
854  this->moveCombinedIn(combined);
855  }
856 
857  /*
858  * GETTERS
859  */
860 
862 
	//! Gets a reference to the continuous text corpus.
	/*!
	 * \returns A reference to the continuous text corpus.
	 *
	 * Must not be called on a tokenized corpus
	 *  (enforced via checkThatNotTokenized).
	 */
	inline std::string& Corpus::getCorpus() {
		this->checkThatNotTokenized("getCorpus");

		return this->corpus;
	}
874 
876 
	//! Gets a constant reference to the continuous text corpus.
	/*!
	 * \returns A constant reference to the continuous text corpus.
	 *
	 * Must not be called on a tokenized corpus
	 *  (enforced via checkThatNotTokenized).
	 */
	inline const std::string& Corpus::getcCorpus() const {
		this->checkThatNotTokenized("getcCorpus");

		return this->corpus;
	}
888 
890 
	//! Gets whether the corpus has been tokenized.
	/*!
	 * \returns True if the corpus has been tokenized, false otherwise.
	 */
	inline bool Corpus::isTokenized() const {
		return this->tokenized;
	}
898 
900 
909  this->checkThatTokenized("getTokens");
910 
911  return this->tokens;
912  }
913 
915 
	//! Gets a constant reference to the tokens of a tokenized corpus.
	/*!
	 * \returns A constant reference to the vector of tokens.
	 *
	 * Must only be called on a tokenized corpus
	 *  (enforced via checkThatTokenized).
	 */
	inline const Corpus::Tokens& Corpus::getcTokens() const {
		this->checkThatTokenized("getcTokens");

		return this->tokens;
	}
928 
930 
	//! Gets the number of tokens in a tokenized corpus.
	/*!
	 * \returns The number of tokens in the corpus.
	 *
	 * Must only be called on a tokenized corpus
	 *  (enforced via checkThatTokenized).
	 */
	inline std::size_t Corpus::getNumTokens() const {
		this->checkThatTokenized("getNumTokens");

		return this->tokens.size();
	}
942 
944 
	//! Gets whether the corpus has an article map.
	/*!
	 * \returns True if the article map is not empty.
	 */
	inline bool Corpus::hasArticleMap() const {
		return !(this->articleMap.empty());
	}
952 
954 
959  return this->articleMap;
960  }
961 
963 
	//! Gets a constant reference to the article map of the corpus.
	/*!
	 * \returns A constant reference to the article map
	 *   (may be empty).
	 */
	inline const Struct::TextMap& Corpus::getcArticleMap() const {
		return this->articleMap;
	}
970 
972 
	//! Gets whether the corpus has a date map.
	/*!
	 * \returns True if the date map is not empty.
	 */
	inline bool Corpus::hasDateMap() const {
		return !(this->dateMap.empty());
	}
980 
982 
987  return this->dateMap;
988  }
989 
991 
	//! Gets a constant reference to the date map of the corpus.
	/*!
	 * \returns A constant reference to the date map
	 *   (may be empty).
	 */
	inline const Struct::TextMap& Corpus::getcDateMap() const {
		return this->dateMap;
	}
998 
1000 
	//! Gets whether the corpus has a sentence map.
	/*!
	 * \returns True if the sentence map is not empty.
	 */
	inline bool Corpus::hasSentenceMap() const {
		return !(this->sentenceMap.empty());
	}
1008 
1010 
1020  this->checkThatTokenized("getSentenceMap");
1021 
1022  return this->sentenceMap;
1023  }
1024 
1026 
1036  this->checkThatTokenized("getcSentenceMap");
1037 
1038  return this->sentenceMap;
1039  }
1040 
1042 
	//! Gets the article at the specified index from a continuous text corpus.
	/*!
	 * \param index Index of the article in the article map.
	 *
	 * \returns A copy of the article's text.
	 *
	 * Fails via exceptionGetNoArticleMap if the corpus has no article
	 *  map, via exceptionArticleOutOfBounds if the index is out of
	 *  bounds, and must not be called on a tokenized corpus
	 *  (enforced via checkThatNotTokenized).
	 */
	inline std::string Corpus::get(std::size_t index) const {
		this->checkThatNotTokenized("get");

		// an article map is required to locate articles
		if(this->articleMap.empty()) {
			Corpus::exceptionGetNoArticleMap(
					"get",
					index
			);
		}

		// check the index against the number of articles
		if(index >= this->articleMap.size()) {
			Corpus::exceptionArticleOutOfBounds(
					"get",
					index,
					this->articleMap.size()
			);
		}

		const auto& articleEntry{this->articleMap.at(index)};

		// copy the article's text from the continuous corpus
		return this->corpus.substr(
				TextMapEntry::pos(articleEntry),
				TextMapEntry::length(articleEntry)
		);
	}
1081 
1083 
1098  inline std::string Corpus::get(const std::string& id) const {
1099  this->checkThatNotTokenized("get");
1100 
1101  // check argument
1102  if(id.empty()) {
1103  throw Exception(
1104  "Corpus::get():"
1105  " No ID has been specified"
1106  );
1107  }
1108 
1109  const auto& articleEntry{
1110  std::find_if(
1111  this->articleMap.cbegin(),
1112  this->articleMap.cend(),
1113  [&id](const auto& entry) {
1114  return entry.value == id;
1115  }
1116  )
1117  };
1118 
1119  if(articleEntry == this->articleMap.cend()) {
1120  return std::string();
1121  }
1122 
1123  return this->corpus.substr(
1124  articleEntry->p,
1125  articleEntry->l
1126  );
1127  }
1128 
1130 
1145  inline std::string Corpus::getDate(const std::string& date) const {
1146  this->checkThatNotTokenized("getDate");
1147 
1148  // check argument
1149  if(date.length() != dateLength) {
1150  Corpus::exceptionDateLength(
1151  "getDate",
1152  date.length()
1153  );
1154  }
1155 
1156  const auto& dateEntry{
1157  std::find_if(
1158  this->dateMap.cbegin(),
1159  this->dateMap.cend(),
1160  [&date](const auto& entry) {
1161  return entry.value == date;
1162  }
1163  )
1164  };
1165 
1166  if(dateEntry == this->dateMap.cend()) {
1167  return std::string();
1168  }
1169 
1170  return this->corpus.substr(
1171  dateEntry->p,
1172  dateEntry->l
1173  );
1174  }
1175 
1177 
1192  inline Corpus::Tokens Corpus::getTokenized(std::size_t index) const {
1193  this->checkThatTokenized("getTokenized");
1194 
1195  if(this->articleMap.empty()) {
1196  Corpus::exceptionGetNoArticleMap(
1197  "getTokenized",
1198  index
1199  );
1200  }
1201 
1202  if(index >= this->articleMap.size()) {
1203  Corpus::exceptionArticleOutOfBounds(
1204  "getTokenized",
1205  index,
1206  this->articleMap.size()
1207  );
1208  }
1209 
1210  const auto& articleEntry{this->articleMap.at(index)};
1211  const auto articleEnd{TextMapEntry::end(articleEntry)};
1212 
1213  Tokens copy;
1214 
1215  copy.reserve(TextMapEntry::length(articleEntry));
1216 
1217  for(auto tokenIndex{TextMapEntry::pos(articleEntry)}; tokenIndex < articleEnd; ++tokenIndex) {
1218  copy.emplace_back(this->tokens.at(tokenIndex));
1219  }
1220 
1221  return copy;
1222  }
1223 
1225 
	//! Gets the tokens of the article with the specified ID from a tokenized corpus.
	/*!
	 * \param id ID of the article to retrieve; must not be empty.
	 *
	 * \returns Copies of the article's tokens; behavior for unknown
	 *   IDs is delegated to getTokensForEntry.
	 *
	 * \throws Exception if no ID has been specified.
	 */
	inline Corpus::Tokens Corpus::getTokenized(const std::string& id) const {
		this->checkThatTokenized("getTokenized");

		// check argument
		if(id.empty()) {
			throw Exception(
					"Corpus::getTokenized():"
					" No ID has been specified"
			);
		}

		return Corpus::getTokensForEntry(this->articleMap, id, this->tokens);
	}
1254 
1256 
	//! Gets the tokens of all articles belonging to the specified date from a tokenized corpus.
	/*!
	 * \param date The date whose tokens to retrieve; must have exactly
	 *   dateLength characters (i.e. YYYY-MM-DD format).
	 *
	 * \returns Copies of the tokens for the given date; behavior for
	 *   unknown dates is delegated to getTokensForEntry.
	 *
	 * Fails via exceptionDateLength if the given date has the wrong length.
	 */
	inline Corpus::Tokens Corpus::getDateTokenized(const std::string& date) const {
		this->checkThatTokenized("getDateTokenized");

		// check argument
		if(date.length() != dateLength) {
			Corpus::exceptionDateLength(
					"getDateTokenized",
					date.length()
			);
		}

		return Corpus::getTokensForEntry(this->dateMap, date, this->tokens);
	}
1285 
1287 
1295  inline std::vector<Corpus::Tokens> Corpus::getArticles() const {
1296  this->checkThatTokenized("getArticles");
1297 
1298  std::vector<Tokens> copy;
1299 
1300  copy.reserve(this->articleMap.size());
1301 
1302  for(const auto& article : this->articleMap) {
1303  copy.emplace_back();
1304 
1305  copy.back().reserve(TextMapEntry::length(article));
1306 
1307  const auto articleEnd{TextMapEntry::end(article)};
1308 
1309  for(auto tokenIndex{TextMapEntry::pos(article)}; tokenIndex < articleEnd; ++tokenIndex) {
1310  copy.back().emplace_back(this->tokens.at(tokenIndex));
1311  }
1312  }
1313 
1314  return copy;
1315  }
1316 
1318 
	//! Gets the size of the corpus, in bytes.
	/*!
	 * \returns The sum of all token bytes if the corpus has been
	 *   tokenized, the length of the continuous corpus otherwise.
	 */
	inline std::size_t Corpus::size() const {
		return this->tokenized ? this->tokenBytes : this->corpus.size();
	}
1327 
1329 
1337  inline bool Corpus::empty() const {
1338  if(this->tokenized) {
1339  return this->tokens.empty();
1340  }
1341 
1342  return this->corpus.empty();
1343  }
1344 
1346 
	//! Gets a substring of the continuous text corpus.
	/*!
	 * \param from Starting position of the substring.
	 * \param len Length of the substring.
	 *
	 * \returns A copy of the specified part of the corpus.
	 *
	 * Must not be called on a tokenized corpus
	 *  (enforced via checkThatNotTokenized).
	 *
	 * NOTE(review): does not modify the corpus, yet is non-const —
	 *  the in-class declaration would need to change as well to fix this.
	 */
	inline std::string Corpus::substr(std::size_t from, std::size_t len) {
		this->checkThatNotTokenized("substr");

		return this->corpus.substr(from, len);
	}
1368 
1369  /*
1370  * CREATION
1371  */
1372 
1374 
	//! Creates a continuous text corpus by concatenating the given texts.
	/*!
	 * The texts are joined with a single space between them; any
	 *  previous corpus content is discarded first.
	 *
	 * \param texts The texts to concatenate.
	 * \param deleteInputData If true, the input texts are freed
	 *   while processing to save memory.
	 */
	inline void Corpus::create(
			Tokens& texts,
			bool deleteInputData
	) {
		// clear old corpus
		this->clear();

		// concatenate texts
		for(auto& text : texts) {
			// add text to corpus
			this->corpus += text;

			Helper::Memory::freeIf(deleteInputData, text);

			// add space at the end of the corpus
			this->corpus.push_back(' ');
		}

		Helper::Memory::freeIf(deleteInputData, texts);

		// remove last space, if necessary
		if(!(this->corpus.empty())) {
			this->corpus.pop_back();
		}
	}
1411 
1413 
	//! Creates a continuous text corpus with article and date maps.
	/*!
	 * Delegates to create(Tokens&, bool) when neither article IDs nor
	 *  dates are given; otherwise each text is added via addArticle(),
	 *  which builds the article and date maps alongside the corpus.
	 *
	 * \param texts The texts to concatenate.
	 * \param articleIds Article ID for each text; may be shorter than
	 *   texts, in which case missing entries are treated as empty.
	 * \param dateTimes Date/time for each text; may be shorter than
	 *   texts, in which case missing entries are treated as empty.
	 * \param deleteInputData If true, the input data is freed
	 *   while processing to save memory.
	 *
	 * NOTE: Input data needs to be sorted by date (see file header).
	 */
	inline void Corpus::create(
			Tokens& texts,
			std::vector<std::string>& articleIds,
			std::vector<std::string>& dateTimes,
			bool deleteInputData
	) {
		// check arguments
		if(articleIds.empty() && dateTimes.empty()) {
			this->create(texts, deleteInputData);

			return;
		}

		// clear old corpus
		this->clear();

		std::string emptyString;
		TextMapEntry dateMapEntry;

		// add each text together with its ID and date, if available
		for(std::size_t n{}; n < texts.size(); ++n) {
			this->addArticle(
					texts[n],
					articleIds.size() > n ? articleIds[n] : emptyString,
					dateTimes.size() > n ? dateTimes[n] : emptyString,
					dateMapEntry,
					deleteInputData
			);
		}

		Helper::Memory::freeIf(deleteInputData, texts);
		Helper::Memory::freeIf(deleteInputData, articleIds);
		Helper::Memory::freeIf(deleteInputData, dateTimes);

		// remove last space, if necessary
		if(!(this->corpus.empty())) {
			this->corpus.pop_back();
		}

		// conclude last date, if unfinished
		if(!dateMapEntry.value.empty()) {
			this->dateMap.emplace_back(dateMapEntry);
		}
	}
1475 
1500  Tokens& chunks,
1501  std::vector<TextMap>& articleMaps,
1502  std::vector<TextMap>& dateMaps,
1503  bool deleteInputData
1504  ) {
1505  // clear old corpus
1506  this->clear();
1507 
1508  // reserve memory
1509  Corpus::reserveCombined(chunks, this->corpus);
1510  Corpus::reserveCombined(articleMaps, this->articleMap);
1511  Corpus::reserveCombined(dateMaps, this->dateMap);
1512 
1513  // add chunks
1514  for(auto chunkIt = chunks.begin(); chunkIt != chunks.end(); ++chunkIt) {
1515  const auto chunkIndex{
1516  static_cast<std::size_t>(
1517  chunkIt
1518  - chunks.begin()
1519  )
1520  };
1521 
1522  // save current position in new corpus
1523  const auto pos{this->corpus.size()};
1524 
1525  // add text of chunk to corpus
1526  this->corpus += *chunkIt;
1527 
1528  Helper::Memory::freeIf(deleteInputData, *chunkIt);
1529 
1530  bool beginsWithNewArticle{false};
1531 
1532  if(articleMaps.size() > chunkIndex) {
1533  // add article map
1534  auto& map{articleMaps[chunkIndex]};
1535 
1536  if(!map.empty()) {
1537  const auto& first{map[0]};
1538 
1539  // perform consistency check, if necessary
1540  if(this->checkConsistency && TextMapEntry::pos(first) > 1) {
1541  Corpus::exceptionArticleMapStart(
1542  "combineContinuous",
1543  "#0 or #1",
1544  chunkIndex,
1545  chunks.size(),
1547  );
1548  }
1549 
1550  auto it{map.cbegin()};
1551 
1552  // compare first new article ID with last one
1553  if(
1554  !(this->articleMap.empty())
1555  && this->articleMap.back().value == first.value
1556  ) {
1557  // append current article to last one
1559 
1560  ++it;
1561  }
1562  else {
1563  beginsWithNewArticle = true;
1564  }
1565 
1566  // add remaining articles to map
1567  for(; it != map.cend(); ++it) {
1568  this->articleMap.emplace_back(
1569  pos + it->p,
1570  it->l,
1571  it->value
1572  );
1573  }
1574 
1575  Helper::Memory::freeIf(deleteInputData, map);
1576  }
1577  }
1578 
1579  if(dateMaps.size() > chunkIndex) {
1580  // add date map
1581  auto& map{dateMaps[chunkIndex]};
1582 
1583  if(!map.empty()) {
1584  const auto& first{map[0]};
1585  auto it{map.cbegin()};
1586 
1587  // compare first new date with last one
1588  if(
1589  !(this->dateMap.empty())
1590  && this->dateMap.back().value == first.value
1591  ) {
1592  // append current date to last one
1594 
1595  // add space between articles if chunk begins with new article and date has been extended
1596  if(beginsWithNewArticle) {
1597  ++TextMapEntry::length(this->dateMap.back());
1598  }
1599 
1600  ++it;
1601  }
1602 
1603  // add remaining dates to map
1604  for(; it != map.cend(); ++it) {
1605  this->dateMap.emplace_back(
1606  pos + it->p,
1607  it->l,
1608  it->value
1609  );
1610  }
1611 
1612  Helper::Memory::freeIf(deleteInputData, map);
1613  }
1614  }
1615  }
1616 
1617  Helper::Memory::freeIf(deleteInputData, chunks);
1618  Helper::Memory::freeIf(deleteInputData, articleMaps);
1619  Helper::Memory::freeIf(deleteInputData, dateMaps);
1620  }
1621 
1623 
			Tokens& chunks,                         /* tokenized contents of the chunks to combine */
			Sizes& tokenNums,                       /* number of tokens in each chunk */
			std::vector<TextMap>& articleMaps,      /* article map of each chunk (may be fewer than chunks) */
			std::vector<TextMap>& dateMaps,         /* date map of each chunk (may be fewer than chunks) */
			std::vector<SentenceMap>& sentenceMaps, /* sentence map of each chunk */
			bool deleteInputData                    /* free input data after use to save memory */
	) {
		// clear old corpus
		this->clear();

		// check arguments: none of the auxiliary vectors may be longer than the chunk vector
		if(
				this->checkConsistency
				&& (
						tokenNums.size() > chunks.size()
						|| articleMaps.size() > chunks.size()
						|| dateMaps.size() > chunks.size()
						|| sentenceMaps.size() > chunks.size()
				)
		) {
			throw Exception(
					"Corpus::combineTokenized():"
					" More token counts, article maps, date maps,"
					" and/or sentence maps than corpus chunks"
			);
		}

		// an empty chunk list results in an empty corpus
		if(chunks.empty()) {
			return;
		}

		// a non-empty tokenized corpus always requires sentence maps
		if(sentenceMaps.empty()) {
			throw Exception(
					"Corpus::combineTokenized():"
					" No sentence maps for non-empty corpus"
			);
		}

		// reserve memory (total token count = sum over all chunks)
		const auto totalTokens{
			std::accumulate(
					tokenNums.cbegin(),
					tokenNums.cend(),
					std::size_t{}
			)
		};

		Helper::Memory::freeIf(deleteInputData, tokenNums);

		this->tokens.reserve(totalTokens);

		Corpus::reserveCombined(articleMaps, this->articleMap);
		Corpus::reserveCombined(dateMaps, this->dateMap);
		Corpus::reserveCombined(sentenceMaps, this->sentenceMap);

		// add tokens from chunks
		//  splitToken carries over whether the last token of the previous
		//  chunk continues into the next chunk (updated by addChunk)
		std::size_t chunkIndex{};
		bool splitToken{false};

		for(auto& chunk : chunks) {
			this->addChunk(
					chunk,
					(chunkIndex < articleMaps.size()) ?
							std::optional<std::reference_wrapper<const TextMap>>{articleMaps[chunkIndex]}
							: std::nullopt,
					(chunkIndex < dateMaps.size()) ?
							std::optional<std::reference_wrapper<const TextMap>>{dateMaps[chunkIndex]}
							: std::nullopt,
					sentenceMaps.at(chunkIndex),
					splitToken
			);

			// free per-chunk input data as soon as it has been consumed
			Helper::Memory::freeIf(deleteInputData, chunk);
			Helper::Memory::freeIf(deleteInputData, sentenceMaps.at(chunkIndex));

			if(chunkIndex < articleMaps.size()) {
				Helper::Memory::freeIf(deleteInputData, articleMaps.at(chunkIndex));
			}

			if(chunkIndex < dateMaps.size()) {
				Helper::Memory::freeIf(deleteInputData, dateMaps.at(chunkIndex));
			}

			++chunkIndex;
		}

		if(this->sentenceMap.empty()) {
			throw Exception(
					"Corpus::combineTokenized():"
					" Empty sentence map for non-empty corpus"
			);
		}

		// the last sentence must not extend past the end of the token container
		if(
				this->checkConsistency
				&& TextMapEntry::end(this->sentenceMap.back()) > this->tokens.size()
		) {
			Corpus::exceptionLastSentenceLength(
					TextMapEntry::pos(this->sentenceMap.back()),
					TextMapEntry::length(this->sentenceMap.back()),
					this->tokens.size()
			);
		}

		// free the (now emptied) outer containers as well, if requested
		Helper::Memory::freeIf(deleteInputData, chunks);
		Helper::Memory::freeIf(deleteInputData, articleMaps);
		Helper::Memory::freeIf(deleteInputData, dateMaps);
		Helper::Memory::freeIf(deleteInputData, sentenceMaps);

		this->tokenized = true;

		// if necessary, check the consistency of the newly combined corpus
		if(this->checkConsistency) {
			this->check("combineTokenized");
		}
	}
1779 
1780  /*
1781  * COPYING
1782  */
1783 
1785 
1792  inline void Corpus::copyContinuous(std::string& to) const {
1793  this->checkThatNotTokenized("copyContinuous");
1794 
1795  to = this->corpus;
1796  }
1797 
1799 
			std::string& to,        /* out: receives a copy of the continuous corpus */
			TextMap& articleMapTo,  /* out: receives a copy of the article map */
			TextMap& dateMapTo      /* out: receives a copy of the date map */
	) const {
		// the raw text is only available while the corpus is still continuous
		this->checkThatNotTokenized("copyContinuous");

		to = this->corpus;
		articleMapTo = this->articleMap;
		dateMapTo = this->dateMap;
	}
1823 
1825 
			std::size_t chunkSize,                /* maximum size of one chunk, in bytes */
			Tokens& to,                           /* out: receives the chunk contents */
			std::vector<TextMap>& articleMapsTo,  /* out: receives one article map per chunk */
			std::vector<TextMap>& dateMapsTo      /* out: receives one date map per chunk */
	) const {
		// check corpus and argument
		if(this->corpus.empty()) {
			return;
		}

		if(chunkSize == 0) {
			throw Exception(
					"Corpus::copyChunksContinuous():"
					" Invalid chunk size (zero)"
					" for a non-empty corpus"
			);
		}

		this->checkThatNotTokenized("copyChunksContinuous");

		// check whether slicing is necessary
		if(this->corpus.size() <= chunkSize) {
			// whole corpus fits into a single chunk
			to.emplace_back(this->corpus);
			articleMapsTo.emplace_back(this->articleMap);
			dateMapsTo.emplace_back(this->dateMap);

			return;
		}

		// reserve memory for chunks (round up the chunk count)
		const auto chunks{
			this->corpus.size() / chunkSize
			+ (this->corpus.size() % chunkSize > 0 ? 1 : 0)
		};

		to.reserve(to.size() + chunks);

		if(!(this->articleMap.empty())) {
			articleMapsTo.reserve(articleMapsTo.size() + chunks);
		}

		if(!(this->dateMap.empty())) {
			dateMapsTo.reserve(dateMapsTo.size() + chunks);
		}

		// slice corpus into chunks
		//  noSpace == true means the separating space could not be appended
		//  to the current chunk and must start the next chunk instead
		bool noSpace{false};

		if(this->articleMap.empty()) {
			// no article part: simply add parts of the corpus
			//  (only cut at UTF-8 character boundaries)
			std::size_t pos{};

			while(pos < this->corpus.size()) {
				to.emplace_back(
						this->corpus,
						pos,
						Corpus::getValidLengthOfChunk(
								this->corpus,
								pos,
								chunkSize,
								chunkSize
						)
				);

				pos += to.back().size();
			}

			noSpace = true;
		}
		else {
			std::size_t corpusPos{};   /* current byte position in the corpus */
			std::size_t articlePos{};  /* bytes of the current article already copied */
			auto articleIt{this->articleMap.cbegin()};
			auto dateIt{this->dateMap.cbegin()};

			while(corpusPos < this->corpus.size()) { /* loop for chunks */
				// create chunk
				TextMap chunkArticleMap;
				TextMap chunkDateMap;
				std::string chunk;

				// add space if necessary
				if(noSpace) {
					chunk.push_back(' ');

					++corpusPos;

					noSpace = false;
				}

				// loop through multiple articles inside one chunk
				for(; articleIt != this->articleMap.cend(); ++articleIt) {
					if(dateIt != this->dateMap.cend()) {
						// check date of the article
						//  (advance to the next date when a new article starts behind the current date)
						if(
								articlePos == 0
								&& articleIt->p > TextMapEntry::end(*dateIt)
						) {
							++dateIt;
						}

						if(
								this->checkConsistency
								&&
								articleIt->p
								> dateIt->p
								+ dateIt->l
						) {
							Corpus::exceptionArticleBehindDate(
									articleIt->p,
									dateIt->p,
									TextMapEntry::end(*dateIt)
							);
						}
					}

					// get remaining article length
					const auto remaining{articleIt->l - articlePos};

					if(chunk.size() + remaining <= chunkSize) {
						if(remaining > 0) {
							// add the remainder of the article to the chunk
							chunkArticleMap.emplace_back(chunk.size(), remaining, articleIt->value);

							if(dateIt != this->dateMap.cend()) {
								if(!chunkDateMap.empty() && chunkDateMap.back().value == dateIt->value) {
									/* including space before article */
									TextMapEntry::length(chunkDateMap.back()) += remaining + 1;
								}
								else if(corpusPos >= dateIt->p) {
									chunkDateMap.emplace_back(chunk.size(), remaining, dateIt->value);
								}
							}

							chunk.append(this->corpus, corpusPos, remaining);

							// update position in corpus
							corpusPos += remaining;
						}

						// reset position in (next) article
						articlePos = 0;

						if(chunk.size() < chunkSize) {
							// add space to the end of the article
							chunk.push_back(' ');

							++corpusPos;

							// check for end of chunk
							if(chunk.size() == chunkSize) {
								// start next chunk with next article
								++articleIt;

								break; /* chunk is full */
							}
						}
						else {
							// add space to the beginning of the next chunk instead
							noSpace = true;

							// start next chunk with next article
							++articleIt;

							break; /* chunk is full */
						}
					}
					else {
						// fill the remainder of the chunk with part of the article
						auto fill{chunkSize - chunk.size()};

						if(fill == 0) {
							break; /* chunk is full */
						}

						// check the end for valid UTF-8
						fill = Corpus::getValidLengthOfChunk(
								this->corpus,
								corpusPos,
								fill,
								chunkSize
						);

						if(fill == 0) {
							break; /* not enough space in chunk for last (UTF-8) character */
						}

						chunkArticleMap.emplace_back(chunk.size(), fill, articleIt->value);

						if(dateIt != this->dateMap.cend()) {
							if(!chunkDateMap.empty() && chunkDateMap.back().value == dateIt->value) {
								/* including space before the article */
								TextMapEntry::length(chunkDateMap.back()) += fill + 1;
							}
							else if(corpusPos >= dateIt->p) {
								chunkDateMap.emplace_back(chunk.size(), fill, dateIt->value);
							}
						}

						chunk.append(this->corpus, corpusPos, fill);

						// update positions
						corpusPos += fill;
						articlePos += fill;

						break; /* chunk is full */
					}
				}

				// consistency checks
				if(this->checkConsistency) {
					if(chunk.size() > chunkSize) {
						Corpus::exceptionChunkSize(chunk.size(), chunkSize);
					}

					if(articleIt == this->articleMap.cend() && corpusPos < this->corpus.size()) {
						Corpus::exceptionArticleMapEnd(corpusPos, this->corpus.size());
					}
				}

				// check for empty chunk (should not happen)
				if(chunk.empty()) {
					break;
				}

				// add current chunk
				to.emplace_back(chunk);
				articleMapsTo.emplace_back(chunkArticleMap);
				dateMapsTo.emplace_back(chunkDateMap);
			}
		}

		if(!(this->articleMap.empty()) && !to.empty()) {
			// consistency check
			if(this->checkConsistency && to.back().empty()) {
				throw Exception(
						"Corpus::copyChunksContinuous():"
						" The final chunk is empty"
				);
			}

			// remove last space
			if(!noSpace) {
				to.back().pop_back();
			}

			// remove last chunk if it is empty
			if(to.back().empty()) {
				to.pop_back();
			}

			// consistency check
			//  NOTE(review): if the pop_back() above removed the only chunk,
			//   to.back() here would be undefined — presumably a sliced corpus
			//   always produces more than one chunk; confirm.
			if(this->checkConsistency && to.back().empty()) {
				throw Exception(
						"Corpus::copyChunksContinuous():"
						" The final chunk is empty"
				);
			}
		}
	}
2130 
2132 
			std::size_t chunkSize,                    /* maximum size of one chunk, in bytes */
			Tokens& to,                               /* out: receives the chunk contents */
			Sizes& tokenNumsTo,                       /* out: receives the token count of each chunk */
			std::vector<TextMap>& articleMapsTo,      /* out: receives one article map per chunk */
			std::vector<TextMap>& dateMapsTo,         /* out: receives one date map per chunk */
			std::vector<SentenceMap>& sentenceMapsTo  /* out: receives one sentence map per chunk */
	) const {
		// check corpus and argument
		if(this->tokens.empty()) {
			return;
		}

		if(chunkSize == 0) {
			throw Exception(
					"Corpus::copyChunksTokenized():"
					" Invalid chunk size (zero)"
					" for a non-empty corpus"
			);
		}

		if(this->sentenceMap.empty()) {
			throw Exception(
					"Corpus::copyChunksTokenized():"
					" Empty sentence map"
					" for a non-empty corpus"
			);
		}

		this->checkThatTokenized("copyChunksTokenized");

		Corpus::checkTokensForChunking(this->tokens);

		// check whether slicing is necessary
		//  (stored size = token bytes plus one newline separator per token)
		const auto size{
			this->tokenBytes
			+ this->tokens.size() /* include newline after each token */
			- 1 /* (except the last one) */
		};

		if(size < chunkSize) {
			this->addAsOneChunk(
					size,
					to,
					tokenNumsTo,
					articleMapsTo,
					dateMapsTo,
					sentenceMapsTo
			);

			/* added whole corpus as one chunk */
			return;
		}

		// reserve memory for chunks
		const auto sizeOfLastChunk{size % chunkSize};
		const auto numberOfChunks{size / chunkSize + (sizeOfLastChunk > 0 ? 1 : 0)};

		Corpus::reserveChunks(
				numberOfChunks,
				to,
				tokenNumsTo,
				articleMapsTo,
				dateMapsTo,
				sentenceMapsTo,
				!(this->articleMap.empty()),
				!(this->dateMap.empty())
		);

		// fill chunk with content, sentences, dates, and articles
		std::size_t chunkOffset{};      /* token offset of the current chunk within the corpus */
		std::size_t chunkTokens{};      /* number of tokens in the current chunk */
		std::string chunkContent;       /* (partial) text content of the current chunk */
		SentenceMap chunkSentences;
		TextMap chunkDates;
		TextMap chunkArticles;
		std::size_t nextDate{};         /* index of the next date map entry to process */
		std::size_t nextArticle{};      /* index of the next article map entry to process */
		std::size_t tokensComplete{};   /* number of tokens fully written so far */
		std::size_t additionalBytes{};  /* bytes of a token split across a chunk boundary */
		TextMapEntry remainingDate;     /* part of a date entry carried over to the next chunk */
		TextMapEntry remainingArticle;  /* part of an article entry carried over to the next chunk */

		chunkContent.reserve(chunkSize);

		for(const auto& sentence : this->sentenceMap) {
			// open date/article entries that begin with this sentence
			Corpus::checkForEntry(
					"date",
					sentence,
					nextDate,
					this->dateMap,
					chunkOffset,
					chunkDates,
					this->checkConsistency
			);

			Corpus::checkForEntry(
					"article",
					sentence,
					nextArticle,
					this->articleMap,
					chunkOffset,
					chunkArticles,
					this->checkConsistency
			);

			// pushSentence() returns true while the chunk is full and the
			//  sentence (or a split token of it) still has content left
			while(
					Corpus::pushSentence(
							sentence,
							chunkSize,
							chunkOffset,
							chunkTokens,
							chunkContent,
							chunkSentences,
							this->tokens,
							tokensComplete,
							additionalBytes
					)
			) {
				const bool splitToken{additionalBytes > 0};

				Corpus::finishChunk(
						chunkContent,
						chunkSentences,
						to,
						tokenNumsTo,
						sentenceMapsTo,
						chunkTokens,
						chunkOffset,
						splitToken,
						(sizeOfLastChunk == 0 || to.size() < (numberOfChunks - 1)) ?
								chunkSize : (sizeOfLastChunk + 1)
				);

				// carry over map entries that extend past the chunk boundary
				Corpus::splitEntry(chunkDates, chunkTokens, splitToken, remainingDate);
				Corpus::splitEntry(chunkArticles, chunkTokens, splitToken, remainingArticle);

				Corpus::finishMap(chunkDates, dateMapsTo, remainingDate);
				Corpus::finishMap(chunkArticles, articleMapsTo, remainingArticle);

				// reset number of tokens in chunk
				chunkTokens = 0;
			}
		}

		// finish last chunk
		Corpus::finishChunk(
				chunkContent,
				chunkSentences,
				to,
				tokenNumsTo,
				sentenceMapsTo,
				chunkTokens,
				chunkOffset,
				false,
				0
		);

		Corpus::finishMap(chunkDates, dateMapsTo, remainingDate);
		Corpus::finishMap(chunkArticles, articleMapsTo, remainingArticle);

		// remove last newline
		if(!to.empty()) {
			to.back().pop_back();

			// drop the last chunk (and its maps) entirely if it became empty
			if(to.back().empty()) {
				to.pop_back();
				tokenNumsTo.pop_back();
				sentenceMapsTo.pop_back();

				if(!articleMapsTo.empty()) {
					articleMapsTo.pop_back();
				}

				if(!dateMapsTo.empty()) {
					dateMapsTo.pop_back();
				}
			}
		}
	}
2374 
2375  /*
2376  * FILTERING
2377  */
2378 
2380 
	// Filters the corpus by the given ISO date range [from, to]; an empty
	//  bound is open-ended. Works on both the continuous and the tokenized
	//  corpus. Returns false if nothing needed to be done (both bounds
	//  empty, or the corpus is empty), true if the corpus has been changed.
	inline bool Corpus::filterByDate(const std::string& from, const std::string& to) {
		// check arguments: no bounds means nothing to filter
		if(from.empty() && to.empty()) {
			return false;
		}

		// check corpus: nothing to filter if it contains no data
		if(this->tokenized) {
			if(this->tokens.empty()) {
				return false;
			}
		}
		else if(this->corpus.empty()) {
			return false;
		}

		if(this->dateMap.empty()) {
			// no date map -> empty result
			this->clear();

			return true;
		}

		// find first date in range
		auto begin{this->dateMap.cbegin()};

		for(; begin != this->dateMap.cend(); ++begin) {
			if(Helper::DateTime::isISODateInRange(begin->value, from, to)) {
				break;
			}
		}

		if(begin == this->dateMap.cend()) {
			// no date in range -> empty result
			this->clear();

			return true;
		}

		// find first date not in range anymore
		//  (relies on the date map being sorted, see note at the top of this file)
		auto end{begin};

		++end; /* current date is in range as has already been checked */

		for(; end != this->dateMap.cend(); ++end) {
			if(!Helper::DateTime::isISODateInRange(end->value, from, to)) {
				break;
			}
		}

		// trim date map
		if(begin != this->dateMap.cbegin()) {
			// create trimmed date map and swap it with the existing one
			TextMap(begin, end).swap(this->dateMap);
		}
		else {
			// only remove trailing dates
			this->dateMap.resize(std::distance(this->dateMap.cbegin(), end));
		}

		// save offset to be subtracted from all positions, (old) position of the last date and new total length of the corpus
		const auto offset{TextMapEntry::pos(this->dateMap.front())};
		const auto len{TextMapEntry::end(this->dateMap.back()) - offset};

		// trim corpus
		if(this->tokenized) {
			// (and calculate new size, in bytes)
			std::size_t deleteBytes{};

			const auto deleteTo{this->tokens.begin() + offset};

			if(deleteTo != this->tokens.begin()) {
				// sum up and remove the tokens before the first date in range
				deleteBytes = std::accumulate(
						this->tokens.begin(),
						deleteTo,
						std::size_t{},
						[](const auto& a, const auto& b) {
							return a + b.size();
						}
				);

				this->tokens.erase(this->tokens.begin(), deleteTo);
			}

			const auto deleteFrom{this->tokens.begin() + len};

			if(deleteFrom != this->tokens.end()) {
				// sum up and remove the tokens after the last date in range
				deleteBytes += std::accumulate(
						deleteFrom,
						this->tokens.end(),
						std::size_t{},
						[](const auto& a, const auto& b) {
							return a + b.size();
						}
				);

				this->tokens.erase(deleteFrom, this->tokens.end());
			}

			if(deleteBytes > 0) {
				this->tokens.shrink_to_fit();

				this->tokenBytes -= deleteBytes;
			}
		}
		else {
			// replace the current corpus with the trimmed one, and release memory
			if(offset > 0) {
				this->corpus.erase(0, offset);
			}

			this->corpus.resize(len);
			this->corpus.shrink_to_fit();
		}

		// find first article in range
		//  (re-uses the date map iterator, as both maps share the same type)
		begin = this->articleMap.cbegin();

		for(; begin != this->articleMap.cend(); ++begin) {
			if(begin->p == offset) {
				break;
			}

			// consistency check
			if(this->checkConsistency && begin->p > offset) {
				Corpus::exceptionMismatchWithDate(
						"article",
						begin->p,
						offset
				);
			}
		}

		// consistency check
		//  NOTE(review): if the article map were empty, articleMap.back()
		//   below would be undefined — presumably a non-empty date map
		//   implies a non-empty article map; confirm.
		if(this->checkConsistency && begin == this->articleMap.cend()) {
			Corpus::exceptionDateBehindLast(
					"article",
					offset,
					TextMapEntry::pos(this->articleMap.back())
			);
		}

		// find first article not in range anymore
		end = begin;

		++end; /* current article is in range as has already been checked */

		for(; end != this->articleMap.cend(); ++end) {
			if(end->p >= offset + len) {
				break;
			}
		}

		// trim article map
		if(begin != this->articleMap.cbegin()) {
			// create trimmed article map and swap it with the the existing one
			TextMap(begin, end).swap(this->articleMap);
		}
		else {
			// only remove trailing articles
			this->articleMap.resize(
					std::distance(this->articleMap.cbegin(), end)
			);
		}

		if(this->tokenized) {
			// find first sentence in range
			auto smBegin = this->sentenceMap.cbegin();

			for(; smBegin != this->sentenceMap.cend(); ++smBegin) {
				if(smBegin->first == offset) {
					break;
				}

				// consistency check
				if(this->checkConsistency && smBegin->first > offset) {
					Corpus::exceptionMismatchWithDate(
							"sentence",
							smBegin->first,
							offset
					);
				}
			}

			// consistency check
			if(this->checkConsistency && smBegin == this->sentenceMap.cend()) {
				Corpus::exceptionDateBehindLast(
						"sentence",
						offset,
						TextMapEntry::pos(this->sentenceMap.back())
				);
			}

			// find first sentence not in range anymore
			auto smEnd = smBegin;

			++smEnd; /* current sentence is in range as has already been checked */

			for(; smEnd != this->sentenceMap.cend(); ++smEnd) {
				if(smEnd->first >= offset + len) {
					break;
				}
			}

			// trim sentence map
			if(smBegin != this->sentenceMap.cbegin()) {
				// create trimmed sentence map and swap it with the the existing one
				SentenceMap(smBegin, smEnd).swap(this->sentenceMap);
			}
			else {
				// only remove trailing sentences
				this->sentenceMap.resize(
						std::distance(this->sentenceMap.cbegin(), smEnd)
				);
			}
		}

		// update positions in date, article and sentence maps
		//  (all positions are now relative to the trimmed corpus)
		for(auto& date : this->dateMap) {
			TextMapEntry::pos(date) -= offset;
		}

		for(auto& article : this->articleMap) {
			TextMapEntry::pos(article) -= offset;
		}

		for(auto& sentence : this->sentenceMap) {
			TextMapEntry::pos(sentence) -= offset;
		}

		if(this->checkConsistency) {
			this->check("filterByDate");
		}

		return true;
	}
2647 
2649 
2674  inline std::size_t Corpus::filterArticles(
2675  const ArticleFunc& callbackArticle,
2676  StatusSetter& statusSetter
2677  ) {
2678  this->checkThatTokenized("filterArticle");
2679 
2680  if(this->tokens.empty()) {
2681  return 0;
2682  }
2683 
2684  statusSetter.change("Filtering corpus...");
2685 
2686  std::size_t articleCounter{};
2687  std::size_t statusCounter{};
2688  std::size_t removed{};
2689 
2690  for(const auto& article : this->articleMap) {
2691  const auto articleEnd{TextMapEntry::end(article)};
2692 
2693  if(
2694  !callbackArticle(
2695  this->tokens,
2696  TextMapEntry::pos(article),
2697  articleEnd
2698  )
2699  ) {
2700  // empty all tokens belonging to the article that has been filtered out
2701  for(
2702  std::size_t tokenIndex{TextMapEntry::pos(article)};
2703  tokenIndex < articleEnd;
2704  ++tokenIndex
2705  ) {
2706  this->tokenBytes -= this->tokens.at(tokenIndex).size();
2707 
2708  Helper::Memory::free(this->tokens.at(tokenIndex));
2709  }
2710 
2711  ++removed;
2712  }
2713 
2714  ++articleCounter;
2715  ++statusCounter;
2716 
2717  if(statusCounter == filterUpdateEvery) {
2718  if(!statusSetter.update(articleCounter, this->articleMap.size(), true)) {
2719  return 0;
2720  }
2721 
2722  statusCounter = 0;
2723  }
2724  }
2725 
2726  statusSetter.finish();
2727 
2728  if(removed == 0) {
2729  return 0;
2730  }
2731 
2732  // remove emptied dates, articles, and tokens
2733  statusSetter.change("Cleaning corpus...");
2734 
2735  this->reTokenize();
2736 
2737  // check consistency, if necessary
2738  if(this->checkConsistency) {
2739  this->check("filterArticles");
2740  }
2741 
2742  return removed;
2743  }
2744 
2745  /*
2746  * TOKENIZATION
2747  */
2748 
2750 
	// Tokenizes the corpus, optionally running the configured manipulators
	//  (taggers, stemmers, lemmatizer, token remover/trimmer, correctors)
	//  on each sentence. Returns the result of tokenizeCustom().
	//  NOTE(review): several case labels of the switch statements below
	//   (e.g. the stemmer and posterior-tagger cases) are not visible in
	//   this listing — confirm against the full source before relying on
	//   the exact fall-through structure shown here.
	inline bool Corpus::tokenize(
			const std::vector<std::uint16_t>& manipulators,
			const std::vector<std::string>& models,
			const std::vector<std::string>& dictionaries,
			const std::vector<std::string>& languages,
			std::uint64_t freeMemoryEvery,
			StatusSetter& statusSetter
	) {
		// any manipulator other than corpusManipNone requires the sentence callback
		bool isManipulation{
			std::any_of(
					manipulators.begin(),
					manipulators.end(),
					[](const auto manipulator) {
						return manipulator > corpusManipNone;
					}
			)
		};

		// prepare manipulators and check their configurations
		std::vector<Data::Tagger> taggers;
		std::unique_ptr<Lemmatizer> lemmatizer;
		std::unique_ptr<TokenRemover> tokenRemover;
		std::vector<TokenCorrect> tokenCorrectors;
		std::size_t manipulatorIndex{};

		for(const auto& manipulator : manipulators) {
			switch(manipulator) {
			case corpusManipNone:
				break;

				// stemmers use neither model, dictionary, nor language
				Corpus::notUsed("model", models, manipulatorIndex);
				Corpus::notUsed("dictionary", dictionaries, manipulatorIndex);
				Corpus::notUsed("language", languages, manipulatorIndex);

				break;

			case corpusManipTagger:
				// taggers require a model, but neither dictionary nor language
				if(models.at(manipulatorIndex).empty()) {
					throw Exception(
							"Corpus::tokenize():"
							" No model set for part-of-speech tagger (manipulator #"
							+ std::to_string(manipulatorIndex + 1)
							+ ")"
					);
				}

				Corpus::notUsed("dictionary", dictionaries, manipulatorIndex);
				Corpus::notUsed("language", languages, manipulatorIndex);

				taggers.emplace_back();

				break;

			case corpusManipLemmatizer:
				// the lemmatizer requires a dictionary, but neither model nor language
				if(dictionaries.at(manipulatorIndex).empty()) {
					throw Exception(
							"Corpus::tokenize():"
							" No dictionary set for lemmatizer (manipulator #"
							+ std::to_string(manipulatorIndex + 1)
							+ ")"
					);
				}

				Corpus::notUsed("model", models, manipulatorIndex);
				Corpus::notUsed("language", languages, manipulatorIndex);

				// one shared lemmatizer instance is enough
				if(!lemmatizer) {
					lemmatizer = std::make_unique<Lemmatizer>();
				}

				break;

			case corpusManipRemove:
			case corpusManipTrim:
				// remover/trimmer require a dictionary, but neither model nor language
				if(dictionaries.at(manipulatorIndex).empty()) {
					throw Exception(
							"Corpus::tokenize():"
							" No dictionary set for token remover/trimmer (manipulator #"
							+ std::to_string(manipulatorIndex + 1)
							+ ")"
					);
				}

				Corpus::notUsed("model", models, manipulatorIndex);
				Corpus::notUsed("language", languages, manipulatorIndex);

				// one shared remover instance serves both removal and trimming
				if(!tokenRemover) {
					tokenRemover = std::make_unique<TokenRemover>();
				}

				break;

			case corpusManipCorrect:
				// correctors use a language, but neither model nor dictionary
				Corpus::notUsed("model", models, manipulatorIndex);
				Corpus::notUsed("dictionary", dictionaries, manipulatorIndex);

				tokenCorrectors.emplace_back(languages.at(manipulatorIndex));

				break;

			default:
				throw Exception(
						"Corpus::tokenize():"
						" Invalid manipulator (#"
						+ std::to_string(manipulator)
						+ ")"
				);
			}

			++manipulatorIndex;
		}

		// set manipulation callback: applies all manipulators, in order,
		//  to each sentence (a range of tokens)
		auto callbackLambda = [
				&manipulators,
				&taggers,
				&dictionaries,
				&languages,
				&lemmatizer,
				&tokenRemover,
				&tokenCorrectors
		](
				Tokens::iterator sentenceBegin,
				Tokens::iterator sentenceEnd
		) {
			std::size_t manipulatorIndex{};
			std::size_t taggerIndex{};
			std::size_t correctIndex{};

			for(const auto& manipulator : manipulators) {
				switch(manipulator) {
				case corpusManipNone:
					break;

				case corpusManipTagger:
					taggers.at(taggerIndex).label(sentenceBegin, sentenceEnd);

					++taggerIndex;

					break;

					for(auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
						Data::Stemmer::stemEnglish(*tokenIt);
					}

					break;

					for(auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
						Data::Stemmer::stemGerman(*tokenIt);
					}

					break;

				case corpusManipLemmatizer:
					// NOTE(review): `auto&` makes tokenIt an alias of
					//  sentenceBegin, so this loop advances sentenceBegin
					//  itself; any later manipulator in this switch would
					//  then see an empty sentence. The stemmer loops above
					//  use a copy (`auto`) — confirm whether this is intended.
					for(auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
						lemmatizer->lemmatize(*tokenIt, dictionaries.at(manipulatorIndex));
					}

					break;

				case corpusManipRemove:
					// NOTE(review): same `auto&` aliasing as in the lemmatizer case
					for(auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
						tokenRemover->remove(*tokenIt, dictionaries.at(manipulatorIndex));
					}

					break;

				case corpusManipTrim:
					// NOTE(review): same `auto&` aliasing as in the lemmatizer case
					for(auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
						tokenRemover->trim(*tokenIt, dictionaries.at(manipulatorIndex));
					}

					break;

				case corpusManipCorrect:
					// NOTE(review): same `auto&` aliasing as in the lemmatizer case
					for(auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
						tokenCorrectors.at(correctIndex).correct(*tokenIt);
					}

					++correctIndex;

					break;

				default:
					throw Exception(
							"Corpus::tokenize():"
							" Invalid manipulator (#"
							+ std::to_string(manipulator)
							+ ")"
					);
				}

				++manipulatorIndex;
			}
		};

		// tokenize corpus
		return this->tokenizeCustom(
				isManipulation ? std::optional<SentenceFunc>{callbackLambda} : std::nullopt,
				freeMemoryEvery,
				statusSetter
		);
	}
3031 
3033 
			const std::optional<SentenceFunc>& callback, /* optional per-sentence manipulation callback */
			std::uint64_t freeMemoryEvery,               /* free memory after this many processed bytes (0 = never) */
			StatusSetter& statusSetter                   /* for progress reports and cancellation */
	) {
		// dispatch to the appropriate implementation, depending on whether
		//  the corpus has already been tokenized before
		if(this->tokenized) {
			if(
					!(
							this->tokenizeTokenized(
									callback,
									statusSetter
							)
					)
			) {
				return false;
			}
		}
		else {
			if(
					!(
							this->tokenizeContinuous(
									callback,
									freeMemoryEvery,
									statusSetter
							)
					)
			) {
				return false;
			}
		}

		statusSetter.finish();

		return true;
	}
3116 
3117  /*
3118  * CLEANUP
3119  */
3120 
3122 
	// Resets the corpus to an empty, non-tokenized state.
	//  NOTE(review): the statements releasing the corpus content and its
	//   maps are not visible in this listing; only the flag and byte-count
	//   resets are shown — confirm against the full source.
	inline void Corpus::clear() {

		// reset the tokenization state and the token byte counter
		this->tokenized = false;
		this->tokenBytes = 0;
	}
3139 
3140  /*
3141  * INTERNAL HELPER FUNCTIONS (private)
3142  */
3143 
3144  // move data of combined corpus into class
3145  inline void Corpus::moveCombinedIn(DateArticleSentenceMap& from) {
3146  for(auto& [date, articles] : from) {
3147  std::size_t dateLength{};
3148 
3149  this->dateMap.emplace_back();
3150 
3151  TextMapEntry::pos(this->dateMap.back()) = this->tokens.size();
3152  this->dateMap.back().value = date;
3153 
3154  for(auto& [article, sentences] : articles) {
3155  std::size_t articleLength{};
3156 
3157  this->articleMap.emplace_back();
3158 
3159  TextMapEntry::pos(this->articleMap.back()) = this->tokens.size();
3160  this->articleMap.back().value = article;
3161 
3162  for(auto& sentence : sentences) {
3163  this->sentenceMap.emplace_back(
3164  this->tokens.size(),
3165  sentence.size()
3166  );
3167 
3168  dateLength += sentence.size();
3169  articleLength += sentence.size();
3170 
3171  this->tokenBytes += Helper::Container::bytes(sentence);
3172 
3173  Helper::Container::moveInto(this->tokens, sentence);
3174  }
3175 
3176  TextMapEntry::length(this->articleMap.back()) = articleLength;
3177  }
3178 
3179  TextMapEntry::length(this->dateMap.back()) = dateLength;
3180  }
3181 
3182  this->tokenized = true;
3183  }
3184 
3185  // check that the corpus has not been tokenized, throw an exception otherwise
3186  inline void Corpus::checkThatNotTokenized(std::string_view function) const {
3187  if(this->tokenized) {
3188  throw Exception(
3189  "Corpus::"
3190  + std::string(function)
3191  + "(): The corpus has been tokenized"
3192  );
3193  }
3194  }
3195 
3196  // check that the corpus has been tokenized, throw an exception otherwise
3197  inline void Corpus::checkThatTokenized(std::string_view function) const {
3198  if(!(this->tokenized)) {
3199  throw Exception(
3200  "Corpus::"
3201  + std::string(function)
3202  + "(): The corpus has not been tokenized"
3203  );
3204  }
3205  }
3206 
// add an article to the (continuous) corpus
//  text: the article's text, appended to the corpus (followed by a single space)
//  id: the article's ID; if empty, the text is merged into a trailing "empty" article entry
//  dateTime: the article's date/time; only the first dateLength characters (YYYY-MM-DD) are used
//  dateMapEntry: in/out state holding the currently open date map entry; it is concluded
//   (pushed to this->dateMap) when the date changes or an article without a valid date follows
//  deleteInputData: if set, the input strings are freed after use to save memory
//  NOTE: input needs to be pre-sorted by date (see file header) for the date map to be correct
inline void Corpus::addArticle(
		std::string& text,
		std::string& id,
		std::string& dateTime,
		TextMapEntry& dateMapEntry,
		bool deleteInputData
) {
	// the new article starts at the current end of the corpus
	auto pos{this->corpus.size()};

	// add article ID (or empty article) to article map
	if(!id.empty()) {
		this->articleMap.emplace_back(pos, text.length(), id);

		Helper::Memory::freeIf(deleteInputData, id);
	}
	else if(!(this->articleMap.empty()) && this->articleMap.back().value.empty()) {
		// expand empty article in the end of the article map
		// (including space before current text)
		TextMapEntry::length(this->articleMap.back()) += text.length() + 1;
	}
	else {
		// add empty article to the end of the article map
		this->articleMap.emplace_back(pos, text.length());
	}

	// add date to date map if necessary
	if(!dateTime.empty()) {
		// check for valid (long enough) date/time
		if(dateTime.length() >= dateLength) {
			// get only date (YYYY-MM-DD) from date/time
			const std::string date(dateTime, 0, dateLength);

			// check whether a date is already set
			if(!dateMapEntry.value.empty()) {
				// date is already set -> compare with current date
				if(dateMapEntry.value == date) {
					// last date equals current date -> append text to last date
					// (including space before current text)
					TextMapEntry::length(dateMapEntry) += text.length() + 1;
				}
				else {
					// last date differs from current date -> conclude last date and start new date
					this->dateMap.emplace_back(dateMapEntry);

					dateMapEntry = TextMapEntry(this->corpus.size(), text.length(), date);
				}
			}
			else {
				// no date is set yet -> start new date
				dateMapEntry = TextMapEntry(this->corpus.size(), text.length(), date);
			}
		}
		else if(!dateMapEntry.value.empty()) {
			// no valid date found, but last date is set -> conclude last date
			this->dateMap.emplace_back(dateMapEntry);

			Helper::Memory::free(dateMapEntry);
		}

		Helper::Memory::freeIf(deleteInputData, dateTime);
	}

	// concatenate corpus text
	this->corpus += text;

	Helper::Memory::freeIf(deleteInputData, text);

	// add space at the end of the corpus (separates this article from the next)
	this->corpus.push_back(' ');
}
3278 
// add chunk to (tokenized) corpus
//  content: newline-separated tokens of the chunk (empty chunks are ignored)
//  articles/dates: optional text maps of the chunk, merged into the corpus maps
//  sentences: sentence map of the chunk
//  continueToken: in/out — whether the last token of the PREVIOUS chunk is continued
//   by this chunk; set for the next chunk before returning
inline void Corpus::addChunk(
		const std::string& content,
		const std::optional<std::reference_wrapper<const TextMap>>& articles,
		const std::optional<std::reference_wrapper<const TextMap>>& dates,
		const SentenceMap& sentences,
		bool& continueToken
) {
	if(content.empty()) {
		return;
	}

	// offset of this chunk in tokens; the last existing token may be continued
	//  by this chunk, so it is counted as part of the new chunk (size - 1)
	const auto chunkOffset{
		this->tokens.empty() ? 0 : this->tokens.size() - 1
	};

	// add sentences
	bool skip{
		/* does first sentence continue previous one (that has already been added)? */
		!(this->sentenceMap.empty())
		&& TextMapEntry::end(this->sentenceMap.back()) > chunkOffset
	};

	for(const auto& sentence : sentences) {
		if(skip) {
			/* skip first sentence */
			skip = false;

			continue;
		}

		this->sentenceMap.emplace_back(sentence);

		// translate chunk-local position into corpus-wide position
		TextMapEntry::pos(this->sentenceMap.back()) += chunkOffset;
	}

	// prepare first token
	if(this->tokens.empty()) {
		this->tokens.emplace_back();
	}

	// add tokens: newlines separate tokens; other bytes extend the current token
	//  (appending to the last token seamlessly continues a split token)
	for(const auto c : content) {
		if(c == '\n') {
			this->tokens.emplace_back();

			continue;
		}

		this->tokens.back().push_back(c);

		++(this->tokenBytes);
	}

	// add articles and dates, if necessary
	Corpus::addChunkMap(articles, this->articleMap, chunkOffset, continueToken);
	Corpus::addChunkMap(dates, this->dateMap, chunkOffset, continueToken);

	// save whether token will be continued in next chunk (if there is one)
	continueToken = content.back() != '\n';
}
3340 
3341  // check consistency of corpus after manipulation, throws Corpus::Exception
3342  inline void Corpus::check(std::string_view function) const {
3343  if(this->tokenized) {
3344  this->checkTokenized(function);
3345  }
3346 
3347  const auto end{
3348  this->tokenized ? this->tokens.size() : this->corpus.size()
3349  };
3350 
3351  Corpus::checkMap(function, "date map", this->dateMap, end, this->tokenized, true);
3352  Corpus::checkMap(function, "article map", this->articleMap, end, this->tokenized, false);
3353  Corpus::checkMap(function, this->sentenceMap, end, this->tokenized);
3354  }
3355 
3356  // check consistency of tokenized corpus, throws Corpus::Exception
3357  inline void Corpus::checkTokenized(std::string_view function) const {
3358  if(
3359  !(this->articleMap.empty())
3360  && !(this->dateMap.empty())
3361  && !(this->sentenceMap.empty())
3362  ) {
3363  auto article = this->articleMap.cbegin();
3364  auto sentence = this->sentenceMap.cbegin();
3365 
3366  // go through all dates
3367  for(auto date{this->dateMap.cbegin()}; date != this->dateMap.cend(); ++date) {
3368  // jump to first article of date
3369  while(article != this->articleMap.cend() && article->p < date->p) {
3370  ++article;
3371  }
3372 
3373  // jump to first sentence of date
3374  while(sentence != this->sentenceMap.cend() && sentence->first < date->p) {
3375  ++sentence;
3376  }
3377 
3378  // go through all articles in current date
3379  const auto dateEnd{TextMapEntry::end(*date)};
3380 
3381  while(article != this->articleMap.cend() && article->p < dateEnd) {
3382  // jump to first sentence of article
3383  while(sentence != this->sentenceMap.cend() && sentence->first < date->p) {
3384  ++sentence;
3385  }
3386 
3387  if(sentence == this->sentenceMap.cend()) {
3388  break;
3389  }
3390 
3391  // go through all sentences in current article
3392  const auto articleEnd{TextMapEntry::end(*article)};
3393 
3394  while(sentence != this->sentenceMap.cend() && sentence->first < articleEnd) {
3395  // check whether sentence is out of bounds
3396  const auto sentenceEnd{TextMapEntry::end(*sentence)};
3397 
3398  if(sentenceEnd > dateEnd) {
3399  Corpus::exceptionSentenceBehind(
3400  function,
3401  "date",
3402  *sentence,
3403  *date,
3404  this->dateMap,
3405  std::next(date),
3406  this->tokens
3407  );
3408  }
3409 
3410  if(sentenceEnd > articleEnd) {
3411  Corpus::exceptionSentenceBehind(
3412  function,
3413  "article",
3414  *sentence,
3415  *article,
3416  this->articleMap,
3417  std::next(article),
3418  this->tokens
3419  );
3420  }
3421 
3422  // go to next sentence
3423  ++sentence;
3424  }
3425 
3426  // go to next article
3427  ++article;
3428  }
3429  }
3430  }
3431 
3432  // check number of size of tokenized corpus
3433  const auto bytes{
3434  std::accumulate(
3435  this->tokens.begin(),
3436  this->tokens.end(),
3437  std::size_t{},
3438  [](const auto& size, const auto& token) {
3439  return size + token.size();
3440  }
3441  )
3442  };
3443 
3444  if(bytes != this->tokenBytes) {
3445  Corpus::exceptionTokenBytes(function, this->tokenBytes, bytes);
3446  }
3447  }
3448 
3449  // add whole corpus as one chunk
3450  inline void Corpus::addAsOneChunk(
3451  std::size_t size,
3452  Tokens& to,
3453  Sizes& tokenNumsTo,
3454  std::vector<TextMap>& articleMapsTo,
3455  std::vector<TextMap>& dateMapsTo,
3456  std::vector<SentenceMap>& sentenceMapsTo
3457  ) const {
3458  to.emplace_back(std::string{});
3459 
3460  to.back().reserve(to.size() + size);
3461 
3462  for(const auto& token : this->tokens) {
3463  to.back() += token;
3464 
3465  to.back().push_back('\n');
3466  }
3467 
3468  // remove last newline, if necessary
3469  if(!to.back().empty()) {
3470  to.back().pop_back();
3471  }
3472 
3473  articleMapsTo.emplace_back(this->articleMap);
3474  dateMapsTo.emplace_back(this->dateMap);
3475  sentenceMapsTo.emplace_back(this->sentenceMap);
3476  tokenNumsTo.emplace_back(this->tokens.size());
3477  }
3478 
// re-tokenize corpus, removing all empty tokens, articles, and dates
//  walks over all tokens once, shifting the positions in all three maps
//  left by the number of empty tokens removed so far, and shrinking the
//  lengths of entries that cover removed tokens
inline void Corpus::reTokenize() {
	// remove empty entries from maps
	Corpus::removeEmptyEntries(this->dateMap, this->tokens);
	Corpus::removeEmptyEntries(this->articleMap, this->tokens);
	Corpus::removeEmptyEntries(this->sentenceMap, this->tokens);

	// remove empty tokens from maps
	std::size_t dateIndex{};
	std::size_t articleIndex{};
	std::size_t sentenceIndex{};
	// originals of (position, length) for the current entry of each map,
	//  needed because positions are modified in place while iterating
	PositionLength originDate{};
	PositionLength originArticle{};
	PositionLength originSentence{};
	// number of empty tokens encountered so far (= how far positions shift left)
	std::size_t removed{};

	for(std::size_t tokenIndex{}; tokenIndex < this->tokens.size(); ++tokenIndex) {
		Corpus::skipEntriesBefore(this->dateMap, dateIndex, originDate, tokenIndex);
		Corpus::skipEntriesBefore(this->articleMap, articleIndex, originArticle, tokenIndex);
		Corpus::skipEntriesBefore(this->sentenceMap, sentenceIndex, originSentence, tokenIndex);

		Corpus::updatePosition(
				"date map",
				this->dateMap,
				dateIndex,
				originDate.first,
				tokenIndex,
				removed
		);
		Corpus::updatePosition(
				"article map",
				this->articleMap,
				articleIndex,
				originArticle.first,
				tokenIndex,
				removed
		);
		Corpus::updatePosition(
				"sentence map",
				this->sentenceMap,
				sentenceIndex,
				originSentence.first,
				tokenIndex,
				removed
		);

		if(this->tokens[tokenIndex].empty()) {
			// token is empty: shrink all map entries covering it by one
			Corpus::removeTokenFromLength(this->dateMap, dateIndex, originDate, tokenIndex);
			Corpus::removeTokenFromLength(this->articleMap, articleIndex, originArticle, tokenIndex);
			Corpus::removeTokenFromLength(this->sentenceMap, sentenceIndex, originSentence, tokenIndex);

			++removed;
		}
	}

	// remove empty tokens
	Corpus::removeEmpty(this->tokens);
}
3537 
3538  // tokenize already tokenized corpus, return whether thread is still running
3539  inline bool Corpus::tokenizeTokenized(
3540  const std::optional<SentenceFunc>& callback,
3541  StatusSetter& statusSetter
3542  ) {
3543  // run manipulators on already tokenized corpus
3544  if(!callback) {
3545  return statusSetter.isRunning();
3546  }
3547 
3548  std::size_t numDeletedTokens{};
3549  std::size_t dateIndex{};
3550  std::size_t articleIndex{};
3551  std::size_t dateEnd{};
3552  std::size_t articleEnd{};
3553  std::size_t sentenceCounter{};
3554  std::size_t statusCounter{};
3555  bool inDate{false};
3556  bool inArticle{false};
3557  bool emptyDates{false};
3558  bool emptyArticles{false};
3559  bool emptySentences{false};
3560 
3561  // reset number of bytes
3562  this->tokenBytes = 0;
3563 
3564  // go through all sentences
3565  for(auto& sentenceEntry : this->sentenceMap) {
3566  // skip dates and articles before current sentence
3567  // (including last date and article, if finished)
3568  Corpus::skipEntriesBefore(
3569  this->dateMap,
3570  dateIndex,
3571  dateEnd,
3572  TextMapEntry::pos(sentenceEntry),
3573  inDate
3574  );
3575  Corpus::skipEntriesBefore(
3576  this->articleMap,
3577  articleIndex,
3578  articleEnd,
3579  TextMapEntry::pos(sentenceEntry),
3580  inArticle
3581  );
3582 
3583  // check for beginning of date and/or article
3584  if(
3585  Corpus::entryBeginsAt(
3586  this->dateMap,
3587  dateIndex,
3588  TextMapEntry::pos(sentenceEntry)
3589  )
3590  ) {
3591  inDate = true;
3592 
3593  // update beginning of date
3594  TextMapEntry::pos(this->dateMap.at(dateIndex)) -= numDeletedTokens;
3595  }
3596 
3597  if(
3598  Corpus::entryBeginsAt(
3599  this->articleMap,
3600  articleIndex,
3601  TextMapEntry::pos(sentenceEntry)
3602  )
3603  ) {
3604  inArticle = true;
3605 
3606  // update beginning of article
3607  TextMapEntry::pos(this->articleMap.at(articleIndex)) -= numDeletedTokens;
3608  }
3609 
3610  // store unchanged sentence data
3611  const auto sentenceBegin{TextMapEntry::pos(sentenceEntry)};
3612  const auto sentenceEnd{TextMapEntry::end(sentenceEntry)};
3613 
3614  // update beginning of sentence
3615  TextMapEntry::pos(sentenceEntry) -= numDeletedTokens;
3616 
3617  // modify sentence (or its tokens), if necessary
3618  if(callback) {
3619  (*callback)(
3620  this->tokens.begin() + sentenceBegin,
3621  this->tokens.begin() + sentenceEnd
3622  );
3623  }
3624 
3625  for(auto tokenIndex{sentenceBegin}; tokenIndex < sentenceEnd; ++tokenIndex) {
3626  const auto& token{this->tokens.at(tokenIndex)};
3627 
3628  // remove empty token from date, article, and sentence map
3629  if(token.empty()) {
3630  if(inDate) {
3631  Corpus::removeToken(this->dateMap, dateIndex, emptyDates);
3632  }
3633 
3634  if(inArticle) {
3635  Corpus::removeToken(this->articleMap, articleIndex, emptyArticles);
3636  }
3637 
3638  Corpus::removeToken(sentenceEntry, emptySentences);
3639 
3640  // delete token later
3641  ++numDeletedTokens;
3642  }
3643  else {
3644  this->tokenBytes += token.size();
3645  }
3646  }
3647 
3648  ++sentenceCounter;
3649  ++statusCounter;
3650 
3651  if(statusCounter == tokenizeUpdateEvery) {
3652  if(!statusSetter.update(sentenceCounter, this->sentenceMap.size(), true)) {
3653  return false;
3654  }
3655 
3656  statusCounter = 0;
3657  }
3658  }
3659 
3660  statusSetter.change("Cleaning corpus...");
3661 
3662  // delete empty dates
3663  if(emptyDates) {
3664  Corpus::removeEmptyEntries(this->dateMap);
3665  }
3666 
3667  // delete empty articles
3668  if(emptyArticles) {
3669  Corpus::removeEmptyEntries(this->articleMap);
3670  }
3671 
3672  // delete empty sentences
3673  if(emptySentences) {
3674  Corpus::removeEmptyEntries(this->sentenceMap);
3675  }
3676 
3677  // delete empty tokens
3678  if(numDeletedTokens > 0) {
3679  Corpus::removeEmpty(this->tokens);
3680  }
3681 
3682  // check consistency
3683  if(this->checkConsistency) {
3684  this->check("tokenizeTokenized");
3685  }
3686 
3687  return statusSetter.isRunning();
3688  }
3689 
3690  // tokenize still continuous corpus
3691  inline bool Corpus::tokenizeContinuous(
3692  const std::optional<SentenceFunc>& callback,
3693  std::uint64_t freeMemoryEvery,
3694  StatusSetter& statusSetter
3695  ) {
3696  // tokenize continuous text corpus
3697  Tokens sentence;
3698 
3699  std::size_t tokenBegin{};
3700  std::size_t sentenceFirstToken{};
3701  std::size_t currentToken{};
3702  std::size_t statusCounter{};
3703  std::size_t corpusTrimmed{};
3704 
3705  bool inArticle{false};
3706  bool inDate{false};
3707 
3708  std::size_t articleFirstToken{};
3709  std::size_t dateFirstToken{};
3710  std::size_t articleEnd{Corpus::getFirstEnd(this->articleMap)};
3711  std::size_t dateEnd{Corpus::getFirstEnd(this->dateMap)};
3712  std::size_t nextArticle{};
3713  std::size_t nextDate{};
3714 
3715  TextMap newArticleMap;
3716  TextMap newDateMap;
3717 
3718  newArticleMap.reserve(this->articleMap.size());
3719  newDateMap.reserve(this->dateMap.size());
3720 
3721  // go through all characters in the continous text corpus
3722  for(std::size_t pos{}; pos < this->corpus.size() + corpusTrimmed; ++pos) {
3723  bool sentenceEnd{false};
3724  bool noSeparator{false};
3725  bool appendToArticle{false};
3726  bool appendToDate{false};
3727 
3728  if(!(this->articleMap.empty())) {
3729  // check for beginning of article
3730  if(
3731  !inArticle
3732  && nextArticle < this->articleMap.size()
3733  && pos == TextMapEntry::pos(this->articleMap[nextArticle])
3734  ) {
3735  articleFirstToken = currentToken;
3736  articleEnd = TextMapEntry::end(this->articleMap[nextArticle]);
3737 
3738  inArticle = true;
3739 
3740  ++nextArticle;
3741  }
3742 
3743  // check for end of article
3744  if(
3745  inArticle
3746  && pos == articleEnd
3747  ) {
3748  inArticle = false;
3749 
3750  newArticleMap.emplace_back(
3751  articleFirstToken,
3752  currentToken - articleFirstToken,
3753  this->articleMap.at(nextArticle - 1).value
3754  );
3755 
3756  sentenceEnd = true;
3757  appendToArticle = true;
3758  }
3759  }
3760 
3761  if(!(this->dateMap.empty())) {
3762  // check for beginning of date
3763  if(
3764  !inDate
3765  && nextDate < this->dateMap.size()
3766  && pos == TextMapEntry::pos(this->dateMap.at(nextDate))
3767  ) {
3768  dateFirstToken = currentToken;
3769  dateEnd = TextMapEntry::end(this->dateMap.at(nextDate));
3770 
3771  inDate = true;
3772 
3773  ++nextDate;
3774  }
3775 
3776  // check for end of date
3777  if(
3778  inDate
3779  && pos == dateEnd
3780  ) {
3781  inDate = false;
3782 
3783  newDateMap.emplace_back(
3784  dateFirstToken,
3785  currentToken - dateFirstToken,
3786  this->dateMap.at(nextDate - 1).value
3787  );
3788 
3789  sentenceEnd = true;
3790  appendToDate = true;
3791  }
3792  }
3793 
3794  // check for end of sentence
3795  switch(this->corpus.at(pos - corpusTrimmed)) {
3796  case '.':
3797  case ':':
3798  case ';':
3799  case '!':
3800  case '?':
3801  sentenceEnd = true;
3802 
3803  break;
3804 
3805  case ' ':
3806  case ',':
3807  case '/':
3808  case '\\':
3809  case '|':
3810  case '&':
3811  case '\a':
3812  case '\b':
3813  case '\t':
3814  case '\n':
3815  case '\v':
3816  case '\f':
3817  case '\r':
3818  case '\0':
3819  break;
3820 
3821  default:
3822  if(sentenceEnd) {
3823  // end of token and sentence without separating character
3824  noSeparator = true;
3825  }
3826  else {
3827  // go to next character
3828  continue;
3829  }
3830  }
3831 
3832  // end token
3833  auto tokenLength{pos - tokenBegin};
3834 
3835  if(noSeparator) {
3836  ++tokenLength;
3837  }
3838 
3839  if(tokenLength > 0) {
3840  sentence.emplace_back(this->corpus, tokenBegin - corpusTrimmed, tokenLength);
3841 
3842  ++currentToken;
3843 
3844  if(appendToArticle) {
3845  ++TextMapEntry::length(newArticleMap.back());
3846  }
3847 
3848  if(appendToDate) {
3849  ++TextMapEntry::length(newDateMap.back());
3850  }
3851  }
3852 
3853  if(freeMemoryEvery > 0 && pos - corpusTrimmed > freeMemoryEvery) {
3854  // free memory, i.e. remove the already processed beginning of the corpus
3855  this->corpus.erase(0, pos - corpusTrimmed);
3856  this->corpus.shrink_to_fit();
3857 
3858  corpusTrimmed = pos;
3859  }
3860 
3861  tokenBegin = pos + 1;
3862 
3863  if(sentenceEnd && !sentence.empty()) {
3864  Corpus::processSentence(
3865  sentence,
3866  callback,
3867  appendToArticle,
3868  appendToDate,
3869  currentToken,
3870  sentenceFirstToken,
3871  newArticleMap,
3872  newDateMap,
3873  this->sentenceMap,
3874  this->tokens,
3875  this->tokenBytes
3876  );
3877 
3878  // update status if necessary
3879  ++statusCounter;
3880 
3881  if(statusCounter == tokenizeUpdateEvery) {
3882  if(
3883  !statusSetter.update(
3884  pos + 1,
3885  this->corpus.size() + corpusTrimmed,
3886  true
3887  )
3888  ) {
3889  return false;
3890  }
3891 
3892  statusCounter = 0;
3893  }
3894  }
3895  }
3896 
3897  // check for end of last article
3898  bool endOfLastArticle{false};
3899 
3900  if(
3901  inArticle
3902  && this->corpus.size() + corpusTrimmed == articleEnd
3903  ) {
3904  inArticle = false;
3905 
3906  newArticleMap.emplace_back(
3907  articleFirstToken,
3908  currentToken - articleFirstToken,
3909  this->articleMap.at(nextArticle - 1).value
3910  );
3911 
3912  endOfLastArticle = true;
3913  }
3914 
3915  // check for end of last date
3916  bool endOfLastDate{false};
3917 
3918  if(
3919  inDate
3920  && this->corpus.size() + corpusTrimmed == dateEnd
3921  ) {
3922  inDate = false;
3923 
3924  newDateMap.emplace_back(
3925  dateFirstToken,
3926  currentToken - dateFirstToken,
3927  this->dateMap.at(nextDate - 1).value
3928  );
3929 
3930  endOfLastDate = true;
3931  }
3932 
3933  // add last token if not added yet
3934  if(tokenBegin - corpusTrimmed < this->corpus.size()) {
3935  sentence.emplace_back(
3936  this->corpus,
3937  tokenBegin - corpusTrimmed,
3938  this->corpus.size() + corpusTrimmed - tokenBegin
3939  );
3940 
3941  if(endOfLastArticle) {
3942  ++TextMapEntry::length(newDateMap.back());
3943  }
3944 
3945  if(endOfLastDate) {
3946  ++TextMapEntry::length(newArticleMap.back());
3947  }
3948  }
3949 
3950  // add last sentence if not added yet
3951  if(!sentence.empty()) {
3952  Corpus::processSentence(
3953  sentence,
3954  callback,
3955  endOfLastArticle,
3956  endOfLastDate,
3957  currentToken,
3958  sentenceFirstToken,
3959  newArticleMap,
3960  newDateMap,
3961  this->sentenceMap,
3962  this->tokens,
3963  this->tokenBytes
3964  );
3965  }
3966 
3968 
3969  // check consistency
3970  if(this->checkConsistency) {
3971  if(inArticle) {
3972  throw Exception(
3973  "Corpus::tokenizeContinuous():"
3974  " Last article '"
3975  + this->articleMap.at(nextArticle - 1).value
3976  + "' has not been finished"
3977  );
3978  }
3979 
3980  if(inDate) {
3981  throw Exception(
3982  "Corpus::tokenizeContinuous():"
3983  " Last date '"
3984  + this->dateMap.at(nextDate - 1).value
3985  + "' has not been finished"
3986  );
3987  }
3988 
3989  if(nextArticle < this->articleMap.size()) {
3990  throw Exception(
3991  "Corpus::tokenizeContinuous():"
3992  " Unexpected article '"
3993  + this->articleMap.at(nextArticle).value
3994  + "' after end of corpus"
3995  );
3996  }
3997 
3998  if(nextDate < this->dateMap.size()) {
3999  throw Exception(
4000  "Corpus::tokenizeContinuous():"
4001  " Unexpected date '"
4002  + this->dateMap.at(nextDate).value
4003  + "' after end of corpus"
4004  );
4005  }
4006  }
4007 
4008  newArticleMap.swap(this->articleMap);
4009  newDateMap.swap(this->dateMap);
4010 
4011  this->tokenized = true;
4012 
4013  // check consistency
4014  if(this->checkConsistency) {
4015  this->check("tokenizeContinuous");
4016  }
4017 
4018  return statusSetter.isRunning();
4019  }
4020 
4021  /*
4022  * INTERNAL STATIC HELPER FUNCTIONS (private)
4023  */
4024 
4025  // combine corpora, return whether thread is still running
4026  inline bool Corpus::combineCorpora(
4027  std::vector<Corpus>& from,
4029  StatusSetter& statusSetter
4030  ) {
4031  std::size_t corpusCounter{};
4032 
4033  for(auto& corpus : from) {
4034  ++corpusCounter;
4035 
4036  if(!Corpus::addCorpus(corpus, to, corpusCounter, from.size(), statusSetter)) {
4037  return false;
4038  }
4039  }
4040 
4041  Helper::Memory::free(from);
4042 
4043  return statusSetter.isRunning();
4044  }
4045 
4046  // get all tokens that belong to a specific date or article
4047  inline Corpus::Tokens Corpus::getTokensForEntry(
4048  const TextMap& map,
4049  const std::string& id,
4050  const Tokens& tokens
4051  ) {
4052  const auto& found{
4053  std::find_if(
4054  map.cbegin(),
4055  map.cend(),
4056  [&id](const auto& entry) {
4057  return entry.value == id;
4058  }
4059  )
4060  };
4061 
4062  if(found == map.cend()) {
4063  return Tokens{};
4064  }
4065 
4066  const auto entryEnd{TextMapEntry::end(*found)};
4067 
4068  Tokens copy;
4069 
4070  copy.reserve(found->l);
4071 
4072  for(auto tokenIndex{found->p}; tokenIndex < entryEnd; ++tokenIndex) {
4073  copy.emplace_back(tokens.at(tokenIndex));
4074  }
4075 
4076  return copy;
4077  }
4078 
4079  // remove empty tokens
4080  inline void Corpus::removeEmpty(Tokens& from) {
4081  from.erase(
4082  std::remove_if(
4083  from.begin(),
4084  from.end(),
4085  [](const auto& str) {
4086  return str.empty();
4087  }
4088  ),
4089  from.end()
4090  );
4091  }
4092 
4093  // remove token from an article or date map
4094  inline void Corpus::removeToken(TextMap& map, std::size_t entryIndex, bool& emptiedTo) {
4095  if(TextMapEntry::length(map.at(entryIndex)) == 0) {
4096  throw Exception(
4097  "Corpus::removeToken():"
4098  " Could not remove token from map:"
4099  " Map entry is already empty."
4100  );
4101  }
4102 
4103  // update length of map entry
4104  --TextMapEntry::length(map.at(entryIndex));
4105 
4106  // check whether map entry is empty
4107  if(TextMapEntry::length(map.at(entryIndex)) == 0) {
4108  emptiedTo = true;
4109  }
4110  }
4111 
4112  // remove token from a sentence map entry
4113  inline void Corpus::removeToken(SentenceMapEntry& entry, bool& emptiedTo) {
4114  if(entry.second == 0) {
4115  throw Exception(
4116  "Corpus::removeToken():"
4117  " Could not remove token from sentence:"
4118  " Sentence is already empty."
4119  );
4120  }
4121 
4122  // update length of sentence
4123  --(entry.second);
4124 
4125  // check whether sentence is empty
4126  if(entry.second == 0) {
4127  emptiedTo = true;
4128  }
4129  }
4130 
// get a valid end of the current chunk (without cutting off UTF-8 characters), throws Corpus::Exception
// NOTE: the result is between (maxLength - 3) and maxLength, although at least zero
//  source: the string to be sliced
//  pos: the starting position of the chunk inside the source
//  maxLength: the maximum length of the chunk, shortened if it would split a UTF-8 character
//  maxChunkSize: the maximum allowed chunk size (must be > 0 and >= maxLength)
inline std::size_t Corpus::getValidLengthOfChunk(
		const std::string& source,
		std::size_t pos,
		std::size_t maxLength,
		std::size_t maxChunkSize
) {
	// check arguments
	if(maxLength > maxChunkSize) {
		Corpus::exceptionInvalidMaxChunkSize(maxLength, maxChunkSize);
	}

	if(maxChunkSize == 0) {
		throw Exception(
				"Corpus::getValidLengthOfChunk():"
				" Invalid maximum chunk size of zero"
		);
	}

	if(maxLength == 0) {
		return 0;
	}

	// cut a maximum of three bytes
	//  (a UTF-8 code point is at most utf8MaxBytes = 4 bytes long, so cutting
	//   up to 3 trailing bytes suffices to avoid splitting a character)
	std::uint8_t cut{};

	for(; cut < utf8MaxBytes; ++cut) {
		if(cut > maxLength) {
			break;
		}

		// check last four of the remaining characters (if available)
		const auto maxBack{static_cast<std::uint8_t>(cut + utf8MaxBytes)};
		const auto checkFrom{maxLength > maxBack ? pos + maxLength - maxBack : pos};
		const auto checkLength{maxLength > maxBack ? utf8MaxBytes : maxLength - cut};

		if(Helper::Utf8::isLastCharValidUtf8(source.substr(checkFrom, checkLength))) {
			// cutting 'cut' bytes yields a valid UTF-8 boundary
			return maxLength - cut;
		}
	}

	if(cut == utf8MaxBytes) {
		// no valid boundary found within the maximum UTF-8 character length
		throw Exception(
				"Corpus::getValidLengthOfChunk():"
				" Could not slice corpus"
				" because of invalid UTF-8 character"
		);
	}

	if(maxLength >= maxChunkSize) {
		throw Exception(
				"Corpus::getValidLengthOfChunk():"
				" The chunk size is too small"
				" to slice a corpus with UTF-8 character(s)"
		);
	}

	return 0;
}
4191 
// get a valid end of the current chunk (without cutting off UTF-8 characters), throws Corpus::Exception
// NOTE: the result is between (maxLength - 3) and maxLength, although at least zero
//  convenience overload: checks the whole chunk content from its beginning,
//  with the maximum length equal to the maximum chunk size
inline std::size_t Corpus::getValidLengthOfChunk(
		const std::string& chunkContent,
		std::size_t maxChunkSize
) {
	return Corpus::getValidLengthOfChunk(chunkContent, 0, maxChunkSize, maxChunkSize);
}
4200 
4201  // check whether any token contains a newline
4202  inline void Corpus::checkTokensForChunking(const Tokens& tokens) {
4203  if(
4204  std::any_of(tokens.begin(), tokens.end(), [](const auto& token) {
4205  return std::any_of(token.begin(), token.end(), [](const auto c) {
4206  return c == '\n';
4207  });
4208  })
4209  ) {
4210  throw Exception(
4211  "Corpus::copyChunksTokenized():"
4212  " Cannot split corpus into chunks"
4213  " as one of its tokens contains a newline"
4214  );
4215  }
4216  }
4217 
4218  // reserve memory for chunks
4219  inline void Corpus::reserveChunks(
4220  std::size_t chunks,
4221  Tokens& to,
4222  Sizes& tokenNumsTo,
4223  std::vector<TextMap>& articleMapsTo,
4224  std::vector<TextMap>& dateMapsTo,
4225  std::vector<SentenceMap>& sentenceMapsTo,
4226  bool hasArticleMap,
4227  bool hasDateMap
4228  ) {
4229  to.reserve(to.size() + chunks);
4230 
4231  if(hasArticleMap) {
4232  articleMapsTo.reserve(articleMapsTo.size() + chunks);
4233  }
4234 
4235  if(hasDateMap) {
4236  dateMapsTo.reserve(dateMapsTo.size() + chunks);
4237  }
4238 
4239  sentenceMapsTo.reserve(sentenceMapsTo.size() + chunks);
4240  tokenNumsTo.reserve(tokenNumsTo.size() + chunks);
4241  }
4242 
// check current sentence for map entry while filling tokenized chunk
//  type: name of the map ("article" or "date"), used in error messages only
//  sentence: the sentence currently being written to the chunk
//  nextIndex: in/out index of the next unconsumed entry in the source map
//  map: the source (article or date) map of the whole corpus
//  chunkOffset: token offset of the current chunk inside the corpus
//  chunkMap: the chunk-local map receiving entries (positions made chunk-relative)
//  checkConsistency: if set, throw when an entry starts before the current sentence
inline void Corpus::checkForEntry(
		std::string_view type,
		const SentenceMapEntry& sentence,
		std::size_t& nextIndex,
		const TextMap& map,
		std::size_t chunkOffset,
		TextMap& chunkMap,
		bool checkConsistency
) {
	if(nextIndex > map.size()) {
		throw Exception(
				"Corpus::copyChunksTokenized():"
				" Skipped beyond end of last "
				+ std::string(type)
		);
	}

	if(nextIndex == map.size()) {
		// all entries of the source map already consumed
		return;
	}

	// consume every entry that begins exactly at the current sentence
	while(
			TextMapEntry::pos(map.at(nextIndex))
			== TextMapEntry::pos(sentence)
	) {
		const auto& next{map.at(nextIndex)};

		// only copy non-empty entries, translating to chunk-relative positions
		if(TextMapEntry::length(next) > 0) {
			chunkMap.emplace_back(
					TextMapEntry::pos(next) - chunkOffset,
					TextMapEntry::length(next),
					next.value
			);
		}

		++nextIndex;

		if(nextIndex == map.size()) {
			break;
		}
	}

	// consistency check: the next entry must not begin BEFORE the current
	//  sentence (entries are expected to align with sentence starts)
	if(
			checkConsistency
			&& nextIndex < map.size()
			&& TextMapEntry::pos(map.at(nextIndex))
			< TextMapEntry::pos(sentence)
	) {
		const auto& next{map.at(nextIndex)};

		Corpus::exceptionUnexpectedBeforeSentence(
				type,
				next.value,
				TextMapEntry::pos(next),
				TextMapEntry::pos(sentence)
		);
	}
}
4302 
4303  // finish chunk
4304  inline void Corpus::finishChunk(
4305  std::string& contentFrom,
4306  SentenceMap& sentencesFrom,
4307  Tokens& contentTo,
4308  Sizes& tokenNumTo,
4309  std::vector<SentenceMap>& sentencesTo,
4310  std::size_t chunkTokens,
4311  std::size_t& chunkOffset,
4312  bool splitToken,
4313  std::size_t nextChunkSize
4314  ) {
4315  // move content
4316  contentTo.emplace_back(std::move(contentFrom));
4317 
4318  contentFrom.clear();
4319 
4320  if(nextChunkSize > 0) {
4321  contentFrom.reserve(nextChunkSize);
4322  }
4323 
4324  // copy sentences
4325  sentencesTo.emplace_back(sentencesFrom);
4326 
4327  sentencesFrom.clear();
4328 
4329  // add token count
4330  tokenNumTo.push_back(chunkTokens + (splitToken ? 1 : 0));
4331 
4332  // update chunk offset
4333  chunkOffset += chunkTokens;
4334  }
4335 
// check whether to split current text map entry for chunking
//  map: the chunk-local map whose last entry may need splitting
//  token: the index of the first token belonging to the NEXT chunk
//  splitToken: whether the token at the chunk border is itself split
//  remainingTo: receives the part of the entry carried over to the next chunk
inline void Corpus::splitEntry(
		TextMap& map,
		std::size_t token,
		bool splitToken,
		TextMapEntry& remainingTo
) {
	if(map.empty()) {
		return;
	}

	const auto end{TextMapEntry::end(map.back())};

	// split if the entry reaches into the next chunk, or ends exactly at the
	//  border while the border token itself is split across chunks
	if(end > token || (end == token && splitToken)) {
		// carry the remainder over to the next chunk
		TextMapEntry::length(remainingTo) = end - token;
		TextMapEntry::length(map.back()) -= TextMapEntry::length(remainingTo);

		if(splitToken) {
			// the split token counts for both sides of the border
			++TextMapEntry::length(map.back());
		}

		remainingTo.value = map.back().value;
	}
}
4360 
// finish text map for current chunk
//  from: the chunk-local map, moved into 'to' after trailing empty entries are dropped
//  to: receives the finished map of the chunk
//  remaining: entry carried over from the previous chunk (see splitEntry());
//   if non-empty, it becomes the first entry of the next chunk's map
inline void Corpus::finishMap(TextMap& from, std::vector<TextMap>& to, TextMapEntry& remaining) {
	// drop empty entries at the end of the map
	while(!from.empty() && TextMapEntry::length(from.back()) == 0) {
		from.pop_back();
	}

	to.emplace_back(std::move(from));

	// ensure a defined (empty) state after the move
	from.clear();

	if(TextMapEntry::length(remaining) > 0) {
		from.emplace_back(std::move(remaining));

		// NOTE(review): free() is called on the just-moved-from entry —
		//  presumably it resets 'remaining' to an empty state; verify
		//  against Helper::Memory::free()
		Helper::Memory::free(remaining);
	}
}
4377 
4378  // check that the specified value is not set, throw an exception otherwise
4379  inline void Corpus::notUsed(
4380  std::string_view type,
4381  const Tokens& values,
4382  std::size_t index
4383  ) {
4384  if(!values.at(index).empty()) {
4385  std::string typeCapitalized(type);
4386 
4387  if(!typeCapitalized.empty()) {
4388  typeCapitalized[0] = std::toupper(typeCapitalized[0]);
4389  }
4390 
4391  throw Exception(
4392  "Corpus::tokenize():"
4393  " "
4394  + typeCapitalized
4395  + " ('"
4396  + values.at(index)
4397  + "') set but not used by manipulator #"
4398  + std::to_string(index + 1)
4399  );
4400  }
4401  }
4402 
// add map from chunk to (tokenized) corpus
//  from: optional chunk-local (article or date) map; nothing happens if unset or empty
//  to: the corpus-wide map receiving the entries
//  offset: token offset of the chunk inside the corpus
//  splitToken: whether the token at the chunk border is split across chunks
//   (a split token would otherwise be counted by both adjoining entries)
inline void Corpus::addChunkMap(
		const std::optional<std::reference_wrapper<const TextMap>>& from,
		TextMap& to,
		std::size_t offset,
		bool splitToken
) {
	if(!from) {
		return;
	}

	if(from.value().get().empty()) {
		return;
	}

	bool skip{false};

	// if the first entry continues the last entry of the target map
	//  (same value, e.g. same article ID or date), merge the two
	if(!to.empty() && to.back().value == from.value().get().at(0).value) {
		/* combine last with current map */
		TextMapEntry::length(to.back()) += TextMapEntry::length(from.value().get().at(0));

		if(splitToken) {
			/* remove second part of splitted token from length */
			--TextMapEntry::length(to.back());
		}

		skip = true;
	}

	for(const auto& entry : from.value().get()) {
		if(skip) {
			/* skip first map entry */
			skip = false;

			continue;
		}

		to.emplace_back(entry);

		// translate chunk-local position into corpus-wide position
		TextMapEntry::pos(to.back()) += offset;
	}
}
4445 
// check article or date map for inconsistencies, throws Corpus::Exception
//  function: name of the calling function, used in error messages
//  name: name of the map ("article map" or "date map"), used in error messages
//  map: the map to check; entries must be contiguous, i.e. each entry must start
//   where the previous one ended (plus one separating space if not tokenized)
//  end: expected end of the last entry (token count if tokenized, byte count otherwise)
//  isTokenized: whether positions are token-based (no separators between entries)
//  isDateMap: if set, additionally check that every value is a date of length
//   dateLength (YYYY-MM-DD)
inline void Corpus::checkMap(
		std::string_view function,
		std::string_view name,
		const TextMap& map,
		std::size_t end,
		bool isTokenized,
		bool isDateMap
) {
	// check the argument
	if(map.empty()) {
		return;
	}

	// check the start positions of all entries in the map
	std::size_t last{};

	for(const auto& entry : map) {
		// NOTE: entries starting at position zero are not checked against 'last'
		if(last > 0 && TextMapEntry::pos(entry) != last) {
			Corpus::exceptionInvalidPosition(
					function,
					TextMapEntry::pos(entry),
					last,
					name
			);
		}

		last = TextMapEntry::end(entry);

		if(!isTokenized) {
			// account for the space separating entries in a continuous corpus
			++last;
		}

		if(isDateMap && entry.value.length() != dateLength) {
			Corpus::exceptionInvalidDate(
					function,
					entry.value,
					name
			);
		}
	}

	// check the end position of the last entry in the map
	const auto& back{map.back()};

	if(TextMapEntry::end(back) != end) {
		Corpus::exceptionInvalidEnd(
				function,
				TextMapEntry::end(back),
				end,
				name
		);
	}
}
4500 
4501  // check sentence map for inconsistencies, throws Corpus::Exception
4502  inline void Corpus::checkMap(
4503  std::string_view function,
4504  const SentenceMap& map,
4505  std::size_t end,
4506  bool isTokenized
4507  ) {
4508  // check the argument
4509  if(map.empty()) {
4510  return;
4511  }
4512 
4513  // check the start positions of all entries in the map
4514  std::size_t last{};
4515 
4516  for(const auto& entry : map) {
4517  if(TextMapEntry::pos(entry) != last) {
4518  Corpus::exceptionInvalidPosition(
4519  function,
4520  TextMapEntry::pos(entry),
4521  last,
4522  "sentence map"
4523  );
4524  }
4525 
4526  last = TextMapEntry::end(entry);
4527 
4528  if(!isTokenized) {
4529  ++last;
4530  }
4531  }
4532 
4533  // check the end position of the last entry in the map
4534  const auto& back{map.back()};
4535 
4536  if(TextMapEntry::end(back) != end) {
4537  Corpus::exceptionInvalidEnd(
4538  function,
4539  TextMapEntry::end(back),
4540  end,
4541  "sentence map"
4542  );
4543  }
4544  }
4545 
	// skip map entries before current position
	/*
	 * Advances entryIndex (and the cached entryEnd) past all entries that end
	 *  at or before pos, as well as past empty entries. If inEntryTo is set,
	 *  the current entry counts as already entered, so the index is advanced
	 *  on the first iteration as well; the flag is reset whenever at least one
	 *  entry has been skipped.
	 */
	inline void Corpus::skipEntriesBefore(
			const TextMap& map,
			std::size_t& entryIndex,
			std::size_t& entryEnd,
			std::size_t pos,
			bool& inEntryTo
	) {
		bool increaseIndex{inEntryTo}; /* advance immediately only if the current entry was already entered */
		bool skipped{false};

		while(
				entryIndex < map.size()
				&& (entryEnd <= pos || TextMapEntry::length(map[entryIndex]) == 0)
		) {
			if(increaseIndex) {
				++entryIndex;
			}
			else {
				// first iteration: re-check the current entry before advancing
				increaseIndex = true;
			}

			entryEnd = Corpus::getEntryEnd(map, entryIndex);

			skipped = true;
		}

		if(skipped) {
			// the previous entry has been left behind
			inEntryTo = false;
		}
	}
4577 
4578  // get the end of the first article/date, regardless of whether it is in the map or not
4579  inline std::size_t Corpus::getFirstEnd(const TextMap& map) {
4580  if(!map.empty()) {
4581  if(TextMapEntry::pos(map[0]) > 0) {
4582  return TextMapEntry::pos(map[0]);
4583  }
4584 
4585  return TextMapEntry::length(map[0]);
4586  }
4587 
4588  return 0;
4589  }
4590 
4591  // get the end of a text map entry with the given index (or the end of the map)
4592  inline std::size_t Corpus::getEntryEnd(const TextMap& map, std::size_t entryIndex) {
4593  if(map.empty()) {
4594  return 0;
4595  }
4596 
4597  if(entryIndex < map.size()) {
4598  return TextMapEntry::end(map[entryIndex]);
4599  }
4600 
4601  return TextMapEntry::end(map.back());
4602  }
4603 
	// process sentence for tokenization of the corpus
	/*
	 * Applies the optional callback to the tokens of the sentence, removes
	 *  tokens that were emptied by it (shrinking the current article and date
	 *  map entries accordingly), records the non-empty sentence in the
	 *  sentence map, and moves its tokens into the token storage of the
	 *  corpus. Finally resets the sentence and remembers the first token of
	 *  the next sentence.
	 */
	inline void Corpus::processSentence(
			Tokens& sentence,
			const std::optional<SentenceFunc>& callback,
			bool inArticle,
			bool inDate,
			std::size_t& currentToken,
			std::size_t& sentenceFirstToken,
			TextMap& articleMap,
			TextMap& dateMap,
			/* NOTE(review): an additional parameter — the sentence map written to below — is elided from this view; confirm against the full source */
			Tokens& tokens,
			std::size_t& tokenBytes
	) {
		if(callback) {
			// modify sentence (or its tokens), if necessary
			(*callback)(sentence.begin(), sentence.end());
		}

		// modify tokens of the sentence, do not keep emptied tokens
		for(auto tokenIt{sentence.begin()}; tokenIt != sentence.end(); ) {
			if(tokenIt->empty()) {
				// remove empty token
				tokenIt = sentence.erase(tokenIt);

				--currentToken;

				// shrink article and date, if necessary
				if(inArticle) {
					--TextMapEntry::length(articleMap.back());

					// drop the article entry if it has become empty
					if(TextMapEntry::length(articleMap.back()) == 0) {
						articleMap.pop_back();
					}
				}

				if(inDate) {
					--TextMapEntry::length(dateMap.back());

					// drop the date entry if it has become empty
					if(TextMapEntry::length(dateMap.back()) == 0) {
						dateMap.pop_back();
					}
				}
			}
			else {
				// count the bytes of the kept token
				tokenBytes += tokenIt->size();

				++tokenIt;
			}
		}

		if(!sentence.empty()) {
			// add sentence to map
			sentenceMap.emplace_back(
					sentenceFirstToken,
					sentence.size()
			);

			// move the tokens in the finished sentence into the tokens of the corpus
			Helper::Container::moveInto(tokens, sentence);
		}

		sentence.clear();

		sentenceFirstToken = currentToken; /* (= already next token) */
	}
4670 
	// add corpus to combined corpus, return whether thread is still running
	/*
	 * Requires the source corpus to be tokenized, otherwise throws
	 *  Corpus::Exception. Updates the thread status for the current merge
	 *  step, adds all sentences of the source to the combined target, then
	 *  clears the source corpus to free memory. Returns false if the thread
	 *  has been cancelled while merging.
	 */
	inline bool Corpus::addCorpus(
			Corpus& from,
			/* NOTE(review): an additional (target) parameter — used as 'to' below — is elided from this view; confirm against the full source */
			std::size_t number,
			std::size_t total,
			StatusSetter& statusSetter
	) {
		// nothing to add from an empty corpus
		if(from.empty()) {
			return true;
		}

		if(!from.tokenized) {
			throw Exception(
					"Corpus::Corpus():"
					" All sources need to be tokenized."
			);
		}

		// set status and add sentences to combined corpus
		const bool isRunning{
			statusSetter.change(Corpus::mergingStatus(number, total))
			&& Corpus::addSentences(from, to, statusSetter)
		};

		statusSetter.finish();

		// free the source corpus
		from.clear();

		return isRunning;
	}
4702 
	// add sentences from corpus to combined corpus, return whether thread is still running
	/*
	 * Walks over all sentences of the (tokenized) source corpus, tracking the
	 *  article and date each sentence belongs to via the source's article and
	 *  date maps, and copies the sentences — grouped by date and article —
	 *  into the combined target. Updates the thread status every
	 *  mergeUpdateEvery sentences; returns false if the thread has been
	 *  cancelled in the meantime.
	 */
	inline bool Corpus::addSentences(
			Corpus& from,
			/* NOTE(review): an additional (target) parameter — used as 'to' below — is elided from this view; confirm against the full source */
			StatusSetter& statusSetter
	) {
		std::size_t articleIndex{};
		std::size_t dateIndex{};
		std::size_t articleEnd{Corpus::getFirstEnd(from.articleMap)};
		std::size_t dateEnd{Corpus::getFirstEnd(from.dateMap)};
		bool inArticle{false};
		bool inDate{false};
		std::size_t sentenceCounter{};
		std::size_t statusCounter{};
		std::string article;
		std::string date;
		std::vector<Tokens> content;

		// go through all sentences
		for(auto& sentence : from.sentenceMap) {
			// skip articles and dates before current sentence
			// (including last article and date, if finished)
			Corpus::skipEntriesBefore(
					from.articleMap,
					articleIndex,
					articleEnd,
					TextMapEntry::pos(sentence),
					inArticle
			);
			Corpus::skipEntriesBefore(
					from.dateMap,
					dateIndex,
					dateEnd,
					TextMapEntry::pos(sentence),
					inDate
			);

			// check for beginning of article and/or date
			if(Corpus::entryBeginsAt(from.articleMap, articleIndex, TextMapEntry::pos(sentence))) {
				// finish last article
				Corpus::finishArticle(
						content,
						to,
						date,
						article
				);

				// get next article
				Corpus::nextEntry(
						from.articleMap,
						articleIndex,
						article,
						articleEnd,
						from.tokens.size()
				);

				inArticle = true;
			}
			else if(!inArticle) {
				// sentence belongs to no article
				article = "";
			}

			if(Corpus::entryBeginsAt(from.dateMap, dateIndex, TextMapEntry::pos(sentence))) {
				// get next date
				Corpus::nextEntry(
						from.dateMap,
						dateIndex,
						date,
						dateEnd,
						from.tokens.size()
				);

				inDate = true;
			}
			else if(!inDate) {
				// sentence belongs to no date
				date = "";
			}

			// add sentence to content
			content.emplace_back(
					from.tokens.begin() + TextMapEntry::pos(sentence),
					from.tokens.begin() + TextMapEntry::end(sentence)
			);

			// update status
			++sentenceCounter;
			++statusCounter;

			if(statusCounter == mergeUpdateEvery) {
				if(!statusSetter.update(sentenceCounter, from.sentenceMap.size(), true)) {
					// thread has been cancelled
					return false;
				}

				statusCounter = 0;
			}
		}

		// finish last article
		Corpus::finishArticle(
				content,
				to,
				date,
				article
		);

		return true;
	}
4810 
	// append or add article to combined corpus
	/*
	 * Moves the collected sentences of an article into the combined corpus
	 *  under the given date and article name (creating the date/article slot
	 *  if necessary), then frees the source container. Does nothing if no
	 *  sentences have been collected.
	 */
	inline void Corpus::finishArticle(
			std::vector<Tokens>& from,
			/* NOTE(review): an additional (target) parameter — used as 'to' below — is elided from this view; confirm against the full source */
			const std::string& date,
			const std::string& article
	) {
		if(from.empty()) {
			return;
		}

		Helper::Container::moveInto(to[date][article], from); /* (inserts date/article if necessary) */
		Helper::Memory::free(from);
	}
4825 
4826  // go to next article or date to be added to the combined corpus
4827  inline void Corpus::nextEntry(
4828  const TextMap& map,
4829  std::size_t index,
4830  std::string& nameTo,
4831  std::size_t& endTo,
4832  std::size_t corpusEnd
4833  ) {
4834  if(index < map.size()) {
4835  nameTo = map[index].value;
4836  endTo = TextMapEntry::end(map[index]);
4837  }
4838  else {
4839  nameTo = "";
4840  endTo = corpusEnd;
4841  }
4842  }
4843 
	// push as much of a (remaining) sentence into a chunk as possible,
	// return whether the chunk is full, throws Corpus::Exception
	/*
	 * Appends the (rest of the) given sentence to the current chunk, token by
	 *  token, each followed by a newline. tokensComplete counts the tokens
	 *  fully written so far; additionalBytes holds the number of bytes of the
	 *  current token already written to the previous chunk (i.e. the token
	 *  was split). Returns true as soon as the chunk is full. Throws if even
	 *  a single token does not fit into an empty chunk.
	 */
	inline bool Corpus::pushSentence(
			const SentenceMapEntry& sentence,
			std::size_t chunkSize,
			std::size_t chunkOffset,
			std::size_t& chunkTokens,
			std::string& chunkContent,
			SentenceMap& chunkSentences,
			const Tokens& tokens,
			std::size_t& tokensComplete,
			std::size_t& additionalBytes
	) {
		auto bytesBefore{additionalBytes}; /* to detect whether anything has been written */

		// add sentence to chunk
		const auto sentenceOffset{
			tokensComplete - TextMapEntry::pos(sentence)
		}; /* number of tokens of this sentence already in a previous chunk */

		chunkSentences.emplace_back(
				TextMapEntry::pos(sentence) + sentenceOffset - chunkOffset,
				TextMapEntry::length(sentence) - sentenceOffset
		);

		// add tokens to chunk
		for(std::size_t token{tokensComplete}; token < TextMapEntry::end(sentence); ++token) {
			// get (remaining) token
			const auto oldSize{chunkContent.size()};

			chunkContent += (
					additionalBytes > 0 ? tokens.at(token).substr(additionalBytes)
					: tokens.at(token)
			);

			chunkContent.push_back('\n');

			if(chunkContent.size() > chunkSize) {
				/* (remaining) token does not fit into chunk completely */
				// cut the chunk back to a valid (UTF-8-safe) length
				const auto size{Corpus::getValidLengthOfChunk(chunkContent, chunkSize)};

				chunkContent.erase(size);

				// remember how many bytes of the current token have been written
				additionalBytes += chunkContent.size() - oldSize;

				if(token == TextMapEntry::pos(sentence) + sentenceOffset && additionalBytes == bytesBefore) {
					/* no content from current sentence has been added */
					chunkSentences.pop_back();

					if(tokensComplete == chunkOffset) {
						// not even one token fits into an empty chunk
						throw Exception(
								"Corpus::copyChunksTokenized():"
								" Separating tokens into chunks failed - chunk size too small?"
						);
					}
				}

				return true;
			}

			/* (remaining) token fits into chunk completely */
			additionalBytes = 0;
			bytesBefore = 0;

			++tokensComplete;
			++chunkTokens;
		}

		return false;
	}
4914 
4915  // get string for thread status
4916  inline std::string Corpus::mergingStatus(std::size_t number, std::size_t total) {
4917  std::ostringstream status;
4918 
4919  Corpus::locale(status);
4920 
4921  status << "Merging corpora (";
4922  status << number;
4923  status << "/";
4924  status << total;
4925  status << ")...";
4926 
4927  return status.str();
4928  }
4929 
	// set locale for output streams
	/*
	 * Imbues the given stream with the comma locale, so that numbers are
	 *  formatted with thousands separators in status and exception messages.
	 */
	inline void Corpus::locale(std::ostream& os) {
		os.imbue(Helper::CommaLocale::locale());
	}
4934 
4935  /*
4936  * INTERNAL STATIC HELPER FUNCTIONS FOR EXCEPTION HANDLING (private)
4937  */
4938 
4939  // exception when trying to get an article: article map is empty
4940  inline void Corpus::exceptionGetNoArticleMap(
4941  std::string_view function,
4942  std::size_t article
4943  ) {
4944  std::ostringstream exception;
4945 
4946  Corpus::locale(exception);
4947 
4948  exception << "Corpus::";
4949  exception << function;
4950  exception << "(): Article #";
4951  exception << article;
4952  exception << " requested, but the article map is empty";
4953 
4954  throw Exception(exception.str());
4955  }
4956 
4957  // exception when trying to get an article: article is out of the article map's bounds
4958  inline void Corpus::exceptionArticleOutOfBounds(
4959  std::string_view function,
4960  std::size_t article,
4961  std::size_t size
4962  ) {
4963  std::ostringstream exception;
4964 
4965  Corpus::locale(exception);
4966 
4967  exception << "Corpus::";
4968  exception << function;
4969  exception << "(): The specified article index (#";
4970  exception << article;
4971  exception << ") is out of bounds [#0;#";
4972  exception << size - 1;
4973  exception << "]";
4974 
4975  throw Exception(exception.str());
4976  }
4977 
4978  // exception when trying to get a date: invalid date length
4979  inline void Corpus::exceptionDateLength(
4980  std::string_view function,
4981  std::size_t length
4982  ) {
4983  std::ostringstream exception;
4984 
4985  Corpus::locale(exception);
4986 
4987  exception << "Corpus::";
4988  exception << function;
4989  exception << "(): Invalid length of date (";
4990  exception << length;
4991  exception << " instead of ";
4992  exception << dateLength;
4993  exception << ")";
4994 
4995  throw Exception(exception.str());
4996  }
4997 
4998  // exception when combining chunks: article map of chunk does not start at its beginning
4999  inline void Corpus::exceptionArticleMapStart(
5000  std::string_view function,
5001  std::string_view expected,
5002  std::size_t chunkIndex,
5003  std::size_t numberOfChunks,
5004  std::size_t start
5005  ) {
5006  std::ostringstream exception;
5007 
5008  Corpus::locale(exception);
5009 
5010  exception << "Corpus::";
5011  exception << function;
5012  exception << "(): Article map in corpus chunk ";
5013  exception << chunkIndex + 1;
5014  exception << "/";
5015  exception << numberOfChunks;
5016  exception << " starts at #";
5017  exception << start;
5018  exception << " instead of ";
5019  exception << expected;
5020 
5021  throw Exception(exception.str());
5022  }
5023 
5024  // exception when combining tokenized chunks: length of last sentence exceeeds length of corpus
5025  inline void Corpus::exceptionLastSentenceLength(
5026  std::size_t pos,
5027  std::size_t length,
5028  std::size_t corpusSize
5029  ) {
5030  std::ostringstream exception;
5031 
5032  Corpus::locale(exception);
5033 
5034  exception << "Corpus::combineTokenized(): Length of last sentence (";
5035  exception << pos;
5036  exception << " + ";
5037  exception << length;
5038  exception << " [";
5039  exception << pos + length;
5040  exception << "]) exceeds length of corpus (";
5041  exception << corpusSize;
5042  exception << ")";
5043 
5044  throw Exception(exception.str());
5045  }
5046 
5047  // exception when copying chunks: article lies behind its date
5048  inline void Corpus::exceptionArticleBehindDate(
5049  std::size_t articlePos,
5050  std::size_t datePos,
5051  std::size_t dateEnd
5052  ) {
5053  std::ostringstream exception;
5054 
5055  Corpus::locale(exception);
5056 
5057  exception << "Corpus::copyChunksContinuous(): Article position (#";
5058  exception << articlePos;
5059  exception << ") lies behind date at [#";
5060  exception << datePos;
5061  exception << ";#";
5062  exception << dateEnd;
5063  exception << "]";
5064 
5065  throw Exception(exception.str());
5066  }
5067 
5068  // exception when copying chunks: chunk size is too large
5069  inline void Corpus::exceptionChunkSize(std::size_t size, std::size_t chunkSize) {
5070  std::ostringstream exception;
5071 
5072  Corpus::locale(exception);
5073 
5074  exception << "Corpus::copyChunksContinuous(): Chunk is too large:";
5075  exception << size;
5076  exception << " > ";
5077  exception << chunkSize;
5078 
5079  throw Exception(exception.str());
5080  }
5081 
5082  // exception when copying chunks: end of articles reached before corpus ends
5083  inline void Corpus::exceptionArticleMapEnd(std::size_t pos, std::size_t size) {
5084  std::ostringstream exception;
5085 
5086  Corpus::locale(exception);
5087 
5088  exception << "Corpus::copyChunksContinuous(): End of articles, but not of corpus ( #";
5089  exception << pos;
5090  exception << " < #";
5091  exception << size;
5092  exception << ")";
5093 
5094  throw Exception(exception.str());
5095  }
5096 
5097  // exception when copying tokenized chunks: article or date begins before current sentence
5098  inline void Corpus::exceptionUnexpectedBeforeSentence(
5099  std::string_view type,
5100  std::string_view name,
5101  std::size_t pos,
5102  std::size_t sentencePos
5103  ) {
5104  std::ostringstream exception;
5105 
5106  Corpus::locale(exception);
5107 
5108  exception << "Corpus::copyChunksTokenized(): Unexpected begin of ";
5109  exception << type;
5110  exception << " '";
5111  exception << name;
5112  exception << "' (@";
5113  exception << pos;
5114  exception << ") before the beginning of the current sentence (@";
5115  exception << sentencePos;
5116  exception << ")";
5117 
5118  throw Exception(exception.str());
5119  }
5120 
5121  // exception when filtering corpus by date: mismatch between article or sentence and date position
5122  inline void Corpus::exceptionMismatchWithDate(
5123  std::string_view type,
5124  std::size_t pos,
5125  std::size_t datePos
5126  ) {
5127  std::ostringstream exception;
5128 
5129  Corpus::locale(exception);
5130 
5131  exception << "Corpus::filterByDate(): Mismatch between positions of ";
5132  exception << type;
5133  exception << " (@ #";
5134  exception << pos;
5135  exception << ") and date (@ #";
5136  exception << datePos;
5137  exception << ") in ";
5138  exception << type;
5139  exception << " and date map of the corpus";
5140 
5141  throw Exception(exception.str());
5142  }
5143 
5144  // exception when filtering corpus by date: date lies behind last article or sentence
5145  inline void Corpus::exceptionDateBehindLast(
5146  std::string_view type,
5147  std::size_t datePos,
5148  std::size_t lastPos
5149  ) {
5150  std::ostringstream exception;
5151 
5152  Corpus::locale(exception);
5153 
5154  exception << "Corpus::filterByDate(): Position of identified date (@ #";
5155  exception << datePos;
5156  exception << ") is behind the position of the last ";
5157  exception << type;
5158  exception << " (@ #";
5159  exception << lastPos;
5160  exception << ") in ";
5161  exception << type;
5162  exception << " and date map of the corpus";
5163 
5164  throw Exception(exception.str());
5165  }
5166 
	// exception when checking tokenized corpus: end of sentence behind date or article
	/*
	 * Throws Corpus::Exception with a detailed message: the offending
	 *  sentence and entry, their end positions (including the token found
	 *  there, or [BEGIN]/[BEHIND] markers for out-of-range positions), the
	 *  full text of the sentence, and — if available — the next entry in
	 *  the map.
	 */
	inline void Corpus::exceptionSentenceBehind(
			std::string_view function,
			std::string_view type,
			const std::pair<std::size_t, std::size_t>& sentence,
			const TextMapEntry& entry,
			const TextMap& map,
			const TextMap::const_iterator& next,
			const Tokens& tokens
	) {
		std::ostringstream exception;

		const auto sentenceEnd{TextMapEntry::end(sentence)};
		const auto entryEnd{TextMapEntry::end(entry)};

		Corpus::locale(exception);

		exception << "Corpus::";
		exception << function;
		exception << "(): End of sentence (l=";
		exception << sentence.second;
		exception << ") is behind end of ";
		exception << type;
		exception << " '";
		exception << entry.value;
		exception << "' (l=";
		exception << TextMapEntry::length(entry);
		exception << "): ";
		exception << sentenceEnd;

		// describe the token at the end of the sentence, if any
		if(sentenceEnd > 0 && sentenceEnd <= tokens.size()) {
			exception << " ['";
			exception << tokens.at(sentenceEnd - 1);
			exception << "']";
		}
		else if(sentenceEnd == 0) {
			exception << " [BEGIN]";
		}
		else {
			exception << " [BEHIND]";
		}

		exception << " > ";
		exception << entryEnd;

		// describe the token at the end of the entry, if any
		if(entryEnd > 0 && entryEnd <= tokens.size()) {
			exception << " ['";
			exception << tokens.at(entryEnd - 1);
			exception << "']";
		}
		else if(entryEnd == 0) {
			exception << " [BEGIN]";
		}
		else {
			exception << " [BEHIND]";
		}

		exception << " (";
		exception << "sentence: '";

		// print the tokens of the sentence, separated by spaces
		bool addSpace{false};

		for(std::size_t token{TextMapEntry::pos(sentence)}; token < sentenceEnd; ++token) {
			if(token < tokens.size()) {
				if(addSpace) {
					exception << ' ';
				}
				else {
					// no space before the first token
					addSpace = true;
				}

				exception << tokens.at(token);
			}
		}

		exception << "'";

		// add the next entry in the map, if there is one
		if(next != map.cend()) {
			exception << " (next ";
			exception << type;
			exception << ": '";
			exception << next->value;
			exception << "')";
		}

		throw Exception(exception.str());
	}
5254 
5255  // exception when the stored size of a tokenized corpus is wrong
5256  inline void Corpus::exceptionTokenBytes(
5257  std::string_view function,
5258  std::size_t size,
5259  std::size_t actualSize
5260  ) {
5261  std::ostringstream exception;
5262 
5263  Corpus::locale(exception);
5264 
5265  exception << "Corpus::";
5266  exception << function;
5267  exception << "(): Corpus size is set to ";
5268  exception << size;
5269  exception << "B, but actual corpus size is ";
5270  exception << actualSize;
5271  exception << "B";
5272 
5273  throw Exception(exception.str());
5274  }
5275 
5276  // exception when setting maximum chunk size: invalid maximum chunk size given
5277  inline void Corpus::exceptionInvalidMaxChunkSize(std::size_t size, std::size_t max) {
5278  std::ostringstream exception;
5279 
5280  Corpus::locale(exception);
5281 
5282  exception << "Corpus::getValidLengthOfChunk(): Invalid maximum chunk size (";
5283  exception << size;
5284  exception << " > ";
5285  exception << max;
5286  exception << ")";
5287 
5288  throw Exception(exception.str());
5289  }
5290 
5291  // exception when filtering map: invalid position
5292  inline void Corpus::exceptionPositionTooSmall(
5293  std::size_t pos,
5294  std::size_t expectedMin,
5295  std::string_view name
5296  ) {
5297  std::ostringstream exception;
5298 
5299  Corpus::locale(exception);
5300 
5301  exception << "Corpus::reTokenize(): Invalid position #";
5302  exception << pos;
5303  exception << " (expected: >= #";
5304  exception << expectedMin;
5305  exception << ") in ";
5306  exception << name;
5307 
5308  throw Exception(exception.str());
5309  }
5310 
5311  // exception when checking map: invalid position
5312  inline void Corpus::exceptionInvalidPosition(
5313  std::string_view function,
5314  std::size_t pos,
5315  std::size_t expected,
5316  std::string_view name
5317  ) {
5318  std::ostringstream exception;
5319 
5320  Corpus::locale(exception);
5321 
5322  exception << "Corpus::";
5323  exception << function;
5324  exception << "(): Invalid position #";
5325  exception << pos;
5326  exception << " (expected: #";
5327  exception << expected;
5328  exception << ") in ";
5329  exception << name;
5330 
5331  throw Exception(exception.str());
5332  }
5333 
5334  // exception when checking date map: invalid date length
5335  inline void Corpus::exceptionInvalidDate(
5336  std::string_view function,
5337  std::string_view value,
5338  std::string_view name
5339  ) {
5340  std::ostringstream exception;
5341 
5342  Corpus::locale(exception);
5343 
5344  exception << "Corpus::";
5345  exception << function;
5346  exception << "(): Invalid date in date map: '";
5347  exception << value;
5348  exception << "' (expected string of length ";
5349  exception << dateLength;
5350  exception << ") in '";
5351  exception << name;
5352  exception << "'";
5353 
5354  throw Exception(exception.str());
5355  }
5356 
5357  // exception when checking map: invalid end of last entry
5358  inline void Corpus::exceptionInvalidEnd(
5359  std::string_view function,
5360  std::size_t pos,
5361  std::size_t expected,
5362  std::string_view name
5363  ) {
5364  std::ostringstream exception;
5365 
5366  Corpus::locale(exception);
5367 
5368  exception << "Corpus::";
5369  exception << function;
5370  exception << "(): Invalid end of last entry in map at #";
5371  exception << pos;
5372  exception << " (expected: at #";
5373  exception << expected;
5374  exception << ") in ";
5375  exception << name;
5376 
5377  throw Exception(exception.str());
5378  }
5379 
5380 } /* namespace crawlservpp::Data */
5381 
5382 #endif /* DATA_CORPUS_HPP_ */
std::string getDate(const std::string &date) const
Gets all articles at the specified date from a continous text corpus.
Definition: Corpus.hpp:1145
constexpr std::uint16_t corpusManipTrim
Trim tokens by tokens found in a dictionary.
Definition: Corpus.hpp:128
std::vector< std::size_t > Sizes
Definition: Corpus.hpp:172
TextMap dateMap
Index of dates.
Definition: Corpus.hpp:331
bool isRunning() const
Checks whether the thread is still supposed to run.
Definition: StatusSetter.hpp:236
bool update(std::size_t done, std::size_t total) const
Updates the status with a fractal progress.
Definition: StatusSetter.hpp:161
TextMap & getDateMap()
Gets a reference to the date map of the corpus.
Definition: Corpus.hpp:986
static std::size_t & pos(TextMapEntry &entry)
Gets a reference to the position of a text map entry.
Definition: TextMap.hpp:172
std::size_t filterArticles(const ArticleFunc &callbackArticle, StatusSetter &statusSetter)
Filters a tokenized corpus by removing articles.
Definition: Corpus.hpp:2674
std::string get(std::size_t index) const
Gets the article with the specified index from a continous text corpus.
Definition: Corpus.hpp:1056
std::function< bool(const Tokens &, std::size_t, std::size_t)> ArticleFunc
Definition: Corpus.hpp:175
void stemGerman(std::string &token)
Stems a token in German.
Definition: German.hpp:118
static void moveInto(T &to, T &from)
Moves the elements of an iterable container into another iterable container.
Definition: Container.hpp:99
constexpr auto minSingleUtf8CharSize
Minimum length of single UTF-8 code points to remove.
Definition: Corpus.hpp:97
bool change(const std::string &statusMessage)
Changes the status message and resets the current progress.
Definition: StatusSetter.hpp:143
Class for corpus-specific exceptions.
Definition: Corpus.hpp:315
void copyChunksTokenized(std::size_t chunkSize, Tokens &to, Sizes &tokenNumsTo, std::vector< TextMap > &articleMapsTo, std::vector< TextMap > &dateMapsTo, std::vector< SentenceMap > &sentenceMapsTo) const
Copies the underlying tokenized text corpus into chunks of the given size.
Definition: Corpus.hpp:2194
const Tokens & getcTokens() const
Gets a constant reference to the tokens in a tokenized text corpus.
Definition: Corpus.hpp:923
std::vector< std::pair< std::size_t, std::size_t > > SentenceMap
Definition: Corpus.hpp:180
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr std::uint16_t corpusManipRemove
Remove single tokens found in a dictionary.
Definition: Corpus.hpp:125
Tokens getDateTokenized(const std::string &date) const
Gets the tokens of all articles at the specified date from a tokenized text corpus.
Definition: Corpus.hpp:1272
Text map entry.
Definition: TextMap.hpp:49
PositionLength SentenceMapEntry
Definition: Corpus.hpp:182
std::function< void(Tokens::iterator, Tokens::iterator)> SentenceFunc
Definition: Corpus.hpp:176
static std::locale locale()
Definition: CommaLocale.hpp:44
std::string substr(std::size_t from, std::size_t len)
Gets a substring from the corpus.
Definition: Corpus.hpp:1363
bool isLastCharValidUtf8(std::string_view stringToCheck)
Tokens tokens
Tokenized text corpus.
Definition: Corpus.hpp:325
bool hasArticleMap() const
Checks whether the corpus has an article map.
Definition: Corpus.hpp:949
std::map< std::string, std::map< std::string, std::vector< Tokens > >> DateArticleSentenceMap
Definition: Corpus.hpp:179
constexpr auto tokenizeUpdateEvery
After how many sentences the status is updated when tokenizing a corpus.
Definition: Corpus.hpp:91
std::vector< Tokens > getArticles() const
Gets the tokens of all articles from a tokenized corpus.
Definition: Corpus.hpp:1295
constexpr std::uint16_t corpusManipLemmatizer
Multilingual lemmatizer.
Definition: Corpus.hpp:122
Class representing a text corpus.
Definition: Corpus.hpp:165
void stemEnglish(std::string &token)
Stems a token in English.
Definition: English.hpp:61
const SentenceMap & getcSentenceMap() const
Gets a constant reference to the sentence map of the corpus.
Definition: Corpus.hpp:1035
constexpr std::uint16_t corpusManipCorrect
Correct single tokens using a aspell dictionary.
Definition: Corpus.hpp:131
bool hasSentenceMap() const
Checks whether the corpus has sentence map.
Definition: Corpus.hpp:1005
void copyChunksContinuous(std::size_t chunkSize, Tokens &to, std::vector< TextMap > &articleMapsTo, std::vector< TextMap > &dateMapsTo) const
Copies the underlying continous text corpus into chunks of the given size.
Definition: Corpus.hpp:1869
constexpr auto first
Index of the first byte.
Definition: Bytes.hpp:57
void clear()
Clears the corpus.
Definition: Corpus.hpp:3128
static void freeIf(bool isFree, T &target)
Frees memory early by swapping, if necessary.
Definition: Memory.hpp:52
constexpr auto dateLength
The length of a date string in the format YYYY-MM-DD.
Definition: Corpus.hpp:82
std::string & getCorpus()
Gets a reference to the continous text corpus.
Definition: Corpus.hpp:869
constexpr auto filterUpdateEvery
After how many articles the status is updated when filtering a corpus (by queries).
Definition: Corpus.hpp:94
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
bool tokenizeCustom(const std::optional< SentenceFunc > &callback, std::uint64_t freeMemoryEvery, StatusSetter &statusSetter)
Converts a text corpus into processed tokens, using custom manipulators.
Definition: Corpus.hpp:3081
constexpr auto mergeUpdateEvery
After how many sentences the status is updated when merging corpora.
Definition: Corpus.hpp:88
constexpr std::uint16_t corpusManipTagger
The POS (position of speech) tagger based on Wapiti by Thomas Lavergne.
Definition: Corpus.hpp:110
bool filterByDate(const std::string &from, const std::string &to)
Filters a text corpus by the given date(s).
Definition: Corpus.hpp:2411
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< std::string > Tokens
Definition: Corpus.hpp:173
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
void combineContinuous(Tokens &chunks, std::vector< TextMap > &articleMaps, std::vector< TextMap > &dateMaps, bool deleteInputData)
Creates continuous text corpus by combining previously separated chunks as well as their article and ...
Definition: Corpus.hpp:1499
constexpr std::uint16_t corpusManipNone
Do not manipulate anything.
Definition: Corpus.hpp:107
constexpr std::uint16_t corpusManipTaggerPosterior
The posterior POS tagger based on Wapiti by Thomas Lavergne (slow, but more accurate).
Definition: Corpus.hpp:113
Tokens & getTokens()
Gets a reference to the tokens in a tokenized text corpus.
Definition: Corpus.hpp:908
SentenceMap sentenceMap
Index of sentences.
Definition: Corpus.hpp:334
std::string corpus
Continuous text corpus.
Definition: Corpus.hpp:322
SentenceMap & getSentenceMap()
Gets a reference to the sentence map of the corpus.
Definition: Corpus.hpp:1019
std::pair< std::size_t, std::size_t > PositionLength
Definition: Corpus.hpp:181
Tokens getTokenized(std::size_t index) const
Gets the article with the specified index from a tokenized text corpus.
Definition: Corpus.hpp:1192
bool empty() const
Checks whether the corpus is empty.
Definition: Corpus.hpp:1337
bool isISODateInRange(std::string_view isoDate, std::string_view rangeFrom, std::string_view rangeTo)
Checks whether the given ISO date is in the given range of dates.
Definition: DateTime.hpp:1105
TextMap & getArticleMap()
Gets a reference to the article map of the corpus.
Definition: Corpus.hpp:958
constexpr std::uint16_t corpusManipEnglishStemmer
The porter2_stemmer algorithm for English only, implemented by Sean Massung.
Definition: Corpus.hpp:116
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
TextMap articleMap
Index of articles and their IDs.
Definition: Corpus.hpp:328
void create(Tokens &texts, bool deleteInputData)
Creates text corpus from a vector of strings.
Definition: Corpus.hpp:1386
std::size_t size() const
Gets the size of the text corpus, in bytes.
Definition: Corpus.hpp:1324
const TextMap & getcDateMap() const
Gets a constant reference to the date map of the corpus.
Definition: Corpus.hpp:995
constexpr std::uint16_t corpusManipGermanStemmer
Simple stemmer for German only, based on CISTEM by Leonie Weißweiler and Alexander Fraser...
Definition: Corpus.hpp:119
void copyContinuous(std::string &to) const
Copies the underlying continuous text corpus to the given string.
Definition: Corpus.hpp:1792
bool tokenize(const std::vector< std::uint16_t > &manipulators, const std::vector< std::string > &models, const std::vector< std::string > &dictionaries, const std::vector< std::string > &languages, std::uint64_t freeMemoryEvery, StatusSetter &statusSetter)
Converts a text corpus into processed tokens.
Definition: Corpus.hpp:2822
const TextMap & getcArticleMap() const
Gets a constant reference to the article map of the corpus.
Definition: Corpus.hpp:967
std::size_t getNumTokens() const
Gets the number of tokens in the corpus.
Definition: Corpus.hpp:937
bool hasDateMap() const
Checks whether the corpus has a date map.
Definition: Corpus.hpp:977
constexpr std::uint8_t utf8MaxBytes
Maximum number of bytes used by one UTF-8-encoded multibyte character.
Definition: Corpus.hpp:85
void finish() const
Re-sets the progress of the thread.
Definition: StatusSetter.hpp:241
std::string value
Value of the annotation.
Definition: TextMap.hpp:69
bool isTokenized() const
Gets whether the corpus has been tokenized.
Definition: Corpus.hpp:895
Corpus(bool consistencyChecks)
Constructor setting the internal property.
Definition: Corpus.hpp:807
Namespace for different types of data.
constexpr auto maxSingleUtf8CharSize
Maximum length of single UTF-8 code points to remove.
Definition: Corpus.hpp:100
const std::string & getcCorpus() const
Gets a constant reference to the continuous text corpus.
Definition: Corpus.hpp:883
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
static std::size_t & length(TextMapEntry &entry)
Gets a reference to the length of a text map entry.
Definition: TextMap.hpp:234
static std::size_t end(const T &entry)
Gets the end of a map entry.
Definition: TextMap.hpp:221
void combineTokenized(Tokens &chunks, Sizes &tokenNums, std::vector< TextMap > &articleMaps, std::vector< TextMap > &dateMaps, std::vector< SentenceMap > &sentenceMaps, bool deleteInputData)
Creates a tokenized text corpus by combining previously separated chunks, as well as their article...
Definition: Corpus.hpp:1662