35 #ifndef DATA_CORPUS_HPP_ 36 #define DATA_CORPUS_HPP_ 45 #include "../Helper/CommaLocale.hpp" 46 #include "../Helper/Container.hpp" 47 #include "../Helper/DateTime.hpp" 48 #include "../Helper/Memory.hpp" 49 #include "../Helper/Utf8.hpp" 50 #include "../Main/Exception.hpp" 51 #include "../Struct/StatusSetter.hpp" 52 #include "../Struct/TextMap.hpp" 67 #include <string_view> 172 using Sizes = std::vector<std::size_t>;
// Callback type receiving a token container plus two indices and returning a bool
//  — presumably (tokens, article start, article end) used to test an article's
//  token range; TODO confirm against the callers of this type.
175 using ArticleFunc = std::function<bool(const Tokens&, std::size_t, std::size_t)>;
// Callback type invoked once per sentence with [begin, end) iterators into the
//  token vector (see the tokenization callback, which is called with
//  sentenceBegin/sentenceEnd iterator pairs).
176 using SentenceFunc = std::function<void(Tokens::iterator, Tokens::iterator)>;
179 = std::map<std::string, std::map<std::string, std::vector<Tokens>>>;
// One entry per sentence: a pair of std::size_t values whose .first is the
//  sentence's starting token position (it is compared against map positions in
//  the consistency checks); .second is presumably the sentence's length in
//  tokens — TODO confirm in the implementation.
180 using SentenceMap = std::vector<std::pair<std::size_t, std::size_t>>;
// Constructor setting whether to perform internal consistency checks
//  (the flag is stored in the checkConsistency member).
187 explicit Corpus(
bool consistencyChecks);
// Constructor combining the given corpora into one: a single source corpus is
//  swapped in directly, while multiple corpora are merged via combineCorpora(),
//  with progress reported through the given StatusSetter.
188 Corpus(std::vector<Corpus>& others,
bool consistencyChecks,
StatusSetter& statusSetter);
195 [[nodiscard]]
const std::string&
getcCorpus()
const;
214 [[nodiscard]] std::string
get(std::size_t index)
const;
215 [[nodiscard]] std::string
get(
const std::string& id)
const;
216 [[nodiscard]] std::string
getDate(
const std::string& date)
const;
220 [[nodiscard]] std::vector<Tokens>
getArticles()
const;
// Returns the size of the corpus: the total number of token bytes when the
//  corpus has been tokenized, otherwise the length of the continuous corpus
//  string.
222 [[nodiscard]] std::size_t
size()
const;
// Returns whether the corpus is empty (no tokens when tokenized, otherwise an
//  empty corpus string).
223 [[nodiscard]]
bool empty()
const;
// Returns a substring of the continuous corpus; not allowed once the corpus
//  has been tokenized.
225 [[nodiscard]] std::string
substr(std::size_t from, std::size_t len);
237 std::vector<std::string>& articleIds,
238 std::vector<std::string>& dateTimes,
243 std::vector<TextMap>& articleMaps,
244 std::vector<TextMap>& dateMaps,
250 std::vector<TextMap>& articleMaps,
251 std::vector<TextMap>& dateMaps,
252 std::vector<SentenceMap>& sentenceMaps,
263 TextMap& articleMapTo,
267 std::size_t chunkSize,
269 std::vector<TextMap>& articleMapsTo,
270 std::vector<TextMap>& dateMapsTo
273 std::size_t chunkSize,
276 std::vector<TextMap>& articleMapsTo,
277 std::vector<TextMap>& dateMapsTo,
278 std::vector<SentenceMap>& sentenceMapsTo
// Filters the corpus by the given date range, removing content dated outside
//  of it. NOTE(review): the exact date-string format and the meaning of the
//  boolean return value are defined in the implementation — confirm there.
285 bool filterByDate(
const std::string& from,
const std::string& to);
293 const std::vector<std::uint16_t>& manipulators,
294 const std::vector<std::string>& models,
295 const std::vector<std::string>& dictionaries,
296 const std::vector<std::string>& languages,
297 std::uint64_t freeMemoryEvery,
301 const std::optional<SentenceFunc>& callback,
302 std::uint64_t freeMemoryEvery,
// Whether the corpus has been tokenized; set by the tokenizing routines and
//  gates which accessors may be called (see checkThatTokenized() /
//  checkThatNotTokenized()).
340 bool tokenized{
false};
// Whether to run internal consistency checks (see check()) after operations
//  that modify the corpus.
341 bool checkConsistency{
false};
// Total number of bytes stored in the tokens; kept in sync as tokens are
//  added and removed.
342 std::size_t tokenBytes{};
347 void checkThatNotTokenized(std::string_view
function)
const;
348 void checkThatTokenized(std::string_view
function)
const;
353 std::string& dateTime,
358 const std::string& content,
359 const std::optional<std::reference_wrapper<const TextMap>>& articles,
360 const std::optional<std::reference_wrapper<const TextMap>>& dates,
365 void check(std::string_view
function)
const;
366 void checkTokenized(std::string_view
function)
const;
372 std::vector<TextMap>& articleMapsTo,
373 std::vector<TextMap>& dateMapsTo,
374 std::vector<SentenceMap>& sentenceMapsTo
379 [[nodiscard]]
bool tokenizeTokenized(
380 const std::optional<SentenceFunc>& callback,
383 [[nodiscard]]
bool tokenizeContinuous(
384 const std::optional<SentenceFunc>& callback,
385 std::uint64_t freeMemoryEvery,
390 static bool combineCorpora(
391 std::vector<Corpus>& from,
396 [[nodiscard]]
static Tokens getTokensForEntry(
398 const std::string&
id,
402 [[nodiscard]]
static std::size_t getValidLengthOfChunk(
403 const std::string& source,
405 std::size_t maxLength,
406 std::size_t maxChunkSize
408 [[nodiscard]]
static std::size_t getValidLengthOfChunk(
409 const std::string& chunkContent,
410 std::size_t maxChunkSize
413 static void checkTokensForChunking(
const Tokens& tokens);
415 static void reserveChunks(
419 std::vector<TextMap>& articleMapsTo,
420 std::vector<TextMap>& dateMapsTo,
421 std::vector<SentenceMap>& sentenceMapsTo,
426 static void checkForEntry(
427 std::string_view type,
429 std::size_t& nextIndex,
431 std::size_t chunkOffset,
433 bool checkConsistency
436 static void finishChunk(
437 std::string& contentFrom,
441 std::vector<SentenceMap>& sentencesTo,
442 std::size_t chunkTokens,
443 std::size_t& chunkOffset,
445 std::size_t nextChunkSize
447 static void splitEntry(
453 static void finishMap(
455 std::vector<TextMap>& to,
460 std::string_view type,
461 const std::vector<std::string>& values,
465 static std::size_t bytes(
const Tokens& tokens);
467 static void addChunkMap(
468 const std::optional<std::reference_wrapper<const TextMap>>& from,
474 static void checkMap(
475 std::string_view
function,
476 std::string_view name,
482 static void checkMap(
483 std::string_view
function,
489 static void skipEntriesBefore(
491 std::size_t& entryIndex,
492 std::size_t& entryEnd,
// Removes empty tokens from the given token vector; called after filtering
//  steps that may have emptied individual tokens.
497 static void removeEmpty(
Tokens& from);
// Accounts for a removed token in the map entry at entryIndex; presumably
//  shrinks the entry and sets emptiedTo when the entry becomes empty — TODO
//  confirm in the implementation.
498 static void removeToken(TextMap& map, std::size_t entryIndex,
bool& emptiedTo);
// Returns the end position of the first entry in the given map (used, e.g.,
//  to initialize articleEnd/dateEnd during continuous tokenization).
501 static std::size_t getFirstEnd(
const TextMap& map);
// Returns the end position of the map entry at the given index.
502 static std::size_t getEntryEnd(
const TextMap& map, std::size_t entryIndex);
504 static void processSentence(
506 const std::optional<SentenceFunc>& callback,
509 std::size_t& currentToken,
510 std::size_t& sentenceFirstToken,
515 std::size_t& tokenBytes
518 static bool addCorpus(
525 static bool addSentences(
531 static void finishArticle(
532 std::vector<Tokens>& from,
534 const std::string& date,
535 const std::string& article
537 static void nextEntry(
542 std::size_t corpusEnd
545 static bool pushSentence(
547 std::size_t chunkSize,
548 std::size_t chunkOffset,
549 std::size_t& chunkTokens,
550 std::string& chunkContent,
553 std::size_t& tokensComplete,
554 std::size_t& additionalBytes
// Builds a human-readable status string for merging corpus `number` of
//  `total` — presumably used with the StatusSetter while combining corpora;
//  TODO confirm format in the implementation.
557 static std::string mergingStatus(std::size_t number, std::size_t total);
// Sets a locale on the given output stream — presumably the comma locale from
//  Helper/CommaLocale.hpp (included above) for formatting numbers; confirm.
559 static void locale(std::ostream& os);
562 static void exceptionGetNoArticleMap(
563 std::string_view
function,
566 static void exceptionArticleOutOfBounds(
567 std::string_view
function,
571 static void exceptionDateLength(
572 std::string_view
function,
575 static void exceptionArticleMapStart(
576 std::string_view
function,
577 std::string_view expected,
578 std::size_t chunkIndex,
579 std::size_t numberOfChunks,
582 static void exceptionLastSentenceLength(
585 std::size_t corpusSize
587 static void exceptionArticleBehindDate(
588 std::size_t articlePos,
// Throws when a generated chunk exceeds the maximum chunk size (called from
//  the chunking code when chunk.size() > chunkSize).
592 static void exceptionChunkSize(std::size_t size, std::size_t chunkSize);
// Throws when the article map ends before the end of the corpus during
//  chunking (pos = current corpus position, size = corpus size).
593 static void exceptionArticleMapEnd(std::size_t pos, std::size_t size);
594 static void exceptionUnexpectedBeforeSentence(
595 std::string_view type,
596 std::string_view name,
598 std::size_t sentencePos
600 static void exceptionMismatchWithDate(
601 std::string_view type,
605 static void exceptionDateBehindLast(
606 std::string_view type,
610 static void exceptionSentenceBehind(
611 std::string_view
function,
612 std::string_view type,
613 const std::pair<std::size_t, std::size_t>& sentence,
616 const TextMap::const_iterator& next,
619 static void exceptionTokenBytes(
620 std::string_view
function,
622 std::size_t actualSize
624 static void exceptionInvalidMaxChunkSize(std::size_t size, std::size_t max);
625 static void exceptionPositionTooSmall(
627 std::size_t expectedMin,
628 std::string_view name
630 static void exceptionInvalidPosition(
631 std::string_view
function,
633 std::size_t expected,
634 std::string_view name
636 static void exceptionInvalidDate(
637 std::string_view
function,
638 std::string_view value,
639 std::string_view name
641 static void exceptionInvalidEnd(
642 std::string_view
function,
644 std::size_t expected,
645 std::string_view name
653 template<
class T>
static void reserveCombined(
654 const std::vector<T>& vec,
662 [](
const auto& a,
const auto& b) {
670 template<
typename T> [[nodiscard]]
static bool entryBeginsAt(
679 template<
typename T>
static void removeEmptyEntries(
687 [&
tokens](
const auto& entry) {
695 if(!(tokens.at(tokenIndex).empty())) {
708 template<
typename T>
static void removeEmptyEntries(T& map) {
713 [](
const auto& entry) {
722 template<
typename T>
static void skipEntriesBefore(
724 std::size_t& entryIndex,
735 entryIndex < map.size()
743 if(entryIndex < map.size()) {
751 template<
typename T>
static void updatePosition(
752 std::string_view type,
754 std::size_t entryIndex,
755 std::size_t entryPos,
760 entryIndex >= map.size()
767 Corpus::exceptionPositionTooSmall(
778 template<
typename T>
static void removeTokenFromLength(
780 std::size_t entryIndex,
782 std::size_t tokenIndex
785 entryIndex < map.size()
786 && tokenIndex >= origin.first
787 && tokenIndex < origin.second
// Constructor: stores whether internal consistency checks are enabled; no
//  other member is initialized here (the flags and counters visible in the
//  class carry in-class default initializers).
807 inline Corpus::Corpus(
bool consistencyChecks) : checkConsistency(consistencyChecks) {}
829 std::vector<Corpus>& others,
830 bool consistencyChecks,
832 ) : checkConsistency(consistencyChecks) {
838 if(others.size() == 1) {
839 std::swap(*
this, others[0]);
847 if(!Corpus::combineCorpora(others, combined, statusSetter)) {
852 statusSetter.
change(
"Preparing combined corpus...");
854 this->moveCombinedIn(combined);
870 this->checkThatNotTokenized(
"getCorpus");
884 this->checkThatNotTokenized(
"getcCorpus");
896 return this->tokenized;
909 this->checkThatTokenized(
"getTokens");
924 this->checkThatTokenized(
"getcTokens");
938 this->checkThatTokenized(
"getNumTokens");
940 return this->
tokens.size();
978 return !(this->
dateMap.empty());
1020 this->checkThatTokenized(
"getSentenceMap");
1036 this->checkThatTokenized(
"getcSentenceMap");
1057 this->checkThatNotTokenized(
"get");
1060 Corpus::exceptionGetNoArticleMap(
1067 Corpus::exceptionArticleOutOfBounds(
1074 const auto& articleEntry{this->
articleMap.at(index)};
1076 return this->
corpus.substr(
1099 this->checkThatNotTokenized(
"get");
1105 " No ID has been specified" 1109 const auto& articleEntry{
1113 [&id](
const auto& entry) {
1114 return entry.value == id;
1119 if(articleEntry == this->
articleMap.cend()) {
1120 return std::string();
1123 return this->
corpus.substr(
1146 this->checkThatNotTokenized(
"getDate");
1150 Corpus::exceptionDateLength(
1156 const auto& dateEntry{
1160 [&date](
const auto& entry) {
1161 return entry.value == date;
1166 if(dateEntry == this->
dateMap.cend()) {
1167 return std::string();
1170 return this->
corpus.substr(
1193 this->checkThatTokenized(
"getTokenized");
1196 Corpus::exceptionGetNoArticleMap(
1203 Corpus::exceptionArticleOutOfBounds(
1210 const auto& articleEntry{this->
articleMap.at(index)};
1217 for(
auto tokenIndex{
TextMapEntry::pos(articleEntry)}; tokenIndex < articleEnd; ++tokenIndex) {
1218 copy.emplace_back(this->
tokens.at(tokenIndex));
1242 this->checkThatTokenized(
"getTokenized");
1247 "Corpus::getTokenized():" 1248 " No ID has been specified" 1273 this->checkThatTokenized(
"getDateTokenized");
1277 Corpus::exceptionDateLength(
1283 return Corpus::getTokensForEntry(this->
dateMap, date, this->
tokens);
1296 this->checkThatTokenized(
"getArticles");
1298 std::vector<Tokens> copy;
1302 for(
const auto& article : this->
articleMap) {
1303 copy.emplace_back();
1309 for(
auto tokenIndex{
TextMapEntry::pos(article)}; tokenIndex < articleEnd; ++tokenIndex) {
1310 copy.back().emplace_back(this->
tokens.at(tokenIndex));
1325 return this->tokenized ? this->tokenBytes : this->
corpus.size();
1338 if(this->tokenized) {
1339 return this->
tokens.empty();
1342 return this->
corpus.empty();
1364 this->checkThatNotTokenized(
"substr");
1366 return this->
corpus.substr(from, len);
1388 bool deleteInputData
1394 for(
auto& text : texts) {
1401 this->
corpus.push_back(
' ');
1407 if(!(this->
corpus.empty())) {
1433 std::vector<std::string>& articleIds,
1434 std::vector<std::string>& dateTimes,
1435 bool deleteInputData
1438 if(articleIds.empty() && dateTimes.empty()) {
1439 this->
create(texts, deleteInputData);
1447 std::string emptyString;
1450 for(std::size_t n{}; n < texts.size(); ++n) {
1453 articleIds.size() > n ? articleIds[n] : emptyString,
1454 dateTimes.size() > n ? dateTimes[n] : emptyString,
1465 if(!(this->
corpus.empty())) {
1470 if(!dateMapEntry.
value.empty()) {
1471 this->
dateMap.emplace_back(dateMapEntry);
1501 std::vector<TextMap>& articleMaps,
1502 std::vector<TextMap>& dateMaps,
1503 bool deleteInputData
1509 Corpus::reserveCombined(chunks, this->
corpus);
1510 Corpus::reserveCombined(articleMaps, this->
articleMap);
1511 Corpus::reserveCombined(dateMaps, this->
dateMap);
1514 for(
auto chunkIt = chunks.begin(); chunkIt != chunks.end(); ++chunkIt) {
1515 const auto chunkIndex{
1516 static_cast<std::size_t
>(
1523 const auto pos{this->
corpus.size()};
1526 this->
corpus += *chunkIt;
1530 bool beginsWithNewArticle{
false};
1532 if(articleMaps.size() > chunkIndex) {
1534 auto& map{articleMaps[chunkIndex]};
1537 const auto&
first{map[0]};
1541 Corpus::exceptionArticleMapStart(
1542 "combineContinuous",
1550 auto it{map.cbegin()};
1563 beginsWithNewArticle =
true;
1567 for(; it != map.cend(); ++it) {
1579 if(dateMaps.size() > chunkIndex) {
1581 auto& map{dateMaps[chunkIndex]};
1584 const auto&
first{map[0]};
1585 auto it{map.cbegin()};
1596 if(beginsWithNewArticle) {
1604 for(; it != map.cend(); ++it) {
1665 std::vector<TextMap>& articleMaps,
1666 std::vector<TextMap>& dateMaps,
1667 std::vector<SentenceMap>& sentenceMaps,
1668 bool deleteInputData
1675 this->checkConsistency
1677 tokenNums.size() > chunks.size()
1678 || articleMaps.size() > chunks.size()
1679 || dateMaps.size() > chunks.size()
1680 || sentenceMaps.size() > chunks.size()
1684 "Corpus::combineTokenized():" 1685 " More token counts, article maps, date maps," 1686 " and/or sentence maps than corpus chunks" 1690 if(chunks.empty()) {
1694 if(sentenceMaps.empty()) {
1696 "Corpus::combineTokenized():" 1697 " No sentence maps for non-empty corpus" 1702 const auto totalTokens{
1712 this->
tokens.reserve(totalTokens);
1714 Corpus::reserveCombined(articleMaps, this->
articleMap);
1715 Corpus::reserveCombined(dateMaps, this->
dateMap);
1716 Corpus::reserveCombined(sentenceMaps, this->
sentenceMap);
1719 std::size_t chunkIndex{};
1720 bool splitToken{
false};
1722 for(
auto& chunk : chunks) {
1725 (chunkIndex < articleMaps.size()) ?
1726 std::optional<std::reference_wrapper<const TextMap>>{articleMaps[chunkIndex]}
1728 (chunkIndex < dateMaps.size()) ?
1729 std::optional<std::reference_wrapper<const TextMap>>{dateMaps[chunkIndex]}
1731 sentenceMaps.at(chunkIndex),
1738 if(chunkIndex < articleMaps.size()) {
1742 if(chunkIndex < dateMaps.size()) {
1751 "Corpus::combineTokenized():" 1752 " Empty sentence map for non-empty corpus" 1757 this->checkConsistency
1760 Corpus::exceptionLastSentenceLength(
1772 this->tokenized =
true;
1775 if(this->checkConsistency) {
1776 this->check(
"combineTokenized");
1793 this->checkThatNotTokenized(
"copyContinuous");
1814 TextMap& articleMapTo,
1817 this->checkThatNotTokenized(
"copyContinuous");
1870 std::size_t chunkSize,
1872 std::vector<TextMap>& articleMapsTo,
1873 std::vector<TextMap>& dateMapsTo
1876 if(this->
corpus.empty()) {
1880 if(chunkSize == 0) {
1882 "Corpus::copyChunksContinuous():" 1883 " Invalid chunk size (zero)" 1884 " for a non-empty corpus" 1888 this->checkThatNotTokenized(
"copyChunksContinuous");
1891 if(this->
corpus.size() <= chunkSize) {
1892 to.emplace_back(this->
corpus);
1893 articleMapsTo.emplace_back(this->
articleMap);
1894 dateMapsTo.emplace_back(this->
dateMap);
1901 this->
corpus.size() / chunkSize
1902 + (this->
corpus.size() % chunkSize > 0 ? 1 : 0)
1905 to.reserve(to.size() + chunks);
1908 articleMapsTo.reserve(articleMapsTo.size() + chunks);
1911 if(!(this->
dateMap.empty())) {
1912 dateMapsTo.reserve(dateMapsTo.size() + chunks);
1916 bool noSpace{
false};
1922 while(pos < this->
corpus.size()) {
1926 Corpus::getValidLengthOfChunk(
1934 pos += to.back().size();
1940 std::size_t corpusPos{};
1941 std::size_t articlePos{};
1943 auto dateIt{this->
dateMap.cbegin()};
1945 while(corpusPos < this->
corpus.size()) {
1947 TextMap chunkArticleMap;
1948 TextMap chunkDateMap;
1953 chunk.push_back(
' ');
1961 for(; articleIt != this->
articleMap.cend(); ++articleIt) {
1962 if(dateIt != this->
dateMap.cend()) {
1972 this->checkConsistency
1978 Corpus::exceptionArticleBehindDate(
1987 const auto remaining{articleIt->l - articlePos};
1989 if(chunk.size() + remaining <= chunkSize) {
1992 chunkArticleMap.emplace_back(chunk.size(), remaining, articleIt->value);
1994 if(dateIt != this->
dateMap.cend()) {
1995 if(!chunkDateMap.empty() && chunkDateMap.back().value == dateIt->value) {
1999 else if(corpusPos >= dateIt->p) {
2000 chunkDateMap.emplace_back(chunk.size(), remaining, dateIt->value);
2004 chunk.append(this->
corpus, corpusPos, remaining);
2007 corpusPos += remaining;
2013 if(chunk.size() < chunkSize) {
2015 chunk.push_back(
' ');
2020 if(chunk.size() == chunkSize) {
2039 auto fill{chunkSize - chunk.size()};
2046 fill = Corpus::getValidLengthOfChunk(
2057 chunkArticleMap.emplace_back(chunk.size(), fill, articleIt->value);
2059 if(dateIt != this->
dateMap.cend()) {
2060 if(!chunkDateMap.empty() && chunkDateMap.back().value == dateIt->value) {
2064 else if(corpusPos >= dateIt->p) {
2065 chunkDateMap.emplace_back(chunk.size(), fill, dateIt->value);
2069 chunk.append(this->
corpus, corpusPos, fill);
2080 if(this->checkConsistency) {
2081 if(chunk.size() > chunkSize) {
2082 Corpus::exceptionChunkSize(chunk.size(), chunkSize);
2085 if(articleIt == this->
articleMap.cend() && corpusPos < this->
corpus.size()) {
2086 Corpus::exceptionArticleMapEnd(corpusPos, this->
corpus.size());
2096 to.emplace_back(chunk);
2097 articleMapsTo.emplace_back(chunkArticleMap);
2098 dateMapsTo.emplace_back(chunkDateMap);
2102 if(!(this->
articleMap.empty()) && !to.empty()) {
2104 if(this->checkConsistency && to.back().empty()) {
2106 "Corpus::copyChunksContinuous():" 2107 " The final chunk is empty" 2113 to.back().pop_back();
2117 if(to.back().empty()) {
2122 if(this->checkConsistency && to.back().empty()) {
2124 "Corpus::copyChunksContinuous():" 2125 " The final chunk is empty" 2195 std::size_t chunkSize,
2198 std::vector<TextMap>& articleMapsTo,
2199 std::vector<TextMap>& dateMapsTo,
2200 std::vector<SentenceMap>& sentenceMapsTo
2203 if(this->
tokens.empty()) {
2207 if(chunkSize == 0) {
2209 "Corpus::copyChunksTokenized():" 2210 " Invalid chunk size (zero)" 2211 " for a non-empty corpus" 2217 "Corpus::copyChunksTokenized():" 2218 " Empty sentence map" 2219 " for a non-empty corpus" 2223 this->checkThatTokenized(
"copyChunksTokenized");
2225 Corpus::checkTokensForChunking(this->
tokens);
2234 if(
size < chunkSize) {
2235 this->addAsOneChunk(
2249 const auto sizeOfLastChunk{
size % chunkSize};
2250 const auto numberOfChunks{
size / chunkSize + (sizeOfLastChunk > 0 ? 1 : 0)};
2252 Corpus::reserveChunks(
2264 std::size_t chunkOffset{};
2265 std::size_t chunkTokens{};
2266 std::string chunkContent;
2269 TextMap chunkArticles;
2270 std::size_t nextDate{};
2271 std::size_t nextArticle{};
2272 std::size_t tokensComplete{};
2273 std::size_t additionalBytes{};
2277 chunkContent.reserve(chunkSize);
2280 Corpus::checkForEntry(
2287 this->checkConsistency
2290 Corpus::checkForEntry(
2297 this->checkConsistency
2301 Corpus::pushSentence(
2313 const bool splitToken{additionalBytes > 0};
2315 Corpus::finishChunk(
2324 (sizeOfLastChunk == 0 || to.size() < (numberOfChunks - 1)) ?
2325 chunkSize : (sizeOfLastChunk + 1)
2328 Corpus::splitEntry(chunkDates, chunkTokens, splitToken, remainingDate);
2329 Corpus::splitEntry(chunkArticles, chunkTokens, splitToken, remainingArticle);
2331 Corpus::finishMap(chunkDates, dateMapsTo, remainingDate);
2332 Corpus::finishMap(chunkArticles, articleMapsTo, remainingArticle);
2340 Corpus::finishChunk(
2352 Corpus::finishMap(chunkDates, dateMapsTo, remainingDate);
2353 Corpus::finishMap(chunkArticles, articleMapsTo, remainingArticle);
2357 to.back().pop_back();
2359 if(to.back().empty()) {
2361 tokenNumsTo.pop_back();
2362 sentenceMapsTo.pop_back();
2364 if(!articleMapsTo.empty()) {
2365 articleMapsTo.pop_back();
2368 if(!dateMapsTo.empty()) {
2369 dateMapsTo.pop_back();
2413 if(from.empty() && to.empty()) {
2418 if(this->tokenized) {
2419 if(this->
tokens.empty()) {
2423 else if(this->
corpus.empty()) {
2435 auto begin{this->
dateMap.cbegin()};
2437 for(; begin != this->
dateMap.cend(); ++begin) {
2443 if(begin == this->
dateMap.cend()) {
2455 for(; end != this->
dateMap.cend(); ++end) {
2462 if(begin != this->
dateMap.cbegin()) {
2464 TextMap(begin, end).swap(this->
dateMap);
2468 this->
dateMap.resize(std::distance(this->
dateMap.cbegin(), end));
2476 if(this->tokenized) {
2478 std::size_t deleteBytes{};
2480 const auto deleteTo{this->
tokens.begin() + offset};
2482 if(deleteTo != this->
tokens.begin()) {
2483 deleteBytes = std::accumulate(
2487 [](
const auto& a,
const auto& b) {
2488 return a + b.size();
2495 const auto deleteFrom{this->
tokens.begin() + len};
2497 if(deleteFrom != this->
tokens.end()) {
2498 deleteBytes += std::accumulate(
2502 [](
const auto& a,
const auto& b) {
2503 return a + b.size();
2510 if(deleteBytes > 0) {
2511 this->
tokens.shrink_to_fit();
2513 this->tokenBytes -= deleteBytes;
2519 this->
corpus.erase(0, offset);
2522 this->
corpus.resize(len);
2523 this->
corpus.shrink_to_fit();
2529 for(; begin != this->
articleMap.cend(); ++begin) {
2530 if(begin->p == offset) {
2535 if(this->checkConsistency && begin->p > offset) {
2536 Corpus::exceptionMismatchWithDate(
2545 if(this->checkConsistency && begin == this->
articleMap.cend()) {
2546 Corpus::exceptionDateBehindLast(
2558 for(; end != this->
articleMap.cend(); ++end) {
2559 if(end->p >= offset + len) {
2572 std::distance(this->
articleMap.cbegin(), end)
2576 if(this->tokenized) {
2580 for(; smBegin != this->
sentenceMap.cend(); ++smBegin) {
2581 if(smBegin->first == offset) {
2586 if(this->checkConsistency && smBegin->first > offset) {
2587 Corpus::exceptionMismatchWithDate(
2596 if(this->checkConsistency && smBegin == this->
sentenceMap.cend()) {
2597 Corpus::exceptionDateBehindLast(
2605 auto smEnd = smBegin;
2609 for(; smEnd != this->
sentenceMap.cend(); ++smEnd) {
2610 if(smEnd->first >= offset + len) {
2629 for(
auto& date : this->
dateMap) {
2641 if(this->checkConsistency) {
2642 this->check(
"filterByDate");
2678 this->checkThatTokenized(
"filterArticle");
2680 if(this->
tokens.empty()) {
2684 statusSetter.
change(
"Filtering corpus...");
2686 std::size_t articleCounter{};
2687 std::size_t statusCounter{};
2688 std::size_t removed{};
2690 for(
const auto& article : this->
articleMap) {
2703 tokenIndex < articleEnd;
2706 this->tokenBytes -= this->
tokens.at(tokenIndex).size();
2718 if(!statusSetter.
update(articleCounter, this->articleMap.size(),
true)) {
2733 statusSetter.
change(
"Cleaning corpus...");
2738 if(this->checkConsistency) {
2739 this->check(
"filterArticles");
2823 const std::vector<std::uint16_t>& manipulators,
2824 const std::vector<std::string>& models,
2825 const std::vector<std::string>& dictionaries,
2826 const std::vector<std::string>& languages,
2827 std::uint64_t freeMemoryEvery,
2830 bool isManipulation{
2832 manipulators.begin(),
2834 [](
const auto manipulator) {
2841 std::vector<Data::Tagger> taggers;
2842 std::unique_ptr<Lemmatizer> lemmatizer;
2843 std::unique_ptr<TokenRemover> tokenRemover;
2844 std::vector<TokenCorrect> tokenCorrectors;
2845 std::size_t manipulatorIndex{};
2847 for(
const auto& manipulator : manipulators) {
2848 switch(manipulator) {
2854 Corpus::notUsed(
"model", models, manipulatorIndex);
2855 Corpus::notUsed(
"dictionary", dictionaries, manipulatorIndex);
2856 Corpus::notUsed(
"language", languages, manipulatorIndex);
2862 if(models.at(manipulatorIndex).empty()) {
2864 "Corpus::tokenize():" 2865 " No model set for part-of-speech tagger (manipulator #" 2866 + std::to_string(manipulatorIndex + 1)
2871 Corpus::notUsed(
"dictionary", dictionaries, manipulatorIndex);
2872 Corpus::notUsed(
"language", languages, manipulatorIndex);
2874 taggers.emplace_back();
2879 if(dictionaries.at(manipulatorIndex).empty()) {
2881 "Corpus::tokenize():" 2882 " No dictionary set for lemmatizer (manipulator #" 2883 + std::to_string(manipulatorIndex + 1)
2888 Corpus::notUsed(
"model", models, manipulatorIndex);
2889 Corpus::notUsed(
"language", languages, manipulatorIndex);
2892 lemmatizer = std::make_unique<Lemmatizer>();
2899 if(dictionaries.at(manipulatorIndex).empty()) {
2901 "Corpus::tokenize():" 2902 " No dictionary set for token remover/trimmer (manipulator #" 2903 + std::to_string(manipulatorIndex + 1)
2908 Corpus::notUsed(
"model", models, manipulatorIndex);
2909 Corpus::notUsed(
"language", languages, manipulatorIndex);
2912 tokenRemover = std::make_unique<TokenRemover>();
2918 Corpus::notUsed(
"model", models, manipulatorIndex);
2919 Corpus::notUsed(
"dictionary", dictionaries, manipulatorIndex);
2921 tokenCorrectors.emplace_back(languages.at(manipulatorIndex));
2927 "Corpus::tokenize():" 2928 " Invalid manipulator (#" 2929 + std::to_string(manipulator)
2938 auto callbackLambda = [
2947 Tokens::iterator sentenceBegin,
2948 Tokens::iterator sentenceEnd
2950 std::size_t manipulatorIndex{};
2951 std::size_t taggerIndex{};
2952 std::size_t correctIndex{};
2954 for(
const auto& manipulator : manipulators) {
2955 switch(manipulator) {
2961 taggers.at(taggerIndex).label(sentenceBegin, sentenceEnd);
2968 for(
auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
2975 for(
auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
2982 for(
auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
2983 lemmatizer->lemmatize(*tokenIt, dictionaries.at(manipulatorIndex));
2989 for(
auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
2990 tokenRemover->remove(*tokenIt, dictionaries.at(manipulatorIndex));
2996 for(
auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
2997 tokenRemover->trim(*tokenIt, dictionaries.at(manipulatorIndex));
3003 for(
auto& tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
3004 tokenCorrectors.at(correctIndex).correct(*tokenIt);
3013 "Corpus::tokenize():" 3014 " Invalid manipulator (#" 3015 + std::to_string(manipulator)
3026 isManipulation ? std::optional<SentenceFunc>{callbackLambda} : std::nullopt,
3082 const std::optional<SentenceFunc>& callback,
3083 std::uint64_t freeMemoryEvery,
3086 if(this->tokenized) {
3089 this->tokenizeTokenized(
3101 this->tokenizeContinuous(
3136 this->tokenized =
false;
3137 this->tokenBytes = 0;
3146 for(
auto& [date, articles] : from) {
3152 this->
dateMap.back().value = date;
3154 for(
auto& [article, sentences] : articles) {
3155 std::size_t articleLength{};
3162 for(
auto& sentence : sentences) {
3169 articleLength += sentence.size();
3182 this->tokenized =
true;
3186 inline void Corpus::checkThatNotTokenized(std::string_view
function)
const {
3187 if(this->tokenized) {
3190 + std::string(
function)
3191 +
"(): The corpus has been tokenized" 3197 inline void Corpus::checkThatTokenized(std::string_view
function)
const {
3198 if(!(this->tokenized)) {
3201 + std::string(
function)
3202 +
"(): The corpus has not been tokenized" 3208 inline void Corpus::addArticle(
3211 std::string& dateTime,
3213 bool deleteInputData
3215 auto pos{this->
corpus.size()};
3219 this->
articleMap.emplace_back(pos, text.length(), id);
3230 this->
articleMap.emplace_back(pos, text.length());
3234 if(!dateTime.empty()) {
3238 const std::string date(dateTime, 0,
dateLength);
3241 if(!dateMapEntry.
value.empty()) {
3243 if(dateMapEntry.
value == date) {
3250 this->
dateMap.emplace_back(dateMapEntry);
3260 else if(!dateMapEntry.
value.empty()) {
3262 this->
dateMap.emplace_back(dateMapEntry);
3276 this->
corpus.push_back(
' ');
3280 inline void Corpus::addChunk(
3281 const std::string& content,
3282 const std::optional<std::reference_wrapper<const TextMap>>& articles,
3283 const std::optional<std::reference_wrapper<const TextMap>>& dates,
3287 if(content.empty()) {
3291 const auto chunkOffset{
3302 for(
const auto& sentence : sentences) {
3316 if(this->
tokens.empty()) {
3317 this->
tokens.emplace_back();
3321 for(
const auto c : content) {
3323 this->
tokens.emplace_back();
3328 this->
tokens.back().push_back(c);
3330 ++(this->tokenBytes);
3334 Corpus::addChunkMap(articles, this->
articleMap, chunkOffset, continueToken);
3335 Corpus::addChunkMap(dates, this->
dateMap, chunkOffset, continueToken);
3338 continueToken = content.back() !=
'\n';
3342 inline void Corpus::check(std::string_view
function)
const {
3343 if(this->tokenized) {
3344 this->checkTokenized(
function);
3348 this->tokenized ? this->
tokens.size() : this->
corpus.size()
3351 Corpus::checkMap(
function,
"date map", this->
dateMap, end, this->tokenized,
true);
3352 Corpus::checkMap(
function,
"article map", this->
articleMap, end, this->tokenized,
false);
3353 Corpus::checkMap(
function, this->
sentenceMap, end, this->tokenized);
3357 inline void Corpus::checkTokenized(std::string_view
function)
const {
3367 for(
auto date{this->
dateMap.cbegin()}; date != this->
dateMap.cend(); ++date) {
3369 while(article != this->
articleMap.cend() && article->p < date->p) {
3374 while(sentence != this->
sentenceMap.cend() && sentence->first < date->p) {
3381 while(article != this->
articleMap.cend() && article->p < dateEnd) {
3383 while(sentence != this->
sentenceMap.cend() && sentence->first < date->p) {
3394 while(sentence != this->
sentenceMap.cend() && sentence->first < articleEnd) {
3398 if(sentenceEnd > dateEnd) {
3399 Corpus::exceptionSentenceBehind(
3410 if(sentenceEnd > articleEnd) {
3411 Corpus::exceptionSentenceBehind(
3438 [](
const auto&
size,
const auto& token) {
3439 return size + token.size();
3444 if(bytes != this->tokenBytes) {
3445 Corpus::exceptionTokenBytes(
function, this->tokenBytes, bytes);
3450 inline void Corpus::addAsOneChunk(
3454 std::vector<TextMap>& articleMapsTo,
3455 std::vector<TextMap>& dateMapsTo,
3456 std::vector<SentenceMap>& sentenceMapsTo
3458 to.emplace_back(std::string{});
3460 to.back().reserve(to.size() +
size);
3462 for(
const auto& token : this->
tokens) {
3465 to.back().push_back(
'\n');
3469 if(!to.back().empty()) {
3470 to.back().pop_back();
3473 articleMapsTo.emplace_back(this->
articleMap);
3474 dateMapsTo.emplace_back(this->
dateMap);
3476 tokenNumsTo.emplace_back(this->tokens.size());
3480 inline void Corpus::reTokenize() {
3487 std::size_t dateIndex{};
3488 std::size_t articleIndex{};
3489 std::size_t sentenceIndex{};
3493 std::size_t removed{};
3495 for(std::size_t tokenIndex{}; tokenIndex < this->
tokens.size(); ++tokenIndex) {
3496 Corpus::skipEntriesBefore(this->
dateMap, dateIndex, originDate, tokenIndex);
3497 Corpus::skipEntriesBefore(this->
articleMap, articleIndex, originArticle, tokenIndex);
3498 Corpus::skipEntriesBefore(this->
sentenceMap, sentenceIndex, originSentence, tokenIndex);
3500 Corpus::updatePosition(
3508 Corpus::updatePosition(
3512 originArticle.first,
3516 Corpus::updatePosition(
3520 originSentence.first,
3526 Corpus::removeTokenFromLength(this->
dateMap, dateIndex, originDate, tokenIndex);
3527 Corpus::removeTokenFromLength(this->
articleMap, articleIndex, originArticle, tokenIndex);
3528 Corpus::removeTokenFromLength(this->
sentenceMap, sentenceIndex, originSentence, tokenIndex);
3535 Corpus::removeEmpty(this->
tokens);
3539 inline bool Corpus::tokenizeTokenized(
3540 const std::optional<SentenceFunc>& callback,
3548 std::size_t numDeletedTokens{};
3549 std::size_t dateIndex{};
3550 std::size_t articleIndex{};
3551 std::size_t dateEnd{};
3552 std::size_t articleEnd{};
3553 std::size_t sentenceCounter{};
3554 std::size_t statusCounter{};
3556 bool inArticle{
false};
3557 bool emptyDates{
false};
3558 bool emptyArticles{
false};
3559 bool emptySentences{
false};
3562 this->tokenBytes = 0;
3568 Corpus::skipEntriesBefore(
3575 Corpus::skipEntriesBefore(
3585 Corpus::entryBeginsAt(
3598 Corpus::entryBeginsAt(
3620 this->
tokens.begin() + sentenceBegin,
3621 this->
tokens.begin() + sentenceEnd
3625 for(
auto tokenIndex{sentenceBegin}; tokenIndex < sentenceEnd; ++tokenIndex) {
3626 const auto& token{this->
tokens.at(tokenIndex)};
3631 Corpus::removeToken(this->
dateMap, dateIndex, emptyDates);
3635 Corpus::removeToken(this->
articleMap, articleIndex, emptyArticles);
3638 Corpus::removeToken(sentenceEntry, emptySentences);
3644 this->tokenBytes += token.size();
3652 if(!statusSetter.
update(sentenceCounter, this->sentenceMap.size(),
true)) {
3660 statusSetter.
change(
"Cleaning corpus...");
3664 Corpus::removeEmptyEntries(this->
dateMap);
3669 Corpus::removeEmptyEntries(this->
articleMap);
3673 if(emptySentences) {
3674 Corpus::removeEmptyEntries(this->sentenceMap);
3678 if(numDeletedTokens > 0) {
3679 Corpus::removeEmpty(this->
tokens);
3683 if(this->checkConsistency) {
3684 this->check(
"tokenizeTokenized");
3691 inline bool Corpus::tokenizeContinuous(
3692 const std::optional<SentenceFunc>& callback,
3693 std::uint64_t freeMemoryEvery,
3699 std::size_t tokenBegin{};
3700 std::size_t sentenceFirstToken{};
3701 std::size_t currentToken{};
3702 std::size_t statusCounter{};
3703 std::size_t corpusTrimmed{};
3705 bool inArticle{
false};
3708 std::size_t articleFirstToken{};
3709 std::size_t dateFirstToken{};
3710 std::size_t articleEnd{Corpus::getFirstEnd(this->
articleMap)};
3711 std::size_t dateEnd{Corpus::getFirstEnd(this->
dateMap)};
3712 std::size_t nextArticle{};
3713 std::size_t nextDate{};
3715 TextMap newArticleMap;
3718 newArticleMap.reserve(this->
articleMap.size());
3719 newDateMap.reserve(this->
dateMap.size());
3722 for(std::size_t pos{}; pos < this->
corpus.size() + corpusTrimmed; ++pos) {
3723 bool sentenceEnd{
false};
3724 bool noSeparator{
false};
3725 bool appendToArticle{
false};
3726 bool appendToDate{
false};
3735 articleFirstToken = currentToken;
3746 && pos == articleEnd
3750 newArticleMap.emplace_back(
3752 currentToken - articleFirstToken,
3757 appendToArticle =
true;
3761 if(!(this->
dateMap.empty())) {
3765 && nextDate < this->
dateMap.size()
3768 dateFirstToken = currentToken;
3783 newDateMap.emplace_back(
3785 currentToken - dateFirstToken,
3786 this->
dateMap.at(nextDate - 1).value
3790 appendToDate =
true;
3795 switch(this->
corpus.at(pos - corpusTrimmed)) {
3833 auto tokenLength{pos - tokenBegin};
3839 if(tokenLength > 0) {
3840 sentence.emplace_back(this->
corpus, tokenBegin - corpusTrimmed, tokenLength);
3844 if(appendToArticle) {
3853 if(freeMemoryEvery > 0 && pos - corpusTrimmed > freeMemoryEvery) {
3855 this->
corpus.erase(0, pos - corpusTrimmed);
3856 this->
corpus.shrink_to_fit();
3858 corpusTrimmed = pos;
3861 tokenBegin = pos + 1;
3863 if(sentenceEnd && !sentence.empty()) {
3864 Corpus::processSentence(
3885 this->corpus.size() + corpusTrimmed,
3898 bool endOfLastArticle{
false};
3902 && this->
corpus.size() + corpusTrimmed == articleEnd
3906 newArticleMap.emplace_back(
3908 currentToken - articleFirstToken,
3912 endOfLastArticle =
true;
3916 bool endOfLastDate{
false};
3920 && this->
corpus.size() + corpusTrimmed == dateEnd
3924 newDateMap.emplace_back(
3926 currentToken - dateFirstToken,
3927 this->
dateMap.at(nextDate - 1).value
3930 endOfLastDate =
true;
3934 if(tokenBegin - corpusTrimmed < this->
corpus.size()) {
3935 sentence.emplace_back(
3937 tokenBegin - corpusTrimmed,
3938 this->
corpus.size() + corpusTrimmed - tokenBegin
3941 if(endOfLastArticle) {
3951 if(!sentence.empty()) {
3952 Corpus::processSentence(
3970 if(this->checkConsistency) {
3973 "Corpus::tokenizeContinuous():" 3976 +
"' has not been finished" 3982 "Corpus::tokenizeContinuous():" 3984 + this->
dateMap.at(nextDate - 1).value
3985 +
"' has not been finished" 3991 "Corpus::tokenizeContinuous():" 3992 " Unexpected article '" 3994 +
"' after end of corpus" 3998 if(nextDate < this->
dateMap.size()) {
4000 "Corpus::tokenizeContinuous():" 4001 " Unexpected date '" 4002 + this->
dateMap.at(nextDate).value
4003 +
"' after end of corpus" 4009 newDateMap.swap(this->
dateMap);
4011 this->tokenized =
true;
4014 if(this->checkConsistency) {
4015 this->check(
"tokenizeContinuous");
4026 inline bool Corpus::combineCorpora(
4027 std::vector<Corpus>& from,
4031 std::size_t corpusCounter{};
4033 for(
auto&
corpus : from) {
4036 if(!Corpus::addCorpus(
corpus, to, corpusCounter, from.size(), statusSetter)) {
4049 const std::string&
id,
4056 [&id](
const auto& entry) {
4057 return entry.value == id;
4062 if(found == map.cend()) {
4070 copy.reserve(found->l);
4072 for(
auto tokenIndex{found->p}; tokenIndex < entryEnd; ++tokenIndex) {
4073 copy.emplace_back(tokens.at(tokenIndex));
4080 inline void Corpus::removeEmpty(
Tokens& from) {
4085 [](
const auto& str) {
4094 inline void Corpus::removeToken(TextMap& map, std::size_t entryIndex,
bool& emptiedTo) {
4097 "Corpus::removeToken():" 4098 " Could not remove token from map:" 4099 " Map entry is already empty." 4113 inline void Corpus::removeToken(
SentenceMapEntry& entry,
bool& emptiedTo) {
4114 if(entry.second == 0) {
4116 "Corpus::removeToken():" 4117 " Could not remove token from sentence:" 4118 " Sentence is already empty." 4126 if(entry.second == 0) {
4133 inline std::size_t Corpus::getValidLengthOfChunk(
4134 const std::string& source,
4136 std::size_t maxLength,
4137 std::size_t maxChunkSize
4140 if(maxLength > maxChunkSize) {
4141 Corpus::exceptionInvalidMaxChunkSize(maxLength, maxChunkSize);
4144 if(maxChunkSize == 0) {
4146 "Corpus::getValidLengthOfChunk():" 4147 " Invalid maximum chunk size of zero" 4151 if(maxLength == 0) {
4159 if(cut > maxLength) {
4164 const auto maxBack{
static_cast<std::uint8_t
>(cut +
utf8MaxBytes)};
4165 const auto checkFrom{maxLength > maxBack ? pos + maxLength - maxBack : pos};
4166 const auto checkLength{maxLength > maxBack ?
utf8MaxBytes : maxLength - cut};
4169 return maxLength - cut;
4173 if(cut == utf8MaxBytes) {
4175 "Corpus::getValidLengthOfChunk():" 4176 " Could not slice corpus" 4177 " because of invalid UTF-8 character" 4181 if(maxLength >= maxChunkSize) {
4183 "Corpus::getValidLengthOfChunk():" 4184 " The chunk size is too small" 4185 " to slice a corpus with UTF-8 character(s)" 4194 inline std::size_t Corpus::getValidLengthOfChunk(
4195 const std::string& chunkContent,
4196 std::size_t maxChunkSize
4198 return Corpus::getValidLengthOfChunk(chunkContent, 0, maxChunkSize, maxChunkSize);
4202 inline void Corpus::checkTokensForChunking(
const Tokens& tokens) {
4204 std::any_of(tokens.begin(), tokens.end(), [](
const auto& token) {
4205 return std::any_of(token.begin(), token.end(), [](
const auto c) {
4211 "Corpus::copyChunksTokenized():" 4212 " Cannot split corpus into chunks" 4213 " as one of its tokens contains a newline" 4219 inline void Corpus::reserveChunks(
4223 std::vector<TextMap>& articleMapsTo,
4224 std::vector<TextMap>& dateMapsTo,
4225 std::vector<SentenceMap>& sentenceMapsTo,
4229 to.reserve(to.size() + chunks);
4232 articleMapsTo.reserve(articleMapsTo.size() + chunks);
4236 dateMapsTo.reserve(dateMapsTo.size() + chunks);
4239 sentenceMapsTo.reserve(sentenceMapsTo.size() + chunks);
4240 tokenNumsTo.reserve(tokenNumsTo.size() + chunks);
4244 inline void Corpus::checkForEntry(
4245 std::string_view type,
4247 std::size_t& nextIndex,
4249 std::size_t chunkOffset,
4251 bool checkConsistency
4253 if(nextIndex > map.size()) {
4255 "Corpus::copyChunksTokenized():" 4256 " Skipped beyond end of last " 4261 if(nextIndex == map.size()) {
4269 const auto& next{map.at(nextIndex)};
4272 chunkMap.emplace_back(
4281 if(nextIndex == map.size()) {
4288 && nextIndex < map.size()
4292 const auto& next{map.at(nextIndex)};
4294 Corpus::exceptionUnexpectedBeforeSentence(
4304 inline void Corpus::finishChunk(
4305 std::string& contentFrom,
4309 std::vector<SentenceMap>& sentencesTo,
4310 std::size_t chunkTokens,
4311 std::size_t& chunkOffset,
4313 std::size_t nextChunkSize
4316 contentTo.emplace_back(std::move(contentFrom));
4318 contentFrom.clear();
4320 if(nextChunkSize > 0) {
4321 contentFrom.reserve(nextChunkSize);
4325 sentencesTo.emplace_back(sentencesFrom);
4327 sentencesFrom.clear();
4330 tokenNumTo.push_back(chunkTokens + (splitToken ? 1 : 0));
4333 chunkOffset += chunkTokens;
4337 inline void Corpus::splitEntry(
4349 if(end > token || (end == token && splitToken)) {
4357 remainingTo.
value = map.back().value;
4362 inline void Corpus::finishMap(TextMap& from, std::vector<TextMap>& to,
TextMapEntry& remaining) {
4367 to.emplace_back(std::move(from));
4372 from.emplace_back(std::move(remaining));
4379 inline void Corpus::notUsed(
4380 std::string_view type,
4384 if(!values.at(index).empty()) {
4385 std::string typeCapitalized(type);
4387 if(!typeCapitalized.empty()) {
4388 typeCapitalized[0] = std::toupper(typeCapitalized[0]);
4392 "Corpus::tokenize():" 4397 +
"') set but not used by manipulator #" 4398 + std::to_string(index + 1)
4404 inline void Corpus::addChunkMap(
4405 const std::optional<std::reference_wrapper<const TextMap>>& from,
4414 if(from.value().get().empty()) {
4420 if(!to.empty() && to.back().value == from.value().get().at(0).value) {
4432 for(
const auto& entry : from.value().get()) {
4440 to.emplace_back(entry);
4447 inline void Corpus::checkMap(
4448 std::string_view
function,
4449 std::string_view name,
4463 for(
const auto& entry : map) {
4465 Corpus::exceptionInvalidPosition(
4479 if(isDateMap && entry.value.length() !=
dateLength) {
4480 Corpus::exceptionInvalidDate(
4489 const auto& back{map.back()};
4492 Corpus::exceptionInvalidEnd(
4502 inline void Corpus::checkMap(
4503 std::string_view
function,
4516 for(
const auto& entry : map) {
4518 Corpus::exceptionInvalidPosition(
4534 const auto& back{map.back()};
4537 Corpus::exceptionInvalidEnd(
4547 inline void Corpus::skipEntriesBefore(
4549 std::size_t& entryIndex,
4550 std::size_t& entryEnd,
4554 bool increaseIndex{inEntryTo};
4555 bool skipped{
false};
4558 entryIndex < map.size()
4565 increaseIndex =
true;
4568 entryEnd = Corpus::getEntryEnd(map, entryIndex);
4579 inline std::size_t Corpus::getFirstEnd(
const TextMap& map) {
4592 inline std::size_t Corpus::getEntryEnd(
const TextMap& map, std::size_t entryIndex) {
4597 if(entryIndex < map.size()) {
4605 inline void Corpus::processSentence(
4607 const std::optional<SentenceFunc>& callback,
4610 std::size_t& currentToken,
4611 std::size_t& sentenceFirstToken,
4616 std::size_t& tokenBytes
4620 (*callback)(sentence.begin(), sentence.end());
4624 for(
auto tokenIt{sentence.begin()}; tokenIt != sentence.end(); ) {
4625 if(tokenIt->empty()) {
4627 tokenIt = sentence.erase(tokenIt);
4636 articleMap.pop_back();
4649 tokenBytes += tokenIt->size();
4655 if(!sentence.empty()) {
4657 sentenceMap.emplace_back(
4668 sentenceFirstToken = currentToken;
4672 inline bool Corpus::addCorpus(
4683 if(!from.tokenized) {
4686 " All sources need to be tokenized." 4691 const bool isRunning{
4692 statusSetter.
change(Corpus::mergingStatus(number, total))
4693 && Corpus::addSentences(from, to, statusSetter)
4704 inline bool Corpus::addSentences(
4709 std::size_t articleIndex{};
4710 std::size_t dateIndex{};
4711 std::size_t articleEnd{Corpus::getFirstEnd(from.
articleMap)};
4712 std::size_t dateEnd{Corpus::getFirstEnd(from.
dateMap)};
4713 bool inArticle{
false};
4715 std::size_t sentenceCounter{};
4716 std::size_t statusCounter{};
4717 std::string article;
4719 std::vector<Tokens> content;
4725 Corpus::skipEntriesBefore(
4732 Corpus::skipEntriesBefore(
4743 Corpus::finishArticle(
4761 else if(!inArticle) {
4782 content.emplace_back(
4801 Corpus::finishArticle(
4812 inline void Corpus::finishArticle(
4813 std::vector<Tokens>& from,
4815 const std::string& date,
4816 const std::string& article
4827 inline void Corpus::nextEntry(
4830 std::string& nameTo,
4832 std::size_t corpusEnd
4834 if(index < map.size()) {
4835 nameTo = map[index].value;
4846 inline bool Corpus::pushSentence(
4848 std::size_t chunkSize,
4849 std::size_t chunkOffset,
4850 std::size_t& chunkTokens,
4851 std::string& chunkContent,
4854 std::size_t& tokensComplete,
4855 std::size_t& additionalBytes
4857 auto bytesBefore{additionalBytes};
4860 const auto sentenceOffset{
4864 chunkSentences.emplace_back(
4870 for(std::size_t token{tokensComplete}; token <
TextMapEntry::end(sentence); ++token) {
4872 const auto oldSize{chunkContent.size()};
4875 additionalBytes > 0 ? tokens.at(token).substr(additionalBytes)
4879 chunkContent.push_back(
'\n');
4881 if(chunkContent.size() > chunkSize) {
4883 const auto size{Corpus::getValidLengthOfChunk(chunkContent, chunkSize)};
4885 chunkContent.erase(size);
4887 additionalBytes += chunkContent.size() - oldSize;
4889 if(token ==
TextMapEntry::pos(sentence) + sentenceOffset && additionalBytes == bytesBefore) {
4891 chunkSentences.pop_back();
4893 if(tokensComplete == chunkOffset) {
4895 "Corpus::copyChunksTokenized():" 4896 " Separating tokens into chunks failed - chunk size too small?" 4905 additionalBytes = 0;
4916 inline std::string Corpus::mergingStatus(std::size_t number, std::size_t total) {
4917 std::ostringstream status;
4919 Corpus::locale(status);
4921 status <<
"Merging corpora (";
4927 return status.str();
4931 inline void Corpus::locale(std::ostream& os) {
4940 inline void Corpus::exceptionGetNoArticleMap(
4941 std::string_view
function,
4944 std::ostringstream exception;
4946 Corpus::locale(exception);
4948 exception <<
"Corpus::";
4949 exception <<
function;
4950 exception <<
"(): Article #";
4951 exception << article;
4952 exception <<
" requested, but the article map is empty";
4958 inline void Corpus::exceptionArticleOutOfBounds(
4959 std::string_view
function,
4960 std::size_t article,
4963 std::ostringstream exception;
4965 Corpus::locale(exception);
4967 exception <<
"Corpus::";
4968 exception <<
function;
4969 exception <<
"(): The specified article index (#";
4970 exception << article;
4971 exception <<
") is out of bounds [#0;#";
4972 exception << size - 1;
4979 inline void Corpus::exceptionDateLength(
4980 std::string_view
function,
4983 std::ostringstream exception;
4985 Corpus::locale(exception);
4987 exception <<
"Corpus::";
4988 exception <<
function;
4989 exception <<
"(): Invalid length of date (";
4991 exception <<
" instead of ";
4999 inline void Corpus::exceptionArticleMapStart(
5000 std::string_view
function,
5001 std::string_view expected,
5002 std::size_t chunkIndex,
5003 std::size_t numberOfChunks,
5006 std::ostringstream exception;
5008 Corpus::locale(exception);
5010 exception <<
"Corpus::";
5011 exception <<
function;
5012 exception <<
"(): Article map in corpus chunk ";
5013 exception << chunkIndex + 1;
5015 exception << numberOfChunks;
5016 exception <<
" starts at #";
5018 exception <<
" instead of ";
5019 exception << expected;
5025 inline void Corpus::exceptionLastSentenceLength(
5028 std::size_t corpusSize
5030 std::ostringstream exception;
5032 Corpus::locale(exception);
5034 exception <<
"Corpus::combineTokenized(): Length of last sentence (";
5039 exception << pos +
length;
5040 exception <<
"]) exceeds length of corpus (";
5041 exception << corpusSize;
5048 inline void Corpus::exceptionArticleBehindDate(
5049 std::size_t articlePos,
5050 std::size_t datePos,
5053 std::ostringstream exception;
5055 Corpus::locale(exception);
5057 exception <<
"Corpus::copyChunksContinuous(): Article position (#";
5058 exception << articlePos;
5059 exception <<
") lies behind date at [#";
5060 exception << datePos;
5062 exception << dateEnd;
5069 inline void Corpus::exceptionChunkSize(std::size_t size, std::size_t chunkSize) {
5070 std::ostringstream exception;
5072 Corpus::locale(exception);
5074 exception <<
"Corpus::copyChunksContinuous(): Chunk is too large:";
5077 exception << chunkSize;
5083 inline void Corpus::exceptionArticleMapEnd(std::size_t pos, std::size_t size) {
5084 std::ostringstream exception;
5086 Corpus::locale(exception);
5088 exception <<
"Corpus::copyChunksContinuous(): End of articles, but not of corpus ( #";
5090 exception <<
" < #";
5098 inline void Corpus::exceptionUnexpectedBeforeSentence(
5099 std::string_view type,
5100 std::string_view name,
5102 std::size_t sentencePos
5104 std::ostringstream exception;
5106 Corpus::locale(exception);
5108 exception <<
"Corpus::copyChunksTokenized(): Unexpected begin of ";
5112 exception <<
"' (@";
5114 exception <<
") before the beginning of the current sentence (@";
5115 exception << sentencePos;
5122 inline void Corpus::exceptionMismatchWithDate(
5123 std::string_view type,
5127 std::ostringstream exception;
5129 Corpus::locale(exception);
5131 exception <<
"Corpus::filterByDate(): Mismatch between positions of ";
5133 exception <<
" (@ #";
5135 exception <<
") and date (@ #";
5136 exception << datePos;
5137 exception <<
") in ";
5139 exception <<
" and date map of the corpus";
5145 inline void Corpus::exceptionDateBehindLast(
5146 std::string_view type,
5147 std::size_t datePos,
5150 std::ostringstream exception;
5152 Corpus::locale(exception);
5154 exception <<
"Corpus::filterByDate(): Position of identified date (@ #";
5155 exception << datePos;
5156 exception <<
") is behind the position of the last ";
5158 exception <<
" (@ #";
5159 exception << lastPos;
5160 exception <<
") in ";
5162 exception <<
" and date map of the corpus";
5168 inline void Corpus::exceptionSentenceBehind(
5169 std::string_view
function,
5170 std::string_view type,
5171 const std::pair<std::size_t, std::size_t>& sentence,
5174 const TextMap::const_iterator& next,
5177 std::ostringstream exception;
5182 Corpus::locale(exception);
5184 exception <<
"Corpus::";
5185 exception <<
function;
5186 exception <<
"(): End of sentence (l=";
5187 exception << sentence.second;
5188 exception <<
") is behind end of ";
5191 exception << entry.
value;
5192 exception <<
"' (l=";
5195 exception << sentenceEnd;
5197 if(sentenceEnd > 0 && sentenceEnd <= tokens.size()) {
5199 exception << tokens.at(sentenceEnd - 1);
5202 else if(sentenceEnd == 0) {
5203 exception <<
" [BEGIN]";
5206 exception <<
" [BEHIND]";
5210 exception << entryEnd;
5212 if(entryEnd > 0 && entryEnd <= tokens.size()) {
5214 exception << tokens.at(entryEnd - 1);
5217 else if(entryEnd == 0) {
5218 exception <<
" [BEGIN]";
5221 exception <<
" [BEHIND]";
5225 exception <<
"sentence: '";
5227 bool addSpace{
false};
5229 for(std::size_t token{
TextMapEntry::pos(sentence)}; token < sentenceEnd; ++token) {
5230 if(token < tokens.size()) {
5238 exception << tokens.at(token);
5244 if(next != map.cend()) {
5245 exception <<
" (next ";
5248 exception << next->value;
5256 inline void Corpus::exceptionTokenBytes(
5257 std::string_view
function,
5259 std::size_t actualSize
5261 std::ostringstream exception;
5263 Corpus::locale(exception);
5265 exception <<
"Corpus::";
5266 exception <<
function;
5267 exception <<
"(): Corpus size is set to ";
5269 exception <<
"B, but actual corpus size is ";
5270 exception << actualSize;
5277 inline void Corpus::exceptionInvalidMaxChunkSize(std::size_t size, std::size_t max) {
5278 std::ostringstream exception;
5280 Corpus::locale(exception);
5282 exception <<
"Corpus::getValidLengthOfChunk(): Invalid maximum chunk size (";
5292 inline void Corpus::exceptionPositionTooSmall(
5294 std::size_t expectedMin,
5295 std::string_view name
5297 std::ostringstream exception;
5299 Corpus::locale(exception);
5301 exception <<
"Corpus::reTokenize(): Invalid position #";
5303 exception <<
" (expected: >= #";
5304 exception << expectedMin;
5305 exception <<
") in ";
5312 inline void Corpus::exceptionInvalidPosition(
5313 std::string_view
function,
5315 std::size_t expected,
5316 std::string_view name
5318 std::ostringstream exception;
5320 Corpus::locale(exception);
5322 exception <<
"Corpus::";
5323 exception <<
function;
5324 exception <<
"(): Invalid position #";
5326 exception <<
" (expected: #";
5327 exception << expected;
5328 exception <<
") in ";
5335 inline void Corpus::exceptionInvalidDate(
5336 std::string_view
function,
5337 std::string_view value,
5338 std::string_view name
5340 std::ostringstream exception;
5342 Corpus::locale(exception);
5344 exception <<
"Corpus::";
5345 exception <<
function;
5346 exception <<
"(): Invalid date in date map: '";
5348 exception <<
"' (expected string of length ";
5350 exception <<
") in '";
5358 inline void Corpus::exceptionInvalidEnd(
5359 std::string_view
function,
5361 std::size_t expected,
5362 std::string_view name
5364 std::ostringstream exception;
5366 Corpus::locale(exception);
5368 exception <<
"Corpus::";
5369 exception <<
function;
5370 exception <<
"(): Invalid end of last entry in map at #";
5372 exception <<
" (expected: at #";
5373 exception << expected;
5374 exception <<
") in ";
std::string getDate(const std::string &date) const
Gets all articles at the specified date from a continuous text corpus.
Definition: Corpus.hpp:1145
constexpr std::uint16_t corpusManipTrim
Trim tokens by tokens found in a dictionary.
Definition: Corpus.hpp:128
std::vector< std::size_t > Sizes
Definition: Corpus.hpp:172
TextMap dateMap
Index of dates.
Definition: Corpus.hpp:331
bool isRunning() const
Checks whether the thread is still supposed to run.
Definition: StatusSetter.hpp:236
bool update(std::size_t done, std::size_t total) const
Updates the status with a fractal progress.
Definition: StatusSetter.hpp:161
TextMap & getDateMap()
Gets a reference to the date map of the corpus.
Definition: Corpus.hpp:986
static std::size_t & pos(TextMapEntry &entry)
Gets a reference to the position of a text map entry.
Definition: TextMap.hpp:172
std::size_t filterArticles(const ArticleFunc &callbackArticle, StatusSetter &statusSetter)
Filters a tokenized corpus by removing articles.
Definition: Corpus.hpp:2674
std::string get(std::size_t index) const
Gets the article with the specified index from a continuous text corpus.
Definition: Corpus.hpp:1056
std::function< bool(const Tokens &, std::size_t, std::size_t)> ArticleFunc
Definition: Corpus.hpp:175
void stemGerman(std::string &token)
Stems a token in German.
Definition: German.hpp:118
static void moveInto(T &to, T &from)
Moves the elements of an iterable container into another iterable container.
Definition: Container.hpp:99
constexpr auto minSingleUtf8CharSize
Minimum length of single UTF-8 code points to remove.
Definition: Corpus.hpp:97
bool change(const std::string &statusMessage)
Changes the status message and resets the current progress.
Definition: StatusSetter.hpp:143
Class for corpus-specific exceptions.
Definition: Corpus.hpp:315
void copyChunksTokenized(std::size_t chunkSize, Tokens &to, Sizes &tokenNumsTo, std::vector< TextMap > &articleMapsTo, std::vector< TextMap > &dateMapsTo, std::vector< SentenceMap > &sentenceMapsTo) const
Copies the underlying tokenized text corpus into chunks of the given size.
Definition: Corpus.hpp:2194
const Tokens & getcTokens() const
Gets a constant reference to the tokens in a tokenized text corpus.
Definition: Corpus.hpp:923
std::vector< std::pair< std::size_t, std::size_t > > SentenceMap
Definition: Corpus.hpp:180
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
constexpr std::uint16_t corpusManipRemove
Remove single tokens found in a dictionary.
Definition: Corpus.hpp:125
Tokens getDateTokenized(const std::string &date) const
Gets the tokens of all articles at the specified date from a tokenized text corpus.
Definition: Corpus.hpp:1272
Text map entry.
Definition: TextMap.hpp:49
PositionLength SentenceMapEntry
Definition: Corpus.hpp:182
std::function< void(Tokens::iterator, Tokens::iterator)> SentenceFunc
Definition: Corpus.hpp:176
static std::locale locale()
Definition: CommaLocale.hpp:44
std::string substr(std::size_t from, std::size_t len)
Gets a substring from the corpus.
Definition: Corpus.hpp:1363
bool isLastCharValidUtf8(std::string_view stringToCheck)
Tokens tokens
Tokenized text corpus.
Definition: Corpus.hpp:325
bool hasArticleMap() const
Checks whether the corpus has an article map.
Definition: Corpus.hpp:949
std::map< std::string, std::map< std::string, std::vector< Tokens > >> DateArticleSentenceMap
Definition: Corpus.hpp:179
constexpr auto tokenizeUpdateEvery
After how many sentences the status is updated when tokenizing a corpus.
Definition: Corpus.hpp:91
std::vector< Tokens > getArticles() const
Gets the tokens of all articles from a tokenized corpus.
Definition: Corpus.hpp:1295
constexpr std::uint16_t corpusManipLemmatizer
Multilingual lemmatizer.
Definition: Corpus.hpp:122
Class representing a text corpus.
Definition: Corpus.hpp:165
void stemEnglish(std::string &token)
Stems a token in English.
Definition: English.hpp:61
const SentenceMap & getcSentenceMap() const
Gets a constant reference to the sentence map of the corpus.
Definition: Corpus.hpp:1035
constexpr std::uint16_t corpusManipCorrect
Correct single tokens using an aspell dictionary.
Definition: Corpus.hpp:131
bool hasSentenceMap() const
Checks whether the corpus has a sentence map.
Definition: Corpus.hpp:1005
void copyChunksContinuous(std::size_t chunkSize, Tokens &to, std::vector< TextMap > &articleMapsTo, std::vector< TextMap > &dateMapsTo) const
Copies the underlying continuous text corpus into chunks of the given size.
Definition: Corpus.hpp:1869
constexpr auto first
Index of the first byte.
Definition: Bytes.hpp:57
void clear()
Clears the corpus.
Definition: Corpus.hpp:3128
static void freeIf(bool isFree, T &target)
Frees memory early by swapping, if necessary.
Definition: Memory.hpp:52
constexpr auto dateLength
The length of a date string in the format YYYY-MM-DD.
Definition: Corpus.hpp:82
std::string & getCorpus()
Gets a reference to the continuous text corpus.
Definition: Corpus.hpp:869
constexpr auto filterUpdateEvery
After how many articles the status is updated when filtering a corpus (by queries).
Definition: Corpus.hpp:94
static T::size_type bytes(const T &container)
Returns the number of bytes in an iterable container.
Definition: Container.hpp:144
bool tokenizeCustom(const std::optional< SentenceFunc > &callback, std::uint64_t freeMemoryEvery, StatusSetter &statusSetter)
Converts a text corpus into processed tokens, using custom manipulators.
Definition: Corpus.hpp:3081
constexpr auto mergeUpdateEvery
After how many sentences the status is updated when merging corpora.
Definition: Corpus.hpp:88
constexpr std::uint16_t corpusManipTagger
The POS (part-of-speech) tagger based on Wapiti by Thomas Lavergne.
Definition: Corpus.hpp:110
bool filterByDate(const std::string &from, const std::string &to)
Filters a text corpus by the given date(s).
Definition: Corpus.hpp:2411
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< std::string > Tokens
Definition: Corpus.hpp:173
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
void combineContinuous(Tokens &chunks, std::vector< TextMap > &articleMaps, std::vector< TextMap > &dateMaps, bool deleteInputData)
Creates a continuous text corpus by combining previously separated chunks as well as their article and ...
Definition: Corpus.hpp:1499
constexpr std::uint16_t corpusManipNone
Do not manipulate anything.
Definition: Corpus.hpp:107
constexpr std::uint16_t corpusManipTaggerPosterior
The posterior POS tagger based on Wapiti by Thomas Lavergne (slow, but more accurate).
Definition: Corpus.hpp:113
Tokens & getTokens()
Gets a reference to the tokens in a tokenized text corpus.
Definition: Corpus.hpp:908
SentenceMap sentenceMap
Index of sentences.
Definition: Corpus.hpp:334
std::string corpus
Continuous text corpus.
Definition: Corpus.hpp:322
SentenceMap & getSentenceMap()
Gets a reference to the sentence map of the corpus.
Definition: Corpus.hpp:1019
std::pair< std::size_t, std::size_t > PositionLength
Definition: Corpus.hpp:181
Tokens getTokenized(std::size_t index) const
Gets the article with the specified index from a tokenized text corpus.
Definition: Corpus.hpp:1192
bool empty() const
Checks whether the corpus is empty.
Definition: Corpus.hpp:1337
bool isISODateInRange(std::string_view isoDate, std::string_view rangeFrom, std::string_view rangeTo)
Checks whether the given ISO date is in the given range of dates.
Definition: DateTime.hpp:1105
TextMap & getArticleMap()
Gets a reference to the article map of the corpus.
Definition: Corpus.hpp:958
constexpr std::uint16_t corpusManipEnglishStemmer
The porter2_stemmer algorithm for English only, implemented by Sean Massung.
Definition: Corpus.hpp:116
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
TextMap articleMap
Index of articles and their IDs.
Definition: Corpus.hpp:328
void create(Tokens &texts, bool deleteInputData)
Creates text corpus from a vector of strings.
Definition: Corpus.hpp:1386
std::size_t size() const
Gets the size of the text corpus, in bytes.
Definition: Corpus.hpp:1324
const TextMap & getcDateMap() const
Gets a constant reference to the date map of the corpus.
Definition: Corpus.hpp:995
constexpr std::uint16_t corpusManipGermanStemmer
Simple stemmer for German only, based on CISTEM by Leonie Weißweiler and Alexander Fraser...
Definition: Corpus.hpp:119
void copyContinuous(std::string &to) const
Copies the underlying continuous text corpus to the given string.
Definition: Corpus.hpp:1792
bool tokenize(const std::vector< std::uint16_t > &manipulators, const std::vector< std::string > &models, const std::vector< std::string > &dictionaries, const std::vector< std::string > &languages, std::uint64_t freeMemoryEvery, StatusSetter &statusSetter)
Converts a text corpus into processed tokens.
Definition: Corpus.hpp:2822
const TextMap & getcArticleMap() const
Gets a constant reference to the article map of the corpus.
Definition: Corpus.hpp:967
std::size_t getNumTokens() const
Gets the number of tokens in the corpus.
Definition: Corpus.hpp:937
bool hasDateMap() const
Checks whether the corpus has a date map.
Definition: Corpus.hpp:977
constexpr std::uint8_t utf8MaxBytes
Maximum number of bytes used by one UTF-8-encoded multibyte character.
Definition: Corpus.hpp:85
void finish() const
Re-sets the progress of the thread.
Definition: StatusSetter.hpp:241
std::string value
Value of the annotation.
Definition: TextMap.hpp:69
bool isTokenized() const
Gets whether the corpus has been tokenized.
Definition: Corpus.hpp:895
Corpus(bool consistencyChecks)
Constructor setting the internal property.
Definition: Corpus.hpp:807
Namespace for different types of data.
constexpr auto maxSingleUtf8CharSize
Maximum length of single UTF-8 code points to remove.
Definition: Corpus.hpp:100
const std::string & getcCorpus() const
Gets a constant reference to the continuous text corpus.
Definition: Corpus.hpp:883
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
static std::size_t & length(TextMapEntry &entry)
Gets a reference to the length of a text map entry.
Definition: TextMap.hpp:234
static std::size_t end(const T &entry)
Gets the end of a map entry.
Definition: TextMap.hpp:221
void combineTokenized(Tokens &chunks, Sizes &tokenNums, std::vector< TextMap > &articleMaps, std::vector< TextMap > &dateMaps, std::vector< SentenceMap > &sentenceMaps, bool deleteInputData)
Creates a tokenized text corpus by combining previously separated chunks, as well as their article...
Definition: Corpus.hpp:1662