#include "Lemmatizer.hpp"
#include "Stemmer/English.hpp"
#include "Stemmer/German.hpp"
#include "Tagger.hpp"
#include "TokenCorrect.hpp"
#include "TokenRemover.hpp"
#include "../Helper/CommaLocale.hpp"
#include "../Helper/Container.hpp"
#include "../Helper/DateTime.hpp"
#include "../Helper/Memory.hpp"
#include "../Helper/Utf8.hpp"
#include "../Main/Exception.hpp"
#include "../Struct/StatusSetter.hpp"
#include "../Struct/TextMap.hpp"
#include <algorithm>
#include <cctype>
#include <functional>
#include <iterator>
#include <map>
#include <numeric>
#include <optional>
#include <ostream>
#include <sstream>
#include <utility>
#include <vector>

Include dependency graph for Corpus.hpp:

This graph shows which files directly or indirectly include this file:

Go to the source code of this file.

Classes
class	crawlservpp::Data::Corpus
	Class representing a text corpus. More...

class	crawlservpp::Data::Corpus::Exception
	Class for corpus-specific exceptions. More...

Namespaces
	crawlservpp::Data
	Namespace for different types of data.

Constants
constexpr auto	crawlservpp::Data::dateLength {10}
	The length of a date string in the format YYYY-MM-DD. More...

constexpr std::uint8_t	crawlservpp::Data::utf8MaxBytes {4}
	Maximum number of bytes used by one UTF-8-encoded multibyte character. More...

constexpr auto	crawlservpp::Data::mergeUpdateEvery {10000}
	After how many sentences the status is updated when merging corpora. More...

constexpr auto	crawlservpp::Data::tokenizeUpdateEvery {10000}
	After how many sentences the status is updated when tokenizing a corpus. More...

constexpr auto	crawlservpp::Data::filterUpdateEvery {10000}
	After how many articles the status is updated when filtering a corpus (by queries). More...

constexpr auto	crawlservpp::Data::minSingleUtf8CharSize {2}
	Minimum length of single UTF-8 code points to remove. More...

constexpr auto	crawlservpp::Data::maxSingleUtf8CharSize {4}
	Maximum length of single UTF-8 code points to remove. More...

Sentence and Token Manipulation
constexpr std::uint16_t	crawlservpp::Data::corpusManipNone {0}
	Do not manipulate anything. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipTagger {1}
	The POS (part-of-speech) tagger based on `Wapiti` by Thomas Lavergne. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipTaggerPosterior {2}
	The posterior POS tagger based on `Wapiti` by Thomas Lavergne (slow, but more accurate). More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipEnglishStemmer {3}
	The `porter2_stemmer` algorithm for English only, implemented by Sean Massung. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipGermanStemmer {4}
	Simple stemmer for German only, based on `CISTEM` by Leonie Weißweiler and Alexander Fraser. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipLemmatizer {5}
	Multilingual lemmatizer. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipRemove {6}
	Remove single tokens found in a dictionary. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipTrim {7}
	Trim tokens by tokens found in a dictionary. More...

constexpr std::uint16_t	crawlservpp::Data::corpusManipCorrect {8}
	Correct single tokens using an `aspell` dictionary. More...

Classes

Namespaces

Constants

Sentence and Token Manipulation