|
| std::string & | getCorpus () |
| | Gets a reference to the continuous text corpus. More...
|
| |
| const std::string & | getcCorpus () const |
| | Gets a constant reference to the continuous text corpus. More...
|
| |
| bool | isTokenized () const |
| | Gets whether the corpus has been tokenized. More...
|
| |
| Tokens & | getTokens () |
| | Gets a reference to the tokens in a tokenized text corpus. More...
|
| |
| const Tokens & | getcTokens () const |
| | Gets a constant reference to the tokens in a tokenized text corpus. More...
|
| |
| std::size_t | getNumTokens () const |
| | Gets the number of tokens in the corpus. More...
|
| |
| bool | hasArticleMap () const |
| | Checks whether the corpus has an article map. More...
|
| |
| TextMap & | getArticleMap () |
| | Gets a reference to the article map of the corpus. More...
|
| |
| const TextMap & | getcArticleMap () const |
| | Gets a constant reference to the article map of the corpus. More...
|
| |
| bool | hasDateMap () const |
| | Checks whether the corpus has a date map. More...
|
| |
| TextMap & | getDateMap () |
| | Gets a reference to the date map of the corpus. More...
|
| |
| const TextMap & | getcDateMap () const |
| | Gets a constant reference to the date map of the corpus. More...
|
| |
| bool | hasSentenceMap () const |
| | Checks whether the corpus has a sentence map. More...
|
| |
| SentenceMap & | getSentenceMap () |
| | Gets a reference to the sentence map of the corpus. More...
|
| |
| const SentenceMap & | getcSentenceMap () const |
| | Gets a constant reference to the sentence map of the corpus. More...
|
| |
| std::string | get (std::size_t index) const |
| | Gets the article with the specified index from a continuous text corpus. More...
|
| |
| std::string | get (const std::string &id) const |
| | Gets the article with the specified ID from a continuous text corpus. More...
|
| |
| std::string | getDate (const std::string &date) const |
| | Gets all articles at the specified date from a continuous text corpus. More...
|
| |
| Tokens | getTokenized (std::size_t index) const |
| | Gets the article with the specified index from a tokenized text corpus. More...
|
| |
| Tokens | getTokenized (const std::string &id) const |
| | Gets the article with the specified ID from a tokenized corpus. More...
|
| |
| Tokens | getDateTokenized (const std::string &date) const |
| | Gets the tokens of all articles at the specified date from a tokenized text corpus. More...
|
| |
| std::vector< Tokens > | getArticles () const |
| | Gets the tokens of all articles from a tokenized corpus. More...
|
| |
| std::size_t | size () const |
| | Gets the size of the text corpus, in bytes. More...
|
| |
| bool | empty () const |
| | Checks whether the corpus is empty. More...
|
| |
| std::string | substr (std::size_t from, std::size_t len) |
| | Gets a substring from the corpus. More...
|
| |
|
| void | create (Tokens &texts, bool deleteInputData) |
| | Creates a text corpus from a vector of strings. More...
|
| |
| void | create (Tokens &texts, std::vector< std::string > &articleIds, std::vector< std::string > &dateTimes, bool deleteInputData) |
| | Creates a text corpus from parsed data, including article and date maps. More...
|
| |
| void | combineContinuous (Tokens &chunks, std::vector< TextMap > &articleMaps, std::vector< TextMap > &dateMaps, bool deleteInputData) |
| | Creates a continuous text corpus by combining previously separated chunks, as well as their article and date maps. More...
|
| |
| void | combineTokenized (Tokens &chunks, Sizes &tokenNums, std::vector< TextMap > &articleMaps, std::vector< TextMap > &dateMaps, std::vector< SentenceMap > &sentenceMaps, bool deleteInputData) |
| | Creates a tokenized text corpus by combining previously separated chunks, as well as their article, date and sentence maps. More...
|
| |
|
| void | copyContinuous (std::string &to) const |
| | Copies the underlying continuous text corpus to the given string. More...
|
| |
| void | copyContinuous (std::string &to, TextMap &articleMapTo, TextMap &dateMapTo) const |
| | Copies the underlying continuous text corpus, as well as its article and date map. More...
|
| |
| void | copyChunksContinuous (std::size_t chunkSize, Tokens &to, std::vector< TextMap > &articleMapsTo, std::vector< TextMap > &dateMapsTo) const |
| | Copies the underlying continuous text corpus into chunks of the given size. More...
|
| |
| void | copyChunksTokenized (std::size_t chunkSize, Tokens &to, Sizes &tokenNumsTo, std::vector< TextMap > &articleMapsTo, std::vector< TextMap > &dateMapsTo, std::vector< SentenceMap > &sentenceMapsTo) const |
| | Copies the underlying tokenized text corpus into chunks of the given size. More...
|
| |
Class representing a text corpus.
The corpus can include article and date maps, and it can be sliced into smaller chunks to fit into the database.
Article and date maps are saved as text map structures, each referencing a part of the text corpus and containing a label that indicates the article ID or date associated with that part of the corpus.
The corpus can be preprocessed using a number of manipulators, resulting in a tokenized corpus. If not preprocessed, it will be stored as continuous text.
- Note
- For filtering by date to work, all input data needs to be sorted by date, and texts without a date need to be added first.
- See also
- Struct::TextMap
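The text map concept above can be illustrated with a minimal, hypothetical sketch (the names `TextMapEntrySketch` and `findEntry` are illustrative, not the library's own `Struct::TextMap` types): each entry references a byte range of the corpus and carries a label such as an article ID or a date.

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Hypothetical sketch of a text map entry as described above: it references
// the part of the corpus at [pos, pos + length) and labels it with an
// article ID or a date.
struct TextMapEntrySketch {
    std::size_t pos{};     // offset of the referenced part in the corpus
    std::size_t length{};  // length of the referenced part in bytes
    std::string value;     // describing label, e.g. an article ID or "2024-01-31"
};

// Look up which entry covers a given byte offset (linear scan for clarity).
inline const TextMapEntrySketch* findEntry(
        const std::vector<TextMapEntrySketch>& map,
        std::size_t offset) {
    for (const auto& entry : map) {
        if (offset >= entry.pos && offset < entry.pos + entry.length) {
            return &entry;
        }
    }
    return nullptr;
}
```

Note that offsets between entries (e.g. the space delimiters inserted by create()) belong to no entry, which is why a lookup can come back empty.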
| void crawlservpp::Data::Corpus::combineContinuous ( Tokens & chunks, std::vector< TextMap > & articleMaps, std::vector< TextMap > & dateMaps, bool deleteInputData ) |
inline
Creates a continuous text corpus by combining previously separated chunks, as well as their article and date maps.
Performs consistency checks of the provided article maps, if consistency checks have been enabled on construction.
- Note
- If a corpus already exists, it will be cleared.
- Parameters
-
| chunks | Reference to a vector containing strings with the text of the chunks. |
| articleMaps | Reference to a vector containing the article maps of the chunks. |
| dateMaps | Reference to a vector containing the date maps of the chunks. |
| deleteInputData | If true, the given texts, article and date maps will be cleared, freeing the used memory early. |
- Exceptions
-
| Corpus::Exception | if consistency checks are enabled and an article map does not start at the beginning of its corpus chunk. |
- See also
- copyChunksContinuous
References articleMap, clear(), corpus, dateMap, crawlservpp::Helper::Bytes::first, crawlservpp::Helper::Memory::freeIf(), crawlservpp::Struct::TextMapEntry::length(), and crawlservpp::Struct::TextMapEntry::pos().
Referenced by crawlservpp::Module::Analyzer::Database::checkSources().
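The offset arithmetic behind combining chunks can be sketched as follows, using simplified, hypothetical types (the real member function also merges adjacent map entries with the same label, runs the consistency checks described above, and can free the input early): each chunk's map entries are shifted by the length of the text combined so far.

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Simplified stand-in for a text map entry (illustrative, not the library type).
struct EntrySketch {
    std::size_t pos{};
    std::size_t length{};
    std::string value;
};

// Combine chunk texts into one corpus, shifting each chunk's map entries by
// the number of bytes already combined.
inline void combineSketch(
        const std::vector<std::string>& chunks,
        const std::vector<std::vector<EntrySketch>>& maps,
        std::string& corpusTo,
        std::vector<EntrySketch>& mapTo) {
    for (std::size_t i = 0; i < chunks.size(); ++i) {
        const std::size_t offset = corpusTo.size();
        corpusTo += chunks[i];
        for (const auto& entry : maps[i]) {
            mapTo.push_back({entry.pos + offset, entry.length, entry.value});
        }
    }
}
```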
| void crawlservpp::Data::Corpus::combineTokenized ( Tokens & chunks, Sizes & tokenNums, std::vector< TextMap > & articleMaps, std::vector< TextMap > & dateMaps, std::vector< SentenceMap > & sentenceMaps, bool deleteInputData ) |
inline
Creates a tokenized text corpus by combining previously separated chunks, as well as their article, date and sentence maps.
Performs consistency checks of the provided article maps, if consistency checks have been enabled on construction.
- Note
- If a corpus already exists, it will be cleared.
- Parameters
-
| chunks | Reference to a vector containing strings with the text of the chunks. |
| tokenNums | Reference to a vector containing the number of tokens in each chunk. |
| articleMaps | Reference to a vector containing the article maps of the chunks. |
| dateMaps | Reference to a vector containing the date maps of the chunks. |
| sentenceMaps | Reference to a vector containing the sentence maps of the chunks. |
| deleteInputData | If true, the given texts, token counts, as well as article, date and sentence maps will be cleared, freeing the used memory early. |
- Exceptions
-
| Corpus::Exception | if the corpus is not empty and no sentence map is given, if the combined sentence map is empty, or if consistency checks are enabled and
- an article map or a sentence map does not start at the beginning of its corpus chunk
- the length of the first sentence in a chunk conflicts with the length given in the previous chunk
- the length of the last sentence in the combined corpus exceeds the length of the corpus itself
- more token counts, article maps, date maps and/or sentence maps are given than corpus chunks
 |
- See also
- copyChunksTokenized
References articleMap, clear(), dateMap, crawlservpp::Struct::TextMapEntry::end(), crawlservpp::Helper::Memory::freeIf(), crawlservpp::Struct::TextMapEntry::length(), crawlservpp::Struct::TextMapEntry::pos(), sentenceMap, and tokens.
Referenced by crawlservpp::Module::Analyzer::Database::checkSources().
| void crawlservpp::Data::Corpus::copyChunksContinuous ( std::size_t chunkSize, Tokens & to, std::vector< TextMap > & articleMapsTo, std::vector< TextMap > & dateMapsTo ) const |
inline
Copies the underlying continuous text corpus into chunks of the given size.
If the text corpus has an article and/or a date map, a corresponding article and/or date map will be created for each of the corpus chunks.
If the text corpus contains multi-byte UTF-8-encoded characters, they will not be split across chunks, so the resulting chunks may differ slightly in size.
- Parameters
-
| chunkSize | The maximum chunk size in bytes. |
| to | Reference to a vector of strings to which the texts of the corpus chunks will be appended. |
| articleMapsTo | Reference to a vector of text map structures, to which the article maps of the chunks will be appended. The vector will not be changed if the text corpus does not possess an article map. |
| dateMapsTo | Reference to a vector of text map structures, to which the date maps of the chunks will be appended. The vector will not be changed if the text corpus does not possess a date map. |
- Exceptions
-
| Corpus::Exception | if the chunk size is zero and the corpus is non-empty, if the corpus has already been tokenized, or if consistency checks have been enabled and
- the article map and the date map contradict each other
- one of the chunks created is larger than the maximum chunk size given
- the article map does not describe the whole corpus
- the last chunk created is empty
|
- See also
- combineContinuous
References articleMap, corpus, dateMap, crawlservpp::Struct::TextMapEntry::end(), and crawlservpp::Struct::TextMapEntry::length().
Referenced by crawlservpp::Module::Analyzer::Database::checkSources().
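The reason chunk sizes can differ slightly can be sketched in a few lines (a hypothetical helper, not the library's implementation): a chunk boundary is moved back so it never lands inside a multi-byte UTF-8 sequence. In UTF-8, continuation bytes always match the bit pattern 10xxxxxx.

```cpp
#include <algorithm>
#include <cstddef>
#include <string>

// Find the end of a chunk starting at 'from' that is at most maxChunkSize
// bytes long, stepping back so no multi-byte UTF-8 character is split.
inline std::size_t utf8SafeChunkEnd(const std::string& text,
                                    std::size_t from,
                                    std::size_t maxChunkSize) {
    std::size_t end = std::min(from + maxChunkSize, text.size());
    // step back while the byte at 'end' is a UTF-8 continuation byte (10xxxxxx)
    while (end > from
            && end < text.size()
            && (static_cast<unsigned char>(text[end]) & 0xC0) == 0x80) {
        --end;
    }
    return end;
}
```

For example, with the two-byte character "ä" (0xC3 0xA4), a boundary that would fall between its two bytes is moved back before the 0xC3 lead byte.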
| void crawlservpp::Data::Corpus::copyChunksTokenized ( std::size_t chunkSize, Tokens & to, Sizes & tokenNumsTo, std::vector< TextMap > & articleMapsTo, std::vector< TextMap > & dateMapsTo, std::vector< SentenceMap > & sentenceMapsTo ) const |
inline
Copies the underlying tokenized text corpus into chunks of the given size.
A corresponding sentence map will be created for each of the corpus chunks, although the length of the last sentence in a chunk might exceed the length of the chunk itself.
If the text corpus has an article and/or a date map, a corresponding article and/or date map will be created for each of the corpus chunks.
If the text corpus contains multi-byte UTF-8-encoded characters, they will not be split across chunks, so the resulting chunks may differ slightly in size.
- Parameters
-
| chunkSize | The maximum chunk size in bytes. |
| to | Reference to a vector of strings to which the texts of the corpus chunks will be appended. |
| tokenNumsTo | Reference to a vector to which the number of tokens for each chunk will be written. |
| articleMapsTo | Reference to a vector of text map structures, to which the article maps of the chunks will be appended. The vector will not be changed if the text corpus does not possess an article map. |
| dateMapsTo | Reference to a vector of text map structures, to which the date maps of the chunks will be appended. The vector will not be changed if the text corpus does not possess a date map. |
| sentenceMapsTo | Reference to a vector of [pos;length] pairs, to which the sentence maps of the chunks will be appended. The vector will not be changed if the text corpus does not possess a sentence map. |
- Exceptions
-
| Corpus::Exception | if the chunk size is zero and the corpus is non-empty, the corpus has not been tokenized, the sentence map for a non-empty corpus is empty, or if consistency checks have been enabled and
- the article map, the date map and/or the sentence map contradict each other
- one of the chunks created is larger than the maximum chunk size given
- the article map or the sentence map does not describe the whole corpus
- the last chunk created is empty
|
- See also
- combineTokenized
References articleMap, dateMap, sentenceMap, size(), and tokens.
Referenced by crawlservpp::Module::Analyzer::Database::checkSources().
| void crawlservpp::Data::Corpus::create ( Tokens & texts, std::vector< std::string > & articleIds, std::vector< std::string > & dateTimes, bool deleteInputData ) |
inline
Creates a text corpus from parsed data, including article and date maps.
Concatenates all given texts, delimiting them with single spaces, and saves the given article IDs and dates/times in the article and date maps.
- Note
- If a corpus already exists, it will be cleared.
- Parameters
-
| texts | A reference to the vector containing texts to create the corpus from. |
| articleIds | A reference to the vector containing the article IDs for the given texts. |
| dateTimes | A reference to the vector containing the date/times for the given texts. |
| deleteInputData | If true, the given texts, article IDs and date/times will be cleared, freeing the used memory early. |
References clear(), corpus, create(), dateMap, crawlservpp::Helper::Memory::freeIf(), and crawlservpp::Struct::TextMapEntry::value.
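The concatenation described above can be sketched with simplified, hypothetical types (the real member function also builds the date map, runs consistency checks, and can free the input early via deleteInputData): texts are joined with single spaces while the article map records where each text ends up in the combined corpus.

```cpp
#include <cstddef>
#include <string>
#include <vector>

// Simplified stand-in for an article map entry (illustrative only).
struct ArticleEntrySketch {
    std::size_t pos{};
    std::size_t length{};
    std::string id;
};

// Join the texts with single spaces and record each text's position,
// length, and article ID in the article map.
inline std::string createSketch(const std::vector<std::string>& texts,
                                const std::vector<std::string>& articleIds,
                                std::vector<ArticleEntrySketch>& articleMapTo) {
    std::string corpus;
    for (std::size_t i = 0; i < texts.size(); ++i) {
        if (!corpus.empty()) {
            corpus += ' ';  // space delimiter between texts
        }
        articleMapTo.push_back({corpus.size(), texts[i].size(), articleIds[i]});
        corpus += texts[i];
    }
    return corpus;
}
```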
| bool crawlservpp::Data::Corpus::filterByDate ( const std::string & from, const std::string & to ) |
inline
Filters a text corpus by the given date(s).
Afterwards, the corpus will only contain text marked with the given date(s), or be empty if the given date(s) do not correspond to any part of the corpus.
If the given strings are empty, no action will be performed.
- Parameters
-
| from | Constant reference to a string containing the date to be filtered from, in the format YYYY-MM-DD. |
| to | Constant reference to a string containing the date to be filtered to, in the format YYYY-MM-DD. |
- Returns
- True, if the corpus has been changed as a result of filtering by the given date(s). False, if it remains unchanged.
- Exceptions
-
| Corpus::Exception | if consistency checks have been enabled on construction and the date map does not start at the beginning of the corpus, the date map and the article map contradict each other, or they contain other inconsistencies. |
References articleMap, clear(), corpus, dateMap, crawlservpp::Struct::TextMapEntry::end(), crawlservpp::Helper::DateTime::isISODateInRange(), crawlservpp::Struct::TextMapEntry::pos(), sentenceMap, and tokens.
Referenced by crawlservpp::Module::Analyzer::Database::getCorpus().
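The date check underlying the filter (crawlservpp::Helper::DateTime::isISODateInRange()) can be sketched as follows; the helper below is a hypothetical stand-in, but it rests on a real property of the format: ISO dates (YYYY-MM-DD) order correctly under plain lexicographic string comparison, so a range test is just two comparisons.

```cpp
#include <string>

// Hypothetical sketch of an ISO date range check: empty bounds mean the
// respective side of the range is open, matching the documented behaviour
// that empty strings disable filtering.
inline bool isISODateInRangeSketch(const std::string& date,
                                   const std::string& from,
                                   const std::string& to) {
    if (!from.empty() && date < from) {
        return false;  // before the start of the range
    }
    if (!to.empty() && date > to) {
        return false;  // after the end of the range
    }
    return true;
}
```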
| bool crawlservpp::Data::Corpus::tokenize ( const std::vector< std::uint16_t > & manipulators, const std::vector< std::string > & models, const std::vector< std::string > & dictionaries, const std::vector< std::string > & languages, std::uint64_t freeMemoryEvery, StatusSetter & statusSetter ) |
inline
Converts a text corpus into processed tokens.
The given manipulators are applied to each sentence (or to its individual tokens) first.
Please make sure that the corpus has been tidied beforehand, i.e. that UTF-8 whitespaces and other non-space whitespace characters have been replaced by simple spaces. If needed, sentences are created by simple punctuation analysis.
- Note
- Once tokenized, the continuous text corpus will be lost. Create a copy beforehand if you still need the original corpus.
- Warning
- The vectors containing the manipulators, models, dictionaries, and languages must have the same number of elements.
- Parameters
-
| manipulators | A vector containing the IDs of the manipulators that will be used on all sentences (or all of their tokens) in the corpus, where every sentence is separated from the others by one of the following punctuation marks: .:!?; or by the end of the current article, date, or the whole corpus. |
| models | A vector of strings containing the model to be used by the manipulator with the same array index, or an empty string if the respective manipulator does not require a model. |
| dictionaries | A vector of strings containing the dictionary to be used by the manipulator with the same array index, or an empty string if the respective manipulator does not require a dictionary. |
| languages | A vector of strings containing the language to be used by the manipulator with the same array index, or an empty string if the respective manipulator does not require a language or its default language should be used. |
| freeMemoryEvery | Number of processed bytes in a continuous corpus after which memory will be freed. If zero, memory will only be freed after processing is complete. |
| statusSetter | Reference to a structure containing callbacks for updating the status and checking whether the thread is still supposed to be running. |
- Returns
- True, if the corpus has been successfully tokenized. False, if tokenization has been cancelled.
- Exceptions
-
| Corpus::Exception | if an invalid manipulator has been specified, a model or dictionary is missing for a manipulator requiring one, or a model, dictionary or language is set for a manipulator that does not use one. |
References crawlservpp::Data::corpusManipCorrect, crawlservpp::Data::corpusManipEnglishStemmer, crawlservpp::Data::corpusManipGermanStemmer, crawlservpp::Data::corpusManipLemmatizer, crawlservpp::Data::corpusManipNone, crawlservpp::Data::corpusManipRemove, crawlservpp::Data::corpusManipTagger, crawlservpp::Data::corpusManipTaggerPosterior, crawlservpp::Data::corpusManipTrim, crawlservpp::Data::Stemmer::stemEnglish(), crawlservpp::Data::Stemmer::stemGerman(), and tokenizeCustom().
Referenced by crawlservpp::Module::Analyzer::Database::checkSources().
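The "simple punctuation analysis" mentioned above can be sketched with a hypothetical helper (the real implementation additionally ends sentences at article, date, and corpus boundaries, and handles the manipulator pipeline): a sentence ends at any of the punctuation marks .:!?; listed in the parameter description.

```cpp
#include <string>
#include <vector>

// Split text into sentences at the punctuation marks . : ! ? ;
// keeping the delimiter at the end of each sentence.
inline std::vector<std::string> splitSentencesSketch(const std::string& text) {
    static const std::string delims{".:!?;"};
    std::vector<std::string> sentences;
    std::string current;
    for (const char c : text) {
        current += c;
        if (delims.find(c) != std::string::npos) {
            sentences.push_back(current);
            current.clear();
        }
    }
    if (!current.empty()) {
        sentences.push_back(current);  // trailing text without a delimiter
    }
    return sentences;
}
```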
| bool crawlservpp::Data::Corpus::tokenizeCustom ( const std::optional< SentenceFunc > & callback, std::uint64_t freeMemoryEvery, StatusSetter & statusSetter ) |
inline
Converts a text corpus into processed tokens, using custom manipulators.
If a sentence manipulator is given, first the sentence as a whole will be manipulated, then the individual tokens contained in this sentence.
Please make sure that the corpus has been tidied beforehand, i.e. that UTF-8 whitespaces and other non-space whitespace characters have been replaced by simple spaces. If needed, sentences are created by simple punctuation analysis.
- Warning
- Once tokenized, the continuous text corpus will be lost. Create a copy beforehand if you still need the original corpus.
- Parameters
-
| callback | Optional callback function (or lambda) that will be used on all sentences in the corpus, where every sentence is separated from the others by one of the following punctuation marks: .:;!? or by the end of the current article, date, or the whole corpus. A token will not be added to the corpus if the callback function empties it. |
| freeMemoryEvery | Number of processed bytes in a continuous corpus after which memory will be freed. If zero, memory will only be freed after processing is complete. |
| statusSetter | Reference to a structure containing callbacks for updating the status and checking whether the thread is still supposed to be running. |
- Returns
- True, if the corpus has been successfully tokenized. False, if tokenization has been cancelled.
- Exceptions
-
| Corpus::Exception | if consistency checks are enabled and the article and/or date map are inconsistent with the content of the corpus. |
References crawlservpp::Struct::StatusSetter::finish().
Referenced by crawlservpp::Module::Analyzer::Database::checkSources(), and tokenize().
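The documented callback behaviour (a token emptied by the callback is dropped from the corpus) can be sketched with a simplified, hypothetical signature; the real SentenceFunc type and the surrounding tokenization loop differ, but the filtering rule is the same.

```cpp
#include <functional>
#include <string>
#include <utility>
#include <vector>

// Hypothetical stand-in for the sentence callback: it may modify the
// sentence's tokens in place.
using SentenceFuncSketch = std::function<void(std::vector<std::string>&)>;

// Apply the callback to a sentence, then keep only the tokens that the
// callback did not empty.
inline std::vector<std::string> applyCallbackSketch(
        std::vector<std::string> sentence,
        const SentenceFuncSketch& callback) {
    if (callback) {
        callback(sentence);
    }
    std::vector<std::string> kept;
    for (auto& token : sentence) {
        if (!token.empty()) {
            kept.push_back(std::move(token));  // emptied tokens are dropped
        }
    }
    return kept;
}
```

For example, a callback that clears stop words effectively removes them from the tokenized corpus.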