31 #ifndef MODULE_ANALYZER_CONFIG_HPP_ 32 #define MODULE_ANALYZER_CONFIG_HPP_ 34 #include "../Config.hpp" 339 void reset()
override;
386 StringParsingOption::SQL
392 StringParsingOption::SQL
401 StringParsingOption::SQL
450 "Invalid corpus chunk size reset to " 452 +
"% of the maximum packet size allowed by the MySQL server." 457 const auto completeInputs{
465 bool incompleteInputs{
false};
471 incompleteInputs =
true;
478 incompleteInputs =
true;
486 incompleteInputs =
true;
490 if(incompleteInputs) {
492 "'input.fields', '.sources' and '.tables'" 493 " should have the same number of elements." 496 this->
warning(
"Incomplete input field(s) removed from configuration.");
505 "The configuration contains" 506 " more dictionaries than manipulators," 507 " redundant dictionaries will be ignored." 516 "The configuration contains" 517 " more models than manipulators," 518 " redundant models will be ignored." 527 "The configuration contains" 528 " more languages than manipulators," 529 " redundant languages will be ignored." void option(const std::string &name, bool &target)
Checks for a configuration option of type bool.
Definition: Config.hpp:573
constexpr std::uint8_t generalInputSourcesParsing
An analyzer uses a parsing table as data source.
Definition: Config.hpp:52
Namespace for analyzer classes.
constexpr auto maxPercentageCorpusSlices
Maximum percentage of the maximum length for corpus slices.
Definition: Config.hpp:88
std::vector< std::string > generalInputTables
Names of tables to be used as input.
Definition: Config.hpp:153
bool uploadVerbose
Specifies whether FTP network information will be printed to the server console while uploading the r...
Definition: Config.hpp:304
bool groupDateFillGaps
Enables filling the gaps in between dates.
Definition: Config.hpp:175
std::uint64_t tokenizerFreeMemoryEvery
Number of processed bytes in a continuous corpus after which memory will be freed.
Definition: Config.hpp:234
Config & operator=(Config &)=delete
Deleted copy assignment operator.
Abstract class as base for module-specific configurations.
Definition: Config.hpp:122
std::string uploadFTP
URL to upload a JSON file containing the results to.
Definition: Config.hpp:289
std::uint64_t generalSleepWhenFinished
Time (in ms) to wait each tick when finished.
Definition: Config.hpp:165
std::uint8_t generalCorpusSlicing
Corpus chunk size in percent of the maximum packet size allowed by the MySQL server.
Definition: Config.hpp:138
std::uint8_t groupDateResolution
The resolution to be used when grouping dates.
Definition: Config.hpp:187
std::vector< std::string > tokenizerLanguages
Language for the (token-based aspell) manipulator with the same array index.
Definition: Config.hpp:244
constexpr std::uint8_t generalLoggingVerbose
Verbose logging is enabled.
Definition: Config.hpp:73
~Config() override=default
Default destructor.
constexpr std::uint8_t generalLoggingSilent
Logging is disabled.
Definition: Config.hpp:64
constexpr auto minPercentageCorpusSlices
Minimum percentage of the maximum length for corpus slices.
Definition: Config.hpp:85
constexpr auto defaultPercentageCorpusSlices
Default percentage of the maximum length for corpus slices.
Definition: Config.hpp:91
std::vector< std::string > generalInputFields
Columns to be used from the input tables.
Definition: Config.hpp:141
constexpr std::uint64_t defaultSleepMySqlS
Default time (in s) to wait before last try to re-connect to MySQL server.
Definition: Config.hpp:79
bool generalCorpusChecks
Check the consistency of text corpora.
Definition: Config.hpp:135
void parseOption() override
Parses an analyzer-specific configuration option.
Definition: Config.hpp:378
void reset() override
Resets the analyzer-specific configuration options.
Definition: Config.hpp:551
constexpr std::uint64_t defaultSleepWhenFinishedMs
Default time (in ms) to wait each tick when finished.
Definition: Config.hpp:82
constexpr std::uint8_t generalLoggingDefault
Default logging is enabled.
Definition: Config.hpp:67
std::vector< std::uint16_t > tokenizerSavePoints
Steps after which the corpus will be stored in the database.
Definition: Config.hpp:279
std::vector< std::uint64_t > filterQueryQueries
Queries which need to be fulfilled for at least one token in an article in order to keep it...
Definition: Config.hpp:210
bool filterQueryAll
Specifies whether articles must contain a word fulfilling all of the queries instead of only of one o...
Definition: Config.hpp:213
constexpr std::uint8_t generalInputSourcesExtracting
An analyzer uses an extracting table as data source.
Definition: Config.hpp:55
std::string uploadProxy
URL of proxy to use while uploading a JSON file containing the results.
Definition: Config.hpp:295
std::vector< std::uint8_t > generalInputSources
Types of tables to be used as input.
Definition: Config.hpp:150
constexpr std::uint8_t generalInputSourcesCrawling
An analyzer uses a crawling table as data source.
Definition: Config.hpp:61
void category(const std::string &category)
Sets the category of the subsequent configuration items to be checked for.
Definition: Config.hpp:527
std::vector< std::string > tokenizerModels
Model for the (sentence-based) manipulator with the same array index.
Definition: Config.hpp:267
std::string generalTargetTable
Table name to save analyzed data to.
Definition: Config.hpp:168
void checkOptions() override
Checks the analyzer-specific configuration options.
Definition: Config.hpp:441
struct crawlservpp::Module::Analyzer::Config::Entries config
Configuration of the analyzer.
std::uint8_t generalLogging
Level of logging activity.
Definition: Config.hpp:156
virtual void checkAlgoOptions()=0
Checks the algorithm-specific configuration.
std::uint64_t generalSleepMySql
Time (in s) to wait before last try to re-connect to MySQL server.
Definition: Config.hpp:162
virtual void parseAlgoOption()=0
Parses an algorithm-specific configuration entry.
constexpr std::int32_t defaultRestartAfter
Default time (in s) after which to restart analysis once it has been completed (-1=deactivated).
Definition: Config.hpp:76
constexpr std::uint8_t generalLoggingExtended
Extended logging is enabled.
Definition: Config.hpp:70
constexpr auto defaultFreeMemoryEvery
Default number of processed bytes in a continuous corpus after which memory will be freed...
Definition: Config.hpp:94
std::string filterDateFrom
The date from which to filter the parsed data.
Definition: Config.hpp:197
Abstract configuration for analyzers, to be implemented by algorithm classes.
Definition: Config.hpp:103
std::vector< std::string > tokenizerDicts
Dictionary for the (token-based) manipulator with the same array index.
Definition: Config.hpp:227
std::string uploadTargetColumn
Name of the column in the target table to create the JSON file for uploading from.
Definition: Config.hpp:301
std::string filterDateTo
The date until which to filter the parsed data.
Definition: Config.hpp:200
Config()=default
Default constructor.
constexpr std::uint8_t generalInputSourcesAnalyzing
An analyzer uses an analyzing table as data source.
Definition: Config.hpp:58
std::int32_t generalRestartAfter
Time (in s) after which to restart analysis once it has been completed (-1=deactivated).
Definition: Config.hpp:159
void warning(const std::string &warning)
Adds a warning to the logging queue.
Definition: Config.hpp:2427
Configuration entries for analyzer threads.
Definition: Config.hpp:130
bool filterDateEnable
Enable filtering source data by date (only applies to parsed data).
Definition: Config.hpp:194
std::vector< std::uint16_t > tokenizerManipulators
Manipulators used on the text corpus.
Definition: Config.hpp:257
virtual void resetAlgo()=0
Resets the algorithm-specific configuration.