crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
CorpusGenerator.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * CorpusGenerator.hpp
24  *
25  * This algorithm uses the built-in functionality for building text
26  * corpora from its input data.
27  *
28  * Additionally, it writes some basic statistics (number and length of
29  * tokens and sentences) to the target table.
30  *
31  * Created on: Mar 5, 2020
32  * Author: ans
33  */
34 
35 #ifndef MODULE_ANALYZER_ALGO_CORPUSGENERATOR_HPP_
36 #define MODULE_ANALYZER_ALGO_CORPUSGENERATOR_HPP_
37 
38 #include "../Thread.hpp"
39 
40 #include "../../../Data/Corpus.hpp"
41 #include "../../../Helper/Math.hpp"
42 #include "../../../Helper/Memory.hpp"
43 #include "../../../Helper/Utf8.hpp"
44 #include "../../../Main/Database.hpp"
45 #include "../../../Struct/StatusSetter.hpp"
46 #include "../../../Struct/TextMap.hpp"
47 #include "../../../Struct/ThreadOptions.hpp"
48 #include "../../../Struct/ThreadStatus.hpp"
49 
50 #include <cstddef> // std::size_t
51 #include <cstdint> // std::uint64_t
52 #include <limits> // std::numeric_limits
53 #include <string> // std::string
54 #include <string_view> // std::string_view
55 #include <utility> // std::pair
56 #include <vector> // std::vector
57 
59 
60  /*
61  * CONSTANTS
62  */
63 
66 
68  inline constexpr auto corpusNumFields{9}; // number of target fields
69 
71 
72  /*
73  * DECLARATION
74  */
75 
77 
81  // for convenience
82  using DataType = Data::Type;
83  using DataValue = Data::Value;
84 
86 
91 
92  using StringString = std::pair<std::string, std::string>;
93 
94  public:
97 
99  Main::Database& dbBase,
100  const ThreadOptions& threadOptions,
101  const ThreadStatus& threadStatus
102  );
104  Main::Database& dbBase,
105  const ThreadOptions& threadOptions
106  );
107 
111 
112  std::string_view getName() const override; // returns the name of the algorithm
113 
117 
118  void onAlgoInitTarget() override; // initializes the target table for the algorithm
119  void onAlgoInit() override; // generates the corpus
120  void onAlgoTick() override; // sleeps until the thread is terminated
121  void onAlgoPause() override; // does nothing
122  void onAlgoUnpause() override; // does nothing
123  void onAlgoClear() override; // does nothing
124 
128 
129  void parseAlgoOption() override; // does nothing
130  void checkAlgoOptions() override; // does nothing
131  void resetAlgo() override; // resets the algorithm
132 
134 
135  private:
136  // status message saved in-class
137  std::string status;
138 
139  // internal static helper function; takes a sentence given as a pair of sizes and the list of tokens (NOTE(review): presumably the pair indexes into the tokens and the result tells whether that sentence contains no non-empty token -- confirm in CorpusGenerator.cpp)
140  static bool isSentenceEmpty(
141  const std::pair<std::size_t, std::size_t>& sentence,
142  const std::vector<std::string>& tokens
143  );
144  };
145 
146 } /* namespace crawlservpp::Module::Analyzer::Algo */
147 
148 #endif /* MODULE_ANALYZER_ALGO_CORPUSGENERATOR_HPP_ */
CorpusGenerator(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: CorpusGenerator.cpp:47
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: CorpusGenerator.cpp:92
Namespace for algorithm classes.
Definition: All.cpp:52
void onAlgoUnpause() override
Does nothing.
Definition: CorpusGenerator.cpp:352
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Algorithm building a text corpus and creating corpus statistics from the input data.
Definition: CorpusGenerator.hpp:80
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void onAlgoClear() override
Does nothing.
Definition: CorpusGenerator.cpp:355
Text map entry.
Definition: TextMap.hpp:49
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Type
Data types.
Definition: Data.hpp:66
void parseAlgoOption() override
Does nothing.
Definition: CorpusGenerator.cpp:362
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
constexpr auto corpusNumFields
Number of target fields.
Definition: CorpusGenerator.hpp:68
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::string_view getName() const override
Returns the name of the algorithm.
Definition: CorpusGenerator.cpp:79
void onAlgoTick() override
Sleeps until the thread is terminated.
Definition: CorpusGenerator.cpp:340
A generic value.
Definition: Data.hpp:96
void onAlgoPause() override
Does nothing.
Definition: CorpusGenerator.cpp:349
void checkAlgoOptions() override
Does nothing.
Definition: CorpusGenerator.cpp:365
void resetAlgo() override
Resets the algorithm.
Definition: CorpusGenerator.cpp:373
void onAlgoInit() override
Generates the corpus.
Definition: CorpusGenerator.cpp:119