crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
ExtractIds.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2023 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * ExtractIds.hpp
24  *
25  * Extracts the parsed IDs from a filtered corpus.
26  *
27  * Created on: Jul 31, 2023
28  * Author: ans
29  */
30 
31 #ifndef MODULE_ANALYZER_ALGO_ExtractIds_HPP_
32 #define MODULE_ANALYZER_ALGO_ExtractIds_HPP_
33 
34 #include "../Thread.hpp"
35 
36 #include "../../../Data/Data.hpp"
37 #include "../../../Helper/DateTime.hpp"
38 #include "../../../Helper/Memory.hpp"
39 #include "../../../Main/Database.hpp"
40 #include "../../../Struct/StatusSetter.hpp"
41 #include "../../../Struct/TextMap.hpp"
42 #include "../../../Struct/ThreadOptions.hpp"
43 #include "../../../Struct/ThreadStatus.hpp"
44 
45 #include <cstddef> // std::size_t
46 #include <cstdint> // std::uint64_t
47 #include <limits> // std::numeric_limits
48 #include <set> // std::set
49 #include <string> // std::string
50 #include <string_view> // std::string_view
51 #include <unordered_set> // std::unordered_set
52 #include <utility> // std::pair
53 #include <vector> // std::vector
54 
56 
57  /*
58  * CONSTANTS
59  */
60 
63 
// Indicates after how many articles the progress of the thread will be updated.
65  inline constexpr auto extractIdsUpdateProgressEvery{1000};
66 
68 
69  /*
70  * DECLARATION
71  */
72 
// Algorithm class that extracts the parsed IDs from a filtered corpus.
// Declared final; derives from Module::Analyzer::Thread and overrides its
// algorithm hooks (all definitions live in ExtractIds.cpp).
74  class ExtractIds final : public Module::Analyzer::Thread {
75  // for convenience
77 
82 
    // Shorthand for a pair of two strings.
83  using StringString = std::pair<std::string, std::string>;
84 
85  public:
88 
    // Continues a previously interrupted algorithm run.
89  ExtractIds(
90  Main::Database& dbBase,
91  const ThreadOptions& threadOptions,
92  const ThreadStatus& threadStatus
93  );
    // Overload without a previous thread status.
    // NOTE(review): presumably starts a new algorithm run - confirm in ExtractIds.cpp.
94  ExtractIds(
95  Main::Database& dbBase,
96  const ThreadOptions& threadOptions
97  );
98 
102 
    // Returns the name of the algorithm.
103  std::string_view getName() const override;
104 
108 
    // Initializes the target table for the algorithm.
109  void onAlgoInitTarget() override;
    // Generates the corpus.
110  void onAlgoInit() override;
    // Extracts IDs from corpus.
111  void onAlgoTick() override;
    // Does nothing.
112  void onAlgoPause() override;
    // Does nothing.
113  void onAlgoUnpause() override;
    // Does nothing.
114  void onAlgoClear() override;
115 
119 
    // Does nothing.
120  void parseAlgoOption() override;
    // Does nothing.
121  void checkAlgoOptions() override;
    // Resets the algorithm.
122  void resetAlgo() override;
123 
125 
126  private:
127  // algorithm state
    // Starts out true.
    // NOTE(review): presumably cleared after the first onAlgoTick() - confirm in ExtractIds.cpp.
128  bool firstTick{true};
129 
130  // results
    // Ordered set of unique strings.
    // NOTE(review): presumably holds the extracted IDs - confirm in ExtractIds.cpp.
131  std::set<std::string> results;
132 
133  // algorithm functions
    // NOTE(review): presumably extract() fills `results` and save() writes them
    // to the target table - confirm in ExtractIds.cpp.
134  void extract();
135  void save();
136 
137  // internal helper function
    // NOTE(review): presumably inserts one result string into the given table -
    // confirm in ExtractIds.cpp.
138  void insertDataSet(const std::string& table, const std::string& result);
139  };
140 
141 } /* namespace crawlservpp::Module::Analyzer::Algo */
142 
143 #endif /* MODULE_ANALYZER_ALGO_ExtractIds_HPP_ */
void checkAlgoOptions() override
Does nothing.
Definition: ExtractIds.cpp:187
void onAlgoInit() override
Generates the corpus.
Definition: ExtractIds.cpp:105
Extracts the parsed IDs from a filtered corpus.
Definition: ExtractIds.hpp:74
Namespace for algorithm classes.
Definition: All.cpp:52
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void parseAlgoOption() override
Does nothing.
Definition: ExtractIds.cpp:184
Text map entry.
Definition: TextMap.hpp:49
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
void onAlgoClear() override
Does nothing.
Definition: ExtractIds.cpp:177
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
void onAlgoPause() override
Does nothing.
Definition: ExtractIds.cpp:171
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
constexpr auto extractIdsUpdateProgressEvery
Indicates after how many articles the progress of the thread will be updated.
Definition: ExtractIds.hpp:65
void onAlgoUnpause() override
Does nothing.
Definition: ExtractIds.cpp:174
void resetAlgo() override
Resets the algorithm.
Definition: ExtractIds.cpp:195
void onAlgoTick() override
Extracts IDs from corpus.
Definition: ExtractIds.cpp:157
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: ExtractIds.cpp:88
ExtractIds(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: ExtractIds.cpp:43
std::string_view getName() const override
Returns the name of the algorithm.
Definition: ExtractIds.cpp:74