crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TermsOverTime.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TermsOverTime.hpp
24  *
25  * Count occurrences of specific terms in a text corpus over time.
26  *
27  * Created on: Aug 2, 2020
28  * Author: ans
29  */
30 
31 #ifndef MODULE_ANALYZER_ALGO_TERMSOVERTIME_HPP_
32 #define MODULE_ANALYZER_ALGO_TERMSOVERTIME_HPP_
33 
34 #include "../Thread.hpp"
35 
36 #include "../../../Data/Corpus.hpp"
37 #include "../../../Helper/Memory.hpp"
38 #include "../../../Main/Database.hpp"
39 #include "../../../Struct/StatusSetter.hpp"
40 #include "../../../Struct/ThreadOptions.hpp"
41 #include "../../../Struct/ThreadStatus.hpp"
42 
43 #include <cstdint> // std::uint64_t
44 #include <limits> // std::numeric_limits
45 #include <string> // std::string
46 #include <string_view> // std::string_view
47 #include <unordered_map> // std::unordered_map
48 #include <vector> // std::vector
49 
51 
52  /*
53  * DECLARATION
54  */
55 
57  /*
58  * \todo Not implemented yet.
59  */
60  class TermsOverTime final : public Module::Analyzer::Thread { // algorithm counting specific terms in a text corpus over time; flagged as not implemented yet by the todo note preceding the class
61  // for convenience
62  using DataType = Data::Type; // shorthand for the data type enumeration
63  using DataValue = Data::Value; // shorthand for the generic value type
64  // NOTE(review): the line numbers of this listing have gaps (65, 67-69, 75-76, 78, 83, ...); the missing lines were presumably documentation comments or links dropped by the rendering -- confirm against the real header
66  // DateOccurrences maps a string key to an unsigned 64-bit count; presumably date -> number of occurrences -- TODO confirm
70  // DateArticlesOccurrences maps a string key to a DateOccurrences map; the exact meaning of both key levels is not visible here -- TODO confirm
71  using DateOccurrences = std::unordered_map<std::string, std::uint64_t>;
72  using DateArticlesOccurrences = std::unordered_map<std::string, DateOccurrences>;
73 
74  public:
77  // Construction. NOTE(review): listing lines 78 and 83 are absent; they presumably hold the opening 'TermsOverTime(' of the two constructor declarations whose parameter lists follow -- confirm
79  Main::Database& dbBase, // database access used by the thread
80  const ThreadOptions& threadOptions, // options of the thread
81  const ThreadStatus& threadStatus // status of the thread; this overload continues a previously interrupted algorithm run
82  );
84  Main::Database& dbBase, // second parameter list without a thread status; presumably the overload starting a new run -- TODO confirm
85  const ThreadOptions& threadOptions
86  );
87 
91  // Getter
92  std::string_view getName() const override; // returns the name of the algorithm
93 
97  // Algorithm event handlers (overrides of the analyzer thread interface)
98  void onAlgoInitTarget() override; // initializes the target table for the algorithm
99  void onAlgoInit() override; // generates the corpus
100  void onAlgoTick() override; // counts the terms in the text corpus
101  void onAlgoPause() override; // does nothing
102  void onAlgoUnpause() override; // does nothing
103  void onAlgoClear() override; // does nothing
104 
108  // Configuration handling
109  void parseAlgoOption() override; // parses a configuration option for the algorithm
110  void checkAlgoOptions() override; // does nothing
111  void resetAlgo() override; // resets the algorithm
112 
114 
115  private:
116  // algorithm options
117  struct Entries {
118  // TODO: add additional algorithm options (the structure has no members yet)
119  } algoConfig;
120 
121  // algorithm state
122  bool firstTick{true}; // starts out true; presumably cleared once the first tick has been handled -- TODO confirm in TermsOverTime.cpp
123 
124  // counts
125  std::vector<DateArticlesOccurrences> dateCounts; // one DateArticlesOccurrences map per element; what an element corresponds to is not visible in this header -- TODO confirm
126 
127  // internal helper functions
128  void count(); // presumably performs the counting step -- defined in TermsOverTime.cpp, verify there
129  void save(); // presumably stores the results -- defined in TermsOverTime.cpp, verify there
130  };
131 
132 } /* namespace crawlservpp::Module::Analyzer::Algo */
133 
134 #endif /* MODULE_ANALYZER_ALGO_TERMSOVERTIME_HPP_ */
void onAlgoClear() override
Does nothing.
Definition: TermsOverTime.cpp:179
TermsOverTime(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: TermsOverTime.cpp:43
void onAlgoUnpause() override
Does nothing.
Definition: TermsOverTime.cpp:176
Namespace for algorithm classes.
Definition: All.cpp:52
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: TermsOverTime.cpp:186
void checkAlgoOptions() override
Does nothing.
Definition: TermsOverTime.cpp:195
std::string_view getName() const override
Returns the name of the algorithm.
Definition: TermsOverTime.cpp:75
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: TermsOverTime.cpp:89
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Type
Data types.
Definition: Data.hpp:66
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
void onAlgoPause() override
Does nothing.
Definition: TermsOverTime.cpp:173
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
void onAlgoTick() override
Counts the terms in the text corpus.
Definition: TermsOverTime.cpp:159
void onAlgoInit() override
Generates the corpus.
Definition: TermsOverTime.cpp:102
A generic value.
Definition: Data.hpp:96
Algorithm counting specific terms in a text corpus over time.
Definition: TermsOverTime.hpp:60
void resetAlgo() override
Resets the algorithm.
Definition: TermsOverTime.cpp:203