crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
WordsOverTime.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * WordsOverTime.hpp
24  *
25  * Counts the occurrence of articles, sentences, and tokens in a corpus
26  * over time.
27  *
28  * Created on: Jan 03, 2021
29  * Author: ans
30  */
31 
32 #ifndef MODULE_ANALYZER_ALGO_WORDSOVERTIME_HPP_
33 #define MODULE_ANALYZER_ALGO_WORDSOVERTIME_HPP_
34 
35 #include "../Thread.hpp"
36 
37 #include "../../../Data/Data.hpp"
38 #include "../../../Helper/DateTime.hpp"
39 #include "../../../Helper/Memory.hpp"
40 #include "../../../Main/Database.hpp"
41 #include "../../../Struct/StatusSetter.hpp"
42 #include "../../../Struct/TextMap.hpp"
43 #include "../../../Struct/ThreadOptions.hpp"
44 #include "../../../Struct/ThreadStatus.hpp"
45 
46 #include <cstddef> // std::size_t
47 #include <cstdint> // std::uint64_t
48 #include <limits> // std::numeric_limits
49 #include <map> // std::map
50 #include <string> // std::string
51 #include <string_view> // std::string_view
52 #include <unordered_set> // std::unordered_set
53 #include <utility> // std::pair
54 #include <vector> // std::vector
55 
57 
58  /*
59  * CONSTANTS
60  */
61 
64 
66  inline constexpr auto wordsUpdateProgressEvery{100}; // after how many date groups the progress of the thread will be updated
67 
69  inline constexpr auto wordsNumberOfColumns{4}; // number of columns to write to the target table
70 
72 
73  /*
74  * DECLARATION
75  */
76 
// Analyzer algorithm that counts the occurrence of articles, sentences, and
// tokens in a corpus over time. The class is final and derives from
// Module::Analyzer::Thread.
// NOTE(review): this listing omits some lines of the original header (for
// example the constructor names in front of the two parameter lists below);
// consult the real WordsOverTime.hpp before editing.
78  class WordsOverTime final : public Module::Analyzer::Thread {
79  // result record stored per map entry: a set of article strings plus two counters
80  struct DateResults {
81  std::unordered_set<std::string> articles; // a set, so each distinct string is stored once (presumably article IDs — confirm in WordsOverTime.cpp)
82  std::uint64_t sentences{}; // value-initialized, i.e. starts at zero
83  std::uint64_t words{}; // value-initialized, i.e. starts at zero
84  };
85 
86  // type aliases for convenience
88 
93 
94  using ResultMap = std::map<std::string, DateResults>; // ordered map from a string key to its DateResults (key is presumably the date group, see addDateGroup — confirm)
95  using StringString = std::pair<std::string, std::string>; // pair of two strings
96 
97  public:
100 
102  Main::Database& dbBase, // parameter list of the constructor that continues a previously interrupted algorithm run
103  const ThreadOptions& threadOptions,
104  const ThreadStatus& threadStatus
105  );
107  Main::Database& dbBase, // parameter list of a second overload that takes no thread status (presumably starts a new run — confirm)
108  const ThreadOptions& threadOptions
109  );
110 
114 
115  std::string_view getName() const override; // returns the name of the algorithm
116 
120 
121  void onAlgoInitTarget() override; // initializes the target table for the algorithm
122  void onAlgoInit() override; // generates the corpus
123  void onAlgoTick() override; // counts articles, sentences, and words
124  void onAlgoPause() override; // does nothing
125  void onAlgoUnpause() override; // does nothing
126  void onAlgoClear() override; // does nothing
127 
131 
132  void parseAlgoOption() override; // does nothing
133  void checkAlgoOptions() override; // does nothing
134  void resetAlgo() override; // resets the algorithm
135 
137 
138  private:
139  // algorithm state
140  bool firstTick{true}; // starts out true (presumably cleared once the first tick has run — confirm in WordsOverTime.cpp)
141 
142  // results
143  std::map<std::string, DateResults> dateResults; // same type as the ResultMap alias declared above
144  std::string previousDate; // NOTE(review): presumably the most recently processed date — confirm in WordsOverTime.cpp
145 
146  // algorithm functions (defined in WordsOverTime.cpp; behaviour is not visible in this header)
147  void count();
148  void save();
149 
150  // internal helper functions (defined in WordsOverTime.cpp)
151  ResultMap::iterator addDateGroup(const std::string& group); // returns an iterator into a ResultMap (presumably to the entry for the given group — confirm)
152  void fillGap(const std::string& table, const std::string& date); // takes a table name and a date string; NOTE(review): exact gap-filling semantics not visible here
153  void insertDataSet(const std::string& table, const std::string& date, const DateResults& results); // takes a table name, a date string, and one result record (read-only)
154  };
155 
156 } /* namespace crawlservpp::Module::Analyzer::Algo */
157 
158 #endif /* MODULE_ANALYZER_ALGO_WORDSOVERTIME_HPP_ */
Namespace for algorithm classes.
Definition: All.cpp:52
void onAlgoInit() override
Generates the corpus.
Definition: WordsOverTime.cpp:111
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
Text map entry.
Definition: TextMap.hpp:49
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: WordsOverTime.cpp:89
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
void onAlgoUnpause() override
Does nothing.
Definition: WordsOverTime.cpp:180
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::string_view getName() const override
Returns the name of the algorithm.
Definition: WordsOverTime.cpp:75
void parseAlgoOption() override
Does nothing.
Definition: WordsOverTime.cpp:190
constexpr auto wordsNumberOfColumns
The number of columns to write to the target table.
Definition: WordsOverTime.hpp:69
Counts the occurrence of articles, sentences, and tokens in a corpus over time.
Definition: WordsOverTime.hpp:78
void onAlgoClear() override
Does nothing.
Definition: WordsOverTime.cpp:183
void checkAlgoOptions() override
Does nothing.
Definition: WordsOverTime.cpp:193
WordsOverTime(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: WordsOverTime.cpp:44
void onAlgoTick() override
Counts articles, sentences, and words.
Definition: WordsOverTime.cpp:163
constexpr auto wordsUpdateProgressEvery
Indicates after how many date groups the progress of the thread will be updated.
Definition: WordsOverTime.hpp:66
void onAlgoPause() override
Does nothing.
Definition: WordsOverTime.cpp:177
void resetAlgo() override
Resets the algorithm.
Definition: WordsOverTime.cpp:201