crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
AssocOverTime.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * AssocOverTime.hpp
24  *
25  * Algorithm counting associations between the keyword and
26  * different categories over time.
27  *
28  * Created on: Oct 10, 2020
29  * Author: ans
30  */
31 
32 #ifndef MODULE_ANALYZER_ALGO_ASSOCOVERTIME_HPP_
33 #define MODULE_ANALYZER_ALGO_ASSOCOVERTIME_HPP_
34 
35 #include "../Thread.hpp"
36 
37 #include "../../../Data/Data.hpp"
38 #include "../../../Helper/DateTime.hpp"
39 #include "../../../Helper/Memory.hpp"
40 #include "../../../Main/Database.hpp"
41 #include "../../../Struct/QueryStruct.hpp"
42 #include "../../../Struct/StatusSetter.hpp"
43 #include "../../../Struct/TextMap.hpp"
44 #include "../../../Struct/ThreadOptions.hpp"
45 #include "../../../Struct/ThreadStatus.hpp"
46 
47 #include <algorithm> // std::all_of, std::min, std::sort
48 #include <cstddef> // std::size_t
49 #include <cstdint> // std::uint16_t, std::uint64_t
50 #include <limits> // std::numeric_limits
51 #include <queue> // std::queue
52 #include <string> // std::string, std::to_string
53 #include <string_view> // std::string_view
54 #include <unordered_map> // std::unordered_map
55 #include <utility> // std::pair
56 #include <vector> // std::vector
57 
59 
60  /*
61  * CONSTANTS
62  */
63 
66 
68  inline constexpr auto assocOverTimeUpdateProgressEvery{100}; /* while saving, the number of rows after which the progress of the thread is updated */
69 
71  inline constexpr auto assocOverTimeAddColumns{2}; /* extra columns in a dataset besides the date: number of articles, occurrences */
72 
74  inline constexpr auto assocOverTimeMinColumns{assocOverTimeAddColumns + 1 /* date */}; /* minimum number of columns in a dataset, including the date column */
75 
77 
78  /*
79  * DECLARATION
80  */
81 
83 
87  class AssocOverTime final : public Module::Analyzer::Thread { // algorithm counting associations between the keyword and different categories over time; cannot be derived from (final)
88  // for convenience (type aliases used throughout the class)
91  using TextMap = Struct::TextMap; // text map type declared in Struct/TextMap.hpp
95 
97 
98  using StringString = std::pair<std::string, std::string>; // pair of two strings
99  using Results = std::vector<std::pair<std::string, std::vector<std::uint64_t>>>; // list of (string, counters) pairs; NOTE(review): the string is presumably the date of a row - confirm against AssocOverTime.cpp
100 
101  public:
104  // NOTE(review): listing lines 105 and 110, which name the next two declarations, are missing from this view; the parameter lists below belong to the AssocOverTime constructors
106  Main::Database& dbBase, // database access passed in by reference
107  const ThreadOptions& threadOptions, // options of the thread
108  const ThreadStatus& threadStatus // status of the thread; this three-parameter overload continues a previously interrupted algorithm run
109  );
111  Main::Database& dbBase, // database access passed in by reference
112  const ThreadOptions& threadOptions // options of the thread; NOTE(review): without a status this overload presumably starts a new run - confirm
113  );
114 
118  // getter
119  std::string_view getName() const override; // returns the name of the algorithm
120 
124  // algorithm events overridden from the base thread class
125  void onAlgoInitTarget() override; // initializes the target table for the algorithm
126  void onAlgoInit() override; // generates the corpus
127  void onAlgoTick() override; // calculates the associations in the text corpus
128  void onAlgoPause() override; // does nothing
129  void onAlgoUnpause() override; // does nothing
130  void onAlgoClear() override; // does nothing
131 
135  // configuration handling and reset
136  void parseAlgoOption() override; // parses a configuration option for the algorithm
137  void checkAlgoOptions() override; // checks the configuration options for the algorithm
138  void resetAlgo() override; // resets the algorithm
139 
141 
142  private:
143  // custom structure holding the positions collected for one entry of an ArticleAssociationMap
144  struct Associations {
145  std::vector<std::uint64_t> keywordPositions; // positions of the keyword; NOTE(review): presumably token indices - confirm
146  std::vector<std::vector<std::uint64_t>> categoriesPositions; // one vector of positions per category; NOTE(review): presumably indexed like categoryQueries - confirm
147  std::uint64_t offset{}; // value-initialized to zero; its exact meaning is not visible in this header
148  };
149 
150  using DateAssociationMap = std::unordered_map<std::string, std::unordered_map<std::string, Associations>>; // string key -> (string key -> Associations); by the alias names: date -> article -> associations
151  using DateAssociation = std::pair<std::string, std::unordered_map<std::string, Associations>>; // pair with the same key/value types as one DateAssociationMap entry (key not const)
152  using ArticleAssociationMap = std::unordered_map<std::string, Associations>; // inner map type of DateAssociationMap
153  using ArticleAssociation = std::pair<std::string, Associations>; // pair with the same key/value types as one ArticleAssociationMap entry (key not const)
154 
155  // algorithm options (defaults are given by the in-class initializers)
156  struct Entries {
157  std::vector<std::string> categoryLabels; // labels of the categories
158  std::vector<std::uint64_t> categoryQueries; // numeric values identifying the category queries; NOTE(review): presumably query IDs - confirm
159  bool combineSources{true}; // enabled by default
160  bool ignoreEmptyDate{true}; // enabled by default
161  std::uint64_t keyWordQuery{}; // zero by default; NOTE(review): presumably the ID of the keyword query - confirm
162  std::uint16_t windowSize{1}; // one by default; NOTE(review): unit (presumably tokens around the keyword) to be confirmed
163  } algoConfig; // the single instance holding the options of this algorithm
164 
165  // algorithm queries
166  QueryStruct queryKeyWord; // query structure for the keyword
167 
168  std::vector<QueryStruct> queriesCategories; // query structures for the categories
169 
170  // algorithm state
171  DateAssociationMap associations; // associations collected so far
172 
173  std::string previousDate; // empty until assigned
174 
175  std::size_t currentCorpus{}; // all counters and indices below start at zero
176  std::size_t dateCounter{};
177  std::size_t firstDatePos{};
178  std::size_t dateMapSize{};
179  std::size_t articleIndex{};
180  std::size_t tokenIndex{};
181  std::size_t processedDates{};
182 
183  bool dateSaved{false}; // initially false
184 
185  // algorithm functions
186  void addCurrent();
187  void saveAssociations();
188  [[nodiscard]] Results processDates(); // returns the results by value; the return value must not be discarded
189  void saveResults(const Results& results); // takes the results by const reference
190 
191  // query functions overridden from the base class
192  void initQueries() override;
193  void deleteQueries() override;
194 
195  // internal helper functions (parameters ending in 'To' are non-const references, i.e. they can be written to)
196  void addArticlesForDate(
197  const TextMapEntry& date, // text map entry of the date
198  DateAssociationMap::iterator& dateIt, // iterator into a DateAssociationMap, passed by non-const reference
199  const TextMap& articleMap, // text map of the articles
200  const std::vector<std::string>& tokens, // tokens, read-only
201  std::queue<std::string>& warningsTo // queue of warning strings
202  );
203  DateAssociationMap::iterator addDate(const std::string& date); // returns an iterator into a DateAssociationMap
204  ArticleAssociationMap::iterator addArticleToDate(
205  const std::string& article,
206  DateAssociationMap::iterator date // iterator to the date entry, passed by value
207  ); // returns an iterator into an ArticleAssociationMap
208  void processToken(
209  const std::string& token,
210  Associations& associationsTo, // associations passed by non-const reference
211  std::queue<std::string>& warningsTo // queue of warning strings
212  );
213  void processDate(
214  const DateAssociation& date,
215  Results& resultsTo // results passed by non-const reference
216  );
217  void processArticle(
218  const ArticleAssociation& article,
219  std::size_t& occurrencesTo, // occurrence counter passed by non-const reference
220  std::vector<std::uint64_t>& catsCountersTo // category counters passed by non-const reference
221  );
222  void processTermOccurrence(
223  const ArticleAssociation& article,
224  std::uint64_t occurrence,
225  std::size_t& occurrencesTo, // occurrence counter passed by non-const reference
226  std::vector<std::uint64_t>& catsCountersTo // category counters passed by non-const reference
227  );
228  void processCategory(
229  const ArticleAssociation& article,
230  std::uint64_t termOccurrence,
231  std::size_t index,
232  std::vector<std::uint64_t>& catsCountersTo // category counters passed by non-const reference
233  );
234  bool processCategoryOccurrence(
235  std::uint64_t termOccurrence,
236  std::uint64_t catOccurrence,
237  std::size_t catIndex,
238  std::vector<std::uint64_t>& catsCountersTo // category counters passed by non-const reference
239  ) const; // const member function; the meaning of the returned bool is not visible in this header
240 
241  void fillGap(const std::string& table, const std::string& date, std::size_t numColumns); // takes a table name, a date and a number of columns
242  void insertDataSet(
243  const std::string& table,
244  const std::string& date,
245  const std::vector<std::uint64_t>& dataSet, // values of the dataset, read-only
246  std::size_t numColumns
247  );
248  };
249 
250 } /* namespace crawlservpp::Module::Analyzer::Algo */
251 
252 #endif /* MODULE_ANALYZER_ALGO_ASSOCOVERTIME_HPP_ */
void onAlgoPause() override
Does nothing.
Definition: AssocOverTime.cpp:191
constexpr auto assocOverTimeAddColumns
Number of extra columns included in a dataset (except date).
Definition: AssocOverTime.hpp:71
std::string_view getName() const override
Returns the name of the algorithm.
Definition: AssocOverTime.cpp:75
constexpr auto assocOverTimeMinColumns
Minimum number of columns included in a dataset (including date).
Definition: AssocOverTime.hpp:74
Namespace for algorithm classes.
Definition: All.cpp:52
Algorithm counting associations between the keyword and different categories over time.
Definition: AssocOverTime.hpp:87
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
void onAlgoUnpause() override
Does nothing.
Definition: AssocOverTime.cpp:194
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: AssocOverTime.cpp:89
Text map entry.
Definition: TextMap.hpp:49
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
AssocOverTime(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: AssocOverTime.cpp:44
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: AssocOverTime.cpp:204
void onAlgoClear() override
Does nothing.
Definition: AssocOverTime.cpp:197
void onAlgoInit() override
Generates the corpus.
Definition: AssocOverTime.cpp:125
void onAlgoTick() override
Calculates the associations in the text corpus.
Definition: AssocOverTime.cpp:177
void checkAlgoOptions() override
Checks the configuration options for the algorithm.
Definition: AssocOverTime.cpp:220
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
constexpr auto assocOverTimeUpdateProgressEvery
Indicates, while saving, after how many rows the progress of the thread will be updated.
Definition: AssocOverTime.hpp:68
void resetAlgo() override
Resets the algorithm.
Definition: AssocOverTime.cpp:299