crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Assoc.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Assoc.hpp
24  *
25  * Algorithm counting associations between the keyword and
26  * different categories per article.
27  *
28  * Created on: Nov 12, 2021
29  * Author: ans
30  */
31 
32 #ifndef MODULE_ANALYZER_ALGO_ASSOC_HPP_
33 #define MODULE_ANALYZER_ALGO_ASSOC_HPP_
34 
35 #include "../Thread.hpp"
36 
37 #include "../../../Data/Data.hpp"
38 #include "../../../Helper/DateTime.hpp"
39 #include "../../../Helper/Memory.hpp"
40 #include "../../../Main/Database.hpp"
41 #include "../../../Struct/QueryStruct.hpp"
42 #include "../../../Struct/StatusSetter.hpp"
43 #include "../../../Struct/TextMap.hpp"
44 #include "../../../Struct/ThreadOptions.hpp"
45 #include "../../../Struct/ThreadStatus.hpp"
46 
47 #include <algorithm> // std::all_of, std::min, std::sort
48 #include <cstddef> // std::size_t
49 #include <cstdint> // std::uint16_t, std::uint64_t
50 #include <limits> // std::numeric_limits
51 #include <queue> // std::queue
52 #include <string> // std::string, std::to_string
53 #include <string_view> // std::string_view
54 #include <unordered_map> // std::unordered_map
55 #include <utility> // std::pair
56 #include <vector> // std::vector
57 
59 
60  /*
61  * CONSTANTS
62  */
63 
66 
// While saving, the progress of the thread is updated after this many articles.
68  inline constexpr auto assocUpdateProgressEvery{1000};
69 
// Number of extra columns included in a dataset (not counting the date).
71  inline constexpr auto assocAddColumns{2}; /* ID of article, occurrences */
72 
// Minimum number of columns included in a dataset: the extra columns plus the date.
74  inline constexpr auto assocMinColumns{assocAddColumns + 1 /* date */};
75 
77 
78  /*
79  * DECLARATION
80  */
81 
83 
/*
 * Analyzer algorithm counting associations between the keyword and
 * different categories per article.
 *
 * Final class deriving from Module::Analyzer::Thread. Only declarations
 * are visible here; the definitions are expected in Assoc.cpp.
 *
 * NOTE(review): QueryStruct, TextMapEntry, ThreadOptions and ThreadStatus
 * are used unqualified below although no alias for them is visible in
 * this listing (its embedded line numbers skip 89-90 and 92-94). Confirm
 * they are declared in the original header or inherited from Thread.
 */
87  class Assoc final : public Module::Analyzer::Thread {
88  // for convenience
91  using TextMap = Struct::TextMap;
95 
97 
    // Pair of two strings.
98  using StringString = std::pair<std::string, std::string>;
99 
100  public:
103 
    // Continues a previously interrupted algorithm run.
104  Assoc(
105  Main::Database& dbBase,
106  const ThreadOptions& threadOptions,
107  const ThreadStatus& threadStatus
108  );
    // Overload without a thread status.
    // NOTE(review): presumably starts a new algorithm run; confirm in Assoc.cpp.
109  Assoc(
110  Main::Database& dbBase,
111  const ThreadOptions& threadOptions
112  );
113 
117 
    // Returns the name of the algorithm.
118  std::string_view getName() const override;
119 
123 
    // Initializes the target table for the algorithm.
124  void onAlgoInitTarget() override;
    // Generates the corpus.
125  void onAlgoInit() override;
    // Calculates the associations in the text corpus.
126  void onAlgoTick() override;
    // The following three overrides do nothing.
127  void onAlgoPause() override;
128  void onAlgoUnpause() override;
129  void onAlgoClear() override;
130 
134 
    // Parses a configuration option for the algorithm.
135  void parseAlgoOption() override;
    // Checks the configuration options for the algorithm.
136  void checkAlgoOptions() override;
    // Resets the algorithm.
137  void resetAlgo() override;
138 
140 
141  private:
142  // custom structures
    // Positions collected for one article: one list for the keyword and
    // one list per category, plus an offset (value-initialized to zero).
    // NOTE(review): presumably token positions, with the offset translating
    // corpus positions into article-relative ones; confirm in Assoc.cpp.
143  struct Associations {
144  std::vector<std::uint64_t> keywordPositions;
145  std::vector<std::vector<std::uint64_t>> categoriesPositions;
146  std::uint64_t offset{};
147  };
148 
    // One result row: article ID, date, an occurrence counter and one
    // counter per category.
    // NOTE(review): "nOccurences" is misspelled ("nOccurrences"), but the
    // name is part of the contract with Assoc.cpp, so it is left as is.
149  struct Result {
150  std::string articleId;
151  std::string date;
152  std::uint64_t nOccurences{};
153  std::vector<std::uint64_t> catCounters;
154  };
155 
    // Two-level map: string -> (string -> Associations); judging by the
    // alias names, the outer key is a date and the inner key an article.
    // NOTE(review): DateAssociation and ArticleAssociation use a non-const
    // key, so they are NOT the value_type of the corresponding maps.
    // Binding a map element to a const reference of these types creates a
    // converted temporary, i.e. a copy. Confirm that callers pass real
    // DateAssociation/ArticleAssociation objects.
156  using DateAssociationMap = std::unordered_map<std::string, std::unordered_map<std::string, Associations>>;
157  using DateAssociation = std::pair<std::string, std::unordered_map<std::string, Associations>>;
158  using ArticleAssociationMap = std::unordered_map<std::string, Associations>;
159  using ArticleAssociation = std::pair<std::string, Associations>;
160 
161  // algorithm options
    // Defaults: combineSources and ignoreEmptyDate are true, keyWordQuery
    // is zero, windowSize is one; both category vectors start empty.
162  struct Entries {
163  std::vector<std::string> categoryLabels;
164  std::vector<std::uint64_t> categoryQueries;
165  bool combineSources{true};
166  bool ignoreEmptyDate{true};
167  std::uint64_t keyWordQuery{};
168  std::uint16_t windowSize{1};
169  } algoConfig;
170 
171  // algorithm queries
172  QueryStruct queryKeyWord;
173 
174  std::vector<QueryStruct> queriesCategories;
175 
176  // algorithm state
177  DateAssociationMap associations;
178 
179  std::string previousDate;
180 
    // All counters and indices below start at zero.
181  std::size_t currentCorpus{};
182  std::size_t dateCounter{};
183  std::size_t firstDatePos{};
184  std::size_t dateMapSize{};
185  std::size_t articleIndex{};
186  std::size_t tokenIndex{};
187  std::size_t processedDates{};
188 
189  bool dateSaved{false};
190 
191  // algorithm functions
192  void addCurrent();
193  void saveAssociations();
    // Returns the results by value; the return value must not be discarded.
194  [[nodiscard]] std::vector<Result> processDates();
195  void saveResults(const std::vector<Result>& results);
196 
197  // query functions
198  void initQueries() override;
199  void deleteQueries() override;
200 
201  // internal helper functions
    // Parameters ending in "To" are non-const references, i.e. outputs
    // that the helper writes into.
202  void addArticlesForDate(
203  const TextMapEntry& date,
204  DateAssociationMap::iterator& dateIt,
205  const TextMap& articleMap,
206  const std::vector<std::string>& tokens,
207  std::queue<std::string>& warningsTo
208  );
209  DateAssociationMap::iterator addDate(const std::string& date);
210  ArticleAssociationMap::iterator addArticleToDate(
211  const std::string& article,
212  DateAssociationMap::iterator date
213  );
214  void processToken(
215  const std::string& token,
216  Associations& associationsTo,
217  std::queue<std::string>& warningsTo
218  );
219  void processDate(
220  const DateAssociation& date,
221  std::vector<Result>& resultsTo
222  );
223  void processArticle(
224  const std::string& date,
225  const ArticleAssociation& article,
226  std::vector<Result>& resultsTo
227  );
    // NOTE(review): occurrencesTo is std::size_t&, while Result::nOccurences
    // is std::uint64_t. These are not the same type on every platform, so
    // the counter cannot be bound directly everywhere; confirm the call site.
228  void processTermOccurrence(
229  const ArticleAssociation& article,
230  std::uint64_t occurrence,
231  std::size_t& occurrencesTo,
232  std::vector<std::uint64_t>& catsCountersTo
233  );
234  void processCategory(
235  const ArticleAssociation& article,
236  std::uint64_t termOccurrence,
237  std::size_t index,
238  std::vector<std::uint64_t>& catsCountersTo
239  );
    // Const member: it does not modify the algorithm object itself, only
    // the counters passed in.
    // NOTE(review): the meaning of the returned bool is not visible here;
    // confirm in Assoc.cpp.
240  bool processCategoryOccurrence(
241  std::uint64_t termOccurrence,
242  std::uint64_t catOccurrence,
243  std::size_t catIndex,
244  std::vector<std::uint64_t>& catsCountersTo
245  ) const;
246 
247  void insertDataSet(
248  const std::string& table,
249  const Result& dataSet,
250  std::size_t numColumns
251  );
252  };
253 
254 } /* namespace crawlservpp::Module::Analyzer::Algo */
255 
256 #endif /* MODULE_ANALYZER_ALGO_ASSOC_HPP_ */
void onAlgoInit() override
Generates the corpus.
Definition: Assoc.cpp:125
constexpr auto assocMinColumns
Minimum number of columns included in a dataset (including date).
Definition: Assoc.hpp:74
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: Assoc.cpp:204
Namespace for algorithm classes.
Definition: All.cpp:52
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
Text map entry.
Definition: TextMap.hpp:49
Algorithm counting associations between the keyword and different categories per article.
Definition: Assoc.hpp:87
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
void onAlgoUnpause() override
Does nothing.
Definition: Assoc.cpp:194
void onAlgoClear() override
Does nothing.
Definition: Assoc.cpp:197
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: Assoc.cpp:89
std::string_view getName() const override
Returns the name of the algorithm.
Definition: Assoc.cpp:75
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
constexpr auto assocAddColumns
Number of extra columns included in a dataset (except date).
Definition: Assoc.hpp:71
void checkAlgoOptions() override
Checks the configuration options for the algorithm.
Definition: Assoc.cpp:220
void onAlgoPause() override
Does nothing.
Definition: Assoc.cpp:191
void resetAlgo() override
Resets the algorithm.
Definition: Assoc.cpp:299
Assoc(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: Assoc.cpp:44
void onAlgoTick() override
Calculates the associations in the text corpus.
Definition: Assoc.cpp:177
constexpr auto assocUpdateProgressEvery
Indicates, while saving, after how many articles the progress of the thread will be updated...
Definition: Assoc.hpp:68