crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
SentimentOverTime.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2022 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * SentimentOverTime.hpp
24  *
25  * Calculate the average sentiment over time
26  * associated with specific categories
27  * using the VADER algorithm.
28  *
29  * If you use it, please cite:
30  *
31  * Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
32  * Sentiment Analysis of Social Media Text. Eighth International Conference on
33  * Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
34  *
35  * !!! FOR ENGLISH LANGUAGE ONLY !!!
36  *
37  * Created on: Dec 30, 2020
38  * Author: ans
39  */
40 
41 #ifndef MODULE_ANALYZER_ALGO_SENTIMENTOVERTIME_HPP_
42 #define MODULE_ANALYZER_ALGO_SENTIMENTOVERTIME_HPP_
43 
44 #include "../Thread.hpp"
45 
46 #include "../../../Data/Data.hpp"
47 #include "../../../Data/Dictionary.hpp"
48 #include "../../../Data/Sentiment.hpp"
49 #include "../../../Helper/DateTime.hpp"
50 #include "../../../Helper/FileSystem.hpp"
51 #include "../../../Helper/Memory.hpp"
52 #include "../../../Main/Database.hpp"
53 #include "../../../Struct/QueryStruct.hpp"
54 #include "../../../Struct/StatusSetter.hpp"
55 #include "../../../Struct/TextMap.hpp"
56 #include "../../../Struct/ThreadOptions.hpp"
57 #include "../../../Struct/ThreadStatus.hpp"
58 
59 #include <algorithm> // std::all_of, std::find_if, std::min
60 #include <cmath> // std::fabs, std::round
61 #include <cstddef> // std::size_t
62 #include <cstdint> // std::uint8_t, std::uint64_t
63 #include <limits> // std::numeric_limits
64 #include <map> // std::map
65 #include <memory> // std::make_unique, std::unique_ptr
66 #include <queue> // std::queue
67 #include <string> // std::string, std::to_string
68 #include <string_view> // std::string_view, std::string_view_literals
69 #include <unordered_map> // std::unordered_map
70 #include <unordered_set> // std::unordered_set
71 #include <utility> // std::pair
72 #include <vector> // std::vector
73 
75 
76  using std::string_view_literals::operator""sv;
77 
78  /*
79  * CONSTANTS
80  */
81 
84 
86  inline constexpr auto sentimentUpdateCalculateProgressEvery{250000}; // while calculating: number of sentences after which the thread's progress is updated
87 
89  inline constexpr auto sentimentUpdateSavingProgressEvery{10}; // while saving: number of rows after which the thread's progress is updated
90 
92  inline constexpr auto sentimentMinNumColumns{1}; // number of default columns to be written to the target table
93 
95  inline constexpr auto sentimentMinColumnsPerCategory{2}; // number of columns per category if article-based sentiment is deactivated
96 
98  inline constexpr auto sentimentArticleColumnsPerCategory{4}; // number of columns per category if article-based sentiment is activated
99 
101  inline constexpr auto sentimentDefaultThreshold{10}; // default threshold (sentiments lower than this number will be ignored)
102 
104  inline constexpr auto sentimentDictionary{"sentiment-en"sv}; // default sentiment dictionary to be used
105 
107  inline constexpr auto sentimentEmojis{"emojis-en"sv}; // default emoji dictionary to be used
108 
110  inline constexpr auto sentimentPercentageFactor{100.F}; // factor to convert a value to a percentage
111 
113 
114  /*
115  * DECLARATION
116  */
117 
119 
134  // internal structure temporarily holding the data accumulated for one specific date and one specific category
135  struct DateCategoryData {
136  // running sum of all sentence-based sentiment scores (zero-initialized)
137  double sentimentSum{};
138 
139  // number of sentence-based sentiment scores added to the sum (zero-initialized)
140  std::uint64_t sentimentCount{};
141 
142  // (if needed) articles associated with this date and containing this category; the set keeps each article string only once
143  std::unordered_set<std::string> articles; // NOTE(review): presumably only filled if article-based sentiment is activated -- confirm in SentimentOverTime.cpp
144  };
145 
146  // for convenience
148 
151  using TextMap = Struct::TextMap;
155 
156  using DoubleUInt = std::pair<double, std::uint64_t>;
157  using StringString = std::pair<std::string, std::string>;
158 
159  using ArticleData = std::unordered_map<std::string, DoubleUInt>;
160  using DateData = std::map<std::string, std::vector<DateCategoryData>>;
161 
162  public:
165 
167  Main::Database& dbBase,
168  const ThreadOptions& threadOptions,
169  const ThreadStatus& threadStatus
170  );
172  Main::Database& dbBase,
173  const ThreadOptions& threadOptions
174  );
175 
179 
180  std::string_view getName() const override; // returns the name of the algorithm
181 
185 
186  void onAlgoInitTarget() override; // initializes the target table for the algorithm
187  void onAlgoInit() override; // generates the corpus
188  void onAlgoTick() override; // calculates the sentence-based sentiment scores in the text corpus
189  void onAlgoPause() override; // does nothing
190  void onAlgoUnpause() override; // does nothing
191  void onAlgoClear() override; // does nothing
192 
196 
197  void parseAlgoOption() override; // parses a configuration option for the algorithm
198  void checkAlgoOptions() override; // checks the configuration options for the algorithm
199  void resetAlgo() override; // resets the algorithm
200 
202 
203  private:
204  // algorithm options (all configuration entries of the algorithm, bundled in the member 'algoConfig')
205  struct Entries {
206  std::vector<std::string> categoryLabels; // NOTE(review): presumably one label per category -- confirm in parseAlgoOption()
207  std::vector<std::uint64_t> categoryQueries; // NOTE(review): presumably the ID of the query identifying each category -- confirm in parseAlgoOption()
208  bool combineSources{true}; // enabled by default; NOTE(review): exact effect not visible in this header -- confirm in SentimentOverTime.cpp
209  std::uint8_t threshold{sentimentDefaultThreshold}; // defaults to sentimentDefaultThreshold; NOTE(review): presumably only applied if useThreshold is set -- confirm
210  bool addArticleSentiment{false}; // disabled by default; NOTE(review): presumably activates the article-based sentiment (and its additional columns) -- confirm
211  bool ignoreEmptyDate{true}; // enabled by default; NOTE(review): presumably skips content without a date -- confirm
212  bool useThreshold{false}; // disabled by default
213  std::string dictionary{sentimentDictionary}; // sentiment dictionary, defaults to sentimentDictionary
214  std::string emojis{sentimentEmojis}; // emoji dictionary, defaults to sentimentEmojis
215  } algoConfig;
216 
217  // sentiment analyzer
218  std::unique_ptr<Data::Sentiment> sentimentAnalyzer;
219 
220  // algorithm queries
221  std::vector<QueryStruct> queriesCategories;
222 
223  // algorithm state
224  DateData dateData;
225  ArticleData articleData;
226 
227  std::string previousDate;
228 
229  std::size_t currentCorpus{};
230 
231  // algorithm functions
232  void addCurrent();
233  void saveSentiments();
234 
235  // query functions
236  void initQueries() override;
237  void deleteQueries() override;
238 
239  // internal helper functions
240  [[nodiscard]] DateData::iterator addDate(const std::string& date);
241  void processSentence(
242  const std::vector<std::string>& tokens,
243  const std::pair<std::size_t, std::size_t>& sentence,
244  const DateData::iterator& dateIt,
245  const std::string& article
246  );
247  [[nodiscard]] float getSentenceScore(
248  const std::pair<std::size_t, std::size_t>& sentence,
249  const std::vector<std::string>& tokens
250  );
251  [[nodiscard]] DoubleUInt calculateArticleSentiment(
252  const std::unordered_set<std::string>& articles
253  );
254  [[nodiscard]] DoubleUInt calculateArticle(
255  const std::string& article
256  );
257  void fillGap(const std::string& table, const std::string& date, std::size_t numColumns);
258  void insertDataSet(
259  const std::string& table,
260  const std::string& date,
261  const std::vector<DateCategoryData>& dataSet,
262  std::size_t numColumns
263  );
264 
265  // internal static helper functions
266  static bool selectFirst(
267  const TextMap& map,
268  std::size_t& numberTo
269  );
270  static bool identifyCurrent(
271  std::size_t sentenceBegin,
272  std::size_t& numberFromTo,
273  const TextMap& map,
274  bool& isLastFromTo
275  );
276  [[nodiscard]] static bool meetsThreshold(
277  float sentiment,
278  std::uint8_t threshold
279  );
280  };
281 
282 } /* namespace crawlservpp::Module::Analyzer::Algo */
283 
284 #endif /* MODULE_ANALYZER_ALGO_SENTIMENTOVERTIME_HPP_ */
void onAlgoUnpause() override
Does nothing.
Definition: SentimentOverTime.cpp:237
void onAlgoPause() override
Does nothing.
Definition: SentimentOverTime.cpp:234
Namespace for algorithm classes.
Definition: All.cpp:52
SentimentOverTime(Main::Database &dbBase, const ThreadOptions &threadOptions, const ThreadStatus &threadStatus)
Continues a previously interrupted algorithm run.
Definition: SentimentOverTime.cpp:53
constexpr auto sentimentEmojis
The default emoji dictionary to be used.
Definition: SentimentOverTime.hpp:107
void checkAlgoOptions() override
Checks the configuration options for the algorithm.
Definition: SentimentOverTime.cpp:266
Abstract class providing thread functionality to algorithm (child) classes.
Definition: Thread.hpp:84
Thread status containing its ID, status message, pause state, and progress.
Definition: ThreadStatus.hpp:54
constexpr auto sentimentArticleColumnsPerCategory
Number of columns per category if article-based sentiment is activated.
Definition: SentimentOverTime.hpp:98
constexpr auto sentimentDictionary
The default sentiment dictionary to be used.
Definition: SentimentOverTime.hpp:104
Text map entry.
Definition: TextMap.hpp:49
void onAlgoInit() override
Generates the corpus.
Definition: SentimentOverTime.cpp:148
constexpr auto sentimentUpdateCalculateProgressEvery
Indicates, while calculating, after how many sentences the progress of the thread will be updated.
Definition: SentimentOverTime.hpp:86
Thread options containing the name of the module run, as well as the IDs of the website, URL list, and configuration used.
Definition: ThreadOptions.hpp:40
Class handling database access for the command-and-control and its threads.
Definition: Database.hpp:366
Class for analyzer exceptions to be used by algorithms.
Definition: Thread.hpp:242
constexpr auto sentimentMinNumColumns
Number of default columns to be written to the target table.
Definition: SentimentOverTime.hpp:92
Structure containing all the data needed to keep the status of a thread updated.
Definition: StatusSetter.hpp:57
std::vector< TextMapEntry > TextMap
A text map is defined as a vector of text map entries.
Definition: TextMap.hpp:280
void onAlgoClear() override
Does nothing.
Definition: SentimentOverTime.cpp:240
constexpr auto sentimentMinColumnsPerCategory
Number of columns per category if article-based sentiment is deactivated.
Definition: SentimentOverTime.hpp:95
constexpr auto sentimentPercentageFactor
Factor to convert value to percentage.
Definition: SentimentOverTime.hpp:110
Sentiment analysis using the VADER algorithm.
Definition: SentimentOverTime.hpp:133
constexpr auto sentimentDefaultThreshold
The default threshold (sentiments lower than that number will be ignored).
Definition: SentimentOverTime.hpp:101
Structure to identify a query including its type and result type(s).
Definition: QueryStruct.hpp:40
constexpr auto sentimentUpdateSavingProgressEvery
Indicates, while saving, after how many rows the progress of the thread will be updated.
Definition: SentimentOverTime.hpp:89
std::string_view getName() const override
Returns the name of the algorithm.
Definition: SentimentOverTime.cpp:84
void parseAlgoOption() override
Parses a configuration option for the algorithm.
Definition: SentimentOverTime.cpp:247
void onAlgoTick() override
Calculates the sentence-based sentiment scores in the text corpus.
Definition: SentimentOverTime.cpp:219
void onAlgoInitTarget() override
Initializes the target table for the algorithm.
Definition: SentimentOverTime.cpp:100
void resetAlgo() override
Resets the algorithm.
Definition: SentimentOverTime.cpp:335