crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Sentiment.hpp
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2020 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Sentiment.hpp
24  *
25  * Port of VADER sentiment analysis from Python to C++.
26  *
27  * Original: https://github.com/cjhutto/vaderSentiment/
28  *
29  * If you use the VADER sentiment analysis tools, please cite:
30  *
31  * Hutto, C.J. & Gilbert, E.E. (2014). VADER: A Parsimonious Rule-based Model for
32  * Sentiment Analysis of Social Media Text. Eighth International Conference on
33  * Weblogs and Social Media (ICWSM-14). Ann Arbor, MI, June 2014.
34  *
35  * !!! FOR ENGLISH LANGUAGE ONLY !!!
36  *
37  * Created on: Dec 29, 2020
38  * Author: ans
39  */
40 
41 #ifndef DATA_SENTIMENT_HPP_
42 #define DATA_SENTIMENT_HPP_
43 
44 #include <algorithm> // std::all_of, std::find, std::find_if, std::transform
45 #include <array> // std::array
46 #include <cctype> // std::iscntrl, std::ispunct, std::isupper, std::tolower
47 #include <cmath> // std::fabs, std::sqrt
48 #include <cstddef> // std::size_t
49 #include <cstdint> // std::uint8_t, std::uint64_t
50 #include <fstream> // std::ifstream
51 #include <iterator> // std::back_inserter
52 #include <limits> // std::numeric_limits
53 #include <numeric> // std::accumulate
54 #include <stdexcept> // std::runtime_error
55 #include <string> // std::getline, std::stof, std::string
56 #include <string_view> // std::string_view
57 #include <unordered_map> // std::unordered_map
58 #include <unordered_set> // std::unordered_set
59 #include <utility> // std::pair
60 #include <vector> // std::vector
61 
62 namespace crawlservpp::Data {
63 
64  /*
65  * CONSTANTS
66  */
67 
70 
71  //! Zero.
72  inline constexpr auto VaderZero{0};
73 
74  //! One.
75  inline constexpr auto VaderOne{1};
76 
77  //! Two.
78  inline constexpr auto VaderTwo{2};
79 
80  //! Three.
81  inline constexpr auto VaderThree{3};
82 
83  //! Four.
84  inline constexpr auto VaderFour{4};
85 
86  //! Factor of One.
87  inline constexpr auto VaderFOne{1.F};
88 
89  //! Factor by which the scalar modifier of immediately preceding tokens is dampened.
90  inline constexpr auto VaderDampOne{0.95F};
91 
92  //! Factor by which the scalar modifier of previously preceding tokens is dampened.
93  inline constexpr auto VaderDampTwo{0.9F};
94 
95  //! Factor by which the modifier is dampened before a "but".
96  inline constexpr auto VaderButFactorBefore{0.5F};
97 
98  //! Factor by which the modifier is heightened after a "but".
99  inline constexpr auto VaderButFactorAfter{1.5F};
100 
101  //! Factor by which the modifier is heightened after a "never".
102  inline constexpr auto VaderNeverFactor{1.25F};
103 
104  /*
106  inline constexpr auto VaderEPFactor{0.292F};
107 
109  inline constexpr auto VaderQMFactor{0.18F};
110 
112  inline constexpr auto VaderQMFactorMax{0.96F};
113  */
114 
115  //! Empirically derived mean sentiment intensity rating increase for booster tokens.
116  inline constexpr auto VaderB_INCR{0.293F};
117 
118  //! Empirically derived mean sentiment intensity rating decrease for negative booster tokens.
119  inline constexpr auto VaderB_DECR{-0.293F};
120 
121  //! Empirically derived mean sentiment intensity rating increase for using ALLCAPs to emphasize a token.
122  inline constexpr auto VaderC_INCR{0.733F};
123 
124  //! Negation factor.
125  inline constexpr auto VaderN_SCALAR{-0.74F};
126 
128 
129  /*
130  * DECLARATION
131  */
132 
133  //! Structure for VADER sentiment scores.
134  struct SentimentScores {
136 
137  //! Positive sentiment.
149  float positive{};
150 
152 
153  //! Neutral sentiment.
165  float neutral{};
166 
168 
169  //! Negative sentiment.
181  float negative{};
182 
184 
185  //! Compound score.
199  float compound{};
200  };
201 
203 
204  //! Implementation of the VADER sentiment analysis algorithm.
212  class Sentiment {
213  // for convenience
214  using Tokens = std::vector<std::string>;
215 
216  /*
217  * CONSTANTS
218  */
219 
220  //TODO: use constexpr set
221  inline static const std::unordered_set<std::string_view> NEGATE{
222  "aint",
223  "arent",
224  "cannot",
225  "cant",
226  "couldnt",
227  "darent",
228  "didnt",
229  "doesnt",
230  "ain't",
231  "aren't",
232  "can't",
233  "couldn't",
234  "daren't",
235  "didn't",
236  "doesn't",
237  "dont",
238  "hadnt",
239  "hasnt",
240  "havent",
241  "isnt",
242  "mightnt",
243  "mustnt",
244  "neither",
245  "don't",
246  "hadn't",
247  "hasn't",
248  "haven't",
249  "isn't",
250  "mightn't",
251  "mustn't",
252  "neednt",
253  "needn't",
254  "never",
255  "none",
256  "nope",
257  "nor",
258  "not",
259  "nothing",
260  "nowhere",
261  "oughtnt",
262  "shant",
263  "shouldnt",
264  "uhuh",
265  "wasnt",
266  "werent",
267  "oughtn't",
268  "shan't",
269  "shouldn't",
270  "uh-uh",
271  "wasn't",
272  "weren't",
273  "without",
274  "wont",
275  "wouldnt",
276  "won't",
277  "wouldn't",
278  "rarely",
279  "seldom",
280  "despite"
281  };
282 
283  // booster/dampener 'intensifiers' or 'degree adverbs'
284  // http://en.wiktionary.org/wiki/Category:English_degree_adverbs
285  //TODO: use constexpr map
286  inline static const std::unordered_map<std::string_view, float> BOOSTER_DICT{
287  { "absolutely", VaderB_INCR },
288  { "amazingly", VaderB_INCR },
289  { "awfully", VaderB_INCR },
290  { "completely", VaderB_INCR },
291  { "considerable", VaderB_INCR },
292  { "considerably", VaderB_INCR },
293  { "decidedly", VaderB_INCR },
294  { "deeply", VaderB_INCR },
295  { "effing", VaderB_INCR },
296  { "enormous", VaderB_INCR },
297  { "enormously", VaderB_INCR },
298  { "entirely", VaderB_INCR },
299  { "especially", VaderB_INCR },
300  { "exceptional", VaderB_INCR },
301  { "exceptionally", VaderB_INCR },
302  { "extreme", VaderB_INCR },
303  { "extremely", VaderB_INCR },
304  { "fabulously", VaderB_INCR },
305  { "flipping", VaderB_INCR },
306  { "flippin", VaderB_INCR },
307  { "frackin", VaderB_INCR },
308  { "fracking", VaderB_INCR },
309  { "fricking", VaderB_INCR },
310  { "frickin", VaderB_INCR },
311  { "frigging", VaderB_INCR },
312  { "friggin", VaderB_INCR },
313  { "fully", VaderB_INCR },
314  { "fuckin", VaderB_INCR },
315  { "fucking", VaderB_INCR },
316  { "fuggin", VaderB_INCR },
317  { "fugging", VaderB_INCR },
318  { "greatly", VaderB_INCR },
319  { "hella", VaderB_INCR },
320  { "highly", VaderB_INCR },
321  { "hugely", VaderB_INCR },
322  { "incredible", VaderB_INCR },
323  { "incredibly", VaderB_INCR },
324  { "intensely", VaderB_INCR },
325  { "major", VaderB_INCR },
326  { "majorly", VaderB_INCR },
327  { "more", VaderB_INCR },
328  { "most", VaderB_INCR },
329  { "particularly", VaderB_INCR },
330  { "purely", VaderB_INCR },
331  { "quite", VaderB_INCR },
332  { "really", VaderB_INCR },
333  { "remarkably", VaderB_INCR },
334  { "so", VaderB_INCR },
335  { "substantially", VaderB_INCR },
336  { "thoroughly", VaderB_INCR },
337  { "total", VaderB_INCR },
338  { "totally", VaderB_INCR },
339  { "tremendous", VaderB_INCR },
340  { "tremendously", VaderB_INCR },
341  { "uber", VaderB_INCR },
342  { "unbelievably", VaderB_INCR },
343  { "unusually", VaderB_INCR },
344  { "utter", VaderB_INCR },
345  { "utterly", VaderB_INCR },
346  { "very", VaderB_INCR },
347  { "almost", VaderB_DECR },
348  { "barely", VaderB_DECR },
349  { "hardly", VaderB_DECR },
350  { "just enough", VaderB_DECR },
351  { "kind of", VaderB_DECR },
352  { "kinda", VaderB_DECR },
353  { "kindof", VaderB_DECR },
354  { "kind-of", VaderB_DECR },
355  { "less", VaderB_DECR },
356  { "little", VaderB_DECR },
357  { "marginal", VaderB_DECR },
358  { "marginally", VaderB_DECR },
359  { "occasional", VaderB_DECR },
360  { "occasionally", VaderB_DECR },
361  { "partly", VaderB_DECR },
362  { "scarce", VaderB_DECR },
363  { "scarcely", VaderB_DECR },
364  { "slight", VaderB_DECR },
365  { "slightly", VaderB_DECR },
366  { "somewhat", VaderB_DECR },
367  { "sort of", VaderB_DECR },
368  { "sorta", VaderB_DECR },
369  { "sortof", VaderB_DECR },
370  { "sort-of", VaderB_DECR}
371  };
372 
373  // check for special case idioms and phrases containing lexicon tokens
374  //TODO: use constexpr map
375  inline static const std::unordered_map<std::string_view, float> SPECIAL_CASES{
376  { "the shit", 3.F },
377  { "the bomb", 3.F },
378  { "bad ass", 1.5F },
379  { "badass", 1.5F },
380  { "bus stop", 0.F },
381  { "yeah right", -2.F },
382  { "kiss of death", -1.5F },
383  { "to die for", 3.F },
384  { "beating heart", 3.1F },
385  { "broken heart", -2.9F }
386  };
387 
388  public:
391 
392  Sentiment(const std::string& dictionaryFile, const std::string& emojiFile);
393 
397 
398  [[nodiscard]] std::size_t getDictSize() const;
399  [[nodiscard]] std::size_t getEmojiNum() const;
400 
404 
405  [[nodiscard]] SentimentScores analyze(const Tokens& tokens);
406 
408 
409  private:
410  // dictionaries
411  std::unordered_map<std::string, float> dictMap;
412  std::unordered_map<std::string, std::string> emojiMap;
413 
414  // internal helper functions
415  void sentimentValence(
416  float& valence,
417  const Tokens& tokens,
418  const Tokens& tokensLower,
419  std::size_t index,
420  std::vector<float>& sentiments,
421  bool isCapDifference
422  );
423  static SentimentScores scoreValence(const std::vector<float>& sentiments, const Tokens& /*tokens*/);
424  void leastCheck(float& valence, const Tokens& tokensLower, std::size_t index) const;
425 
426  // internal static helper functions
427  [[nodiscard]] static std::vector<std::string> toLower(const Tokens& tokens);
428  [[nodiscard]] static bool isNegated(const std::string& tokenLower);
429  [[nodiscard]] static bool isNegated(const Tokens& tokensLower);
430  [[nodiscard]] static float normalize(float score);
431  [[nodiscard]] static bool isAllCaps(const std::string& token);
432  [[nodiscard]] static bool isAllCapDifferential(const Tokens& tokens);
433  [[nodiscard]] static float scalarIncDec(
434  const std::string& token,
435  const std::string& tokenLower,
436  float valence,
437  bool isCapDiff
438  );
439  /*
440  * (the following functions are not used, because punctuation is removed by the tokenizer)
441  *
442  [[nodiscard]] static float punctuationEmphasis(const Tokens& tokens);
443  [[nodiscard]] static float amplifyEP(const Tokens& tokens);
444  [[nodiscard]] static float amplifyQM(const Tokens& tokens);
445  */
446 
447  static void butCheck(const Tokens& tokensLower, std::vector<float>& sentiments);
448  static void negationCheck(float& valence, const Tokens& tokensLower, std::uint8_t startIndex, std::size_t index);
449  static void specialIdiomsCheck(float& valence, const Tokens& tokensLower, std::size_t index);
450  static void siftSentimentScores(
451  const std::vector<float>& sentiments,
452  float& positiveSumTo,
453  float& negativeSumTo,
454  std::size_t& neutralCountTo
455  );
456  };
457 
458  /*
459  * IMPLEMENTATION
460  */
461 
462  /*
463  * CONSTRUCTION
464  */
465 
467 
468  //! Constructor.
477  inline Sentiment::Sentiment(const std::string& dictionaryFile, const std::string& emojiFile) {
478  std::ifstream dictIn(dictionaryFile.c_str());
479  std::string line;
480 
481  if(!dictIn.is_open()) {
482  throw std::runtime_error("Could not open dictionary file: '" + dictionaryFile + "'");
483  }
484 
485  while(std::getline(dictIn, line)) {
486  const auto firstTab{line.find('\t')};
487 
488  if(firstTab != std::string::npos) {
489  const auto term{line.substr(0, firstTab)};
490  const auto secondTab{line.find('\t', firstTab + 1)};
491  const auto value{std::stof(line.substr(firstTab + 1, secondTab - firstTab - 1))};
492 
493  this->dictMap.emplace_hint(this->dictMap.end(), term, value);
494  }
495  }
496 
497  dictIn.close();
498 
499  std::ifstream emojiIn(emojiFile.c_str());
500 
501  if(!emojiIn.is_open()) {
502  throw std::runtime_error("Could not open emoji file: '" + emojiFile + "'");
503  }
504 
505  while(std::getline(emojiIn, line)) {
506  const auto tab{line.find('\t')};
507 
508  if(tab != std::string::npos) {
509  const auto emoji{line.substr(0, tab)};
510  const auto value{line.substr(tab + 1)};
511 
512  this->emojiMap.emplace_hint(this->emojiMap.end(), emoji, value);
513  }
514  }
515 
516  emojiIn.close();
517  }
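 /*
  * NOTE (editorial illustration, not part of the original header): judging only from the
  * parsing logic above, both input files are plain-text and tab-separated, one entry per
  * line. A dictionary line is read as "<term><TAB><value>[<TAB><ignored columns>...]",
  * an emoji line as "<emoji><TAB><replacement text>", where the replacement text is later
  * split on spaces into separate tokens by analyze(). Hypothetical example lines
  * (placeholders, not taken from the actual data files):
  *
  *   someword<TAB>1.5<TAB>0.5<TAB>[1, 2, 1, 2]
  *   someemoji<TAB>grinning face
  */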
518 
519  /*
520  * GETTERS
521  */
522 
524 
525  //! Gets the number of dictionary entries.
527  inline std::size_t Sentiment::getDictSize() const {
528  return this->dictMap.size();
529  }
530 
532 
533  //! Gets the number of entries in the emoji dictionary.
535  inline std::size_t Sentiment::getEmojiNum() const {
536  return this->emojiMap.size();
537  }
538 
539  /*
540  * SENTIMENT ANALYSIS
541  */
542 
544 
545  //! Get the sentiment strength in the given sentence.
553  inline SentimentScores Sentiment::analyze(const Tokens& tokens) {
554  const bool isCapDifference{
555  Sentiment::isAllCapDifferential(tokens)
556  };
557 
558  // replace emojis
559  std::vector<std::string> newTokens;
560 
561  // copy trimmed string
562  newTokens.reserve(tokens.size());
563 
564  for(const auto& token : tokens) {
565  std::string tokenCopy;
566 
567  std::size_t beg{};
568 
569  for(; beg < token.length(); ++beg) {
570  const auto c{static_cast<unsigned char>(token[beg])};
571 
572  if(std::ispunct(c) == 0 && std::iscntrl(c) == 0 && c != ' ') {
573  break;
574  }
575  }
576 
577  std::size_t len{token.length() - beg};
578 
579  while(len > 0) {
580  const auto c{static_cast<unsigned char>(token[beg + len - VaderOne])};
581 
582  if(std::ispunct(c) == 0 && std::iscntrl(c) == 0 && c != ' ') {
583  break;
584  }
585 
586  --len;
587  }
588 
589  tokenCopy = token.substr(beg, len);
590 
591  const auto it = this->emojiMap.find(tokenCopy);
592 
593  if(it != this->emojiMap.end()) {
594  std::string emojiToken;
595 
596  emojiToken.reserve(it->second.length());
597 
598  for(auto c : it->second) {
599  if(c == ' ') {
600  if(!emojiToken.empty()) {
601  newTokens.emplace_back(emojiToken);
602 
603  emojiToken.clear();
604  }
605  }
606  else {
607  emojiToken.push_back(c);
608  }
609  }
610 
611  if(!emojiToken.empty()) {
612  newTokens.emplace_back(emojiToken);
613  }
614  }
615  else {
616  newTokens.emplace_back(tokenCopy);
617  }
618  }
619 
620  // create copy with lower-case tokens
621  std::vector<std::string> tokensLower{Sentiment::toLower(newTokens)};
622 
623  // calculate sentiments
624  std::vector<float> sentiments;
625 
626  sentiments.reserve(newTokens.size());
627 
628  for(std::size_t index{}; index < newTokens.size(); ++index) {
629  float valence{};
630 
631  if(std::find_if(BOOSTER_DICT.begin(), BOOSTER_DICT.end(), [&tokensLower, &index](const auto& booster) {
632  return booster.first == tokensLower[index];
633  }) != BOOSTER_DICT.end()) {
634  sentiments.push_back(valence);
635 
636  continue;
637  }
638 
639  if(index < newTokens.size() - 1 && tokensLower[index] == "kind" && tokensLower[index + 1] == "of") {
640  sentiments.push_back(valence);
641 
642  continue;
643  }
644 
645  this->sentimentValence(valence, newTokens, tokensLower, index, sentiments, isCapDifference);
646  }
647 
648  Sentiment::butCheck(tokensLower, sentiments);
649 
650  return Sentiment::scoreValence(sentiments, newTokens);
651  }
652 
653  /*
654  * INTERNAL HELPER FUNCTIONS (private)
655  */
656 
657  // calculate sentiment valence
658  inline void Sentiment::sentimentValence(
659  float& valence,
660  const Tokens& tokens,
661  const Tokens& tokensLower,
662  std::size_t index,
663  std::vector<float>& sentiments,
664  bool isCapDifference
665  ) {
666  // get the sentiment valence
667  const auto it{this->dictMap.find(tokensLower[index])};
668 
669  if(it != this->dictMap.end()) {
670  valence = it->second;
671 
672  // check for "no" as negation for an adjacent lexicon item vs "no" as its own stand-alone lexicon item
673  if(
674  tokensLower[index] == "no"
675  && index < tokens.size() - VaderOne
676  && this->dictMap.find(tokensLower[index + VaderOne]) != this->dictMap.end()
677  ) {
678  // don't use valence of "no" as a lexicon item. Instead set its valence to 0.0 and negate the next item
679  valence = 0.F;
680  }
681 
682  if(
683  (index > 0 && tokensLower[index - VaderOne] == "no")
684  || (index > 1 && tokensLower[index - VaderTwo] == "no")
685  || (
686  index > 2
687  && tokensLower[index - VaderThree] == "no"
688  && (
689  tokensLower[index - VaderOne] == "or"
690  || tokensLower[index - VaderOne] == "nor"
691  )
692  )
693  ) {
694  valence = it->second;
695  }
696 
697  // check if sentiment-laden token is in ALL CAPS (while others aren't)
698  if(Sentiment::isAllCaps(tokens[index]) && isCapDifference) {
699  if(valence > 0.F) {
700  valence += VaderC_INCR;
701  }
702  else {
703  valence -= VaderC_INCR;
704  }
705  }
706 
707  for(std::uint8_t startIndex{}; startIndex < VaderThree; ++startIndex) {
708  // dampen the scalar modifier of preceding tokens and emoticons
709  // (excluding the ones that immediately precede the item) based
710  // on their distance from the current item.
711  if(index > startIndex) {
712  const auto& precToken{tokens[index - (startIndex + VaderOne)]};
713  const auto& precTokenLower{tokensLower[index - (startIndex + VaderOne)]};
714 
715  if(this->dictMap.find(precTokenLower) == this->dictMap.end()) {
716  float s{
717  Sentiment::scalarIncDec(
718  precToken,
719  precTokenLower,
720  valence,
721  isCapDifference
722  )
723  };
724 
725  if(std::fabs(s) > std::numeric_limits<float>::epsilon()) {
726  if(startIndex == VaderOne) {
727  s *= VaderDampOne;
728  }
729  else if(startIndex == VaderTwo) {
730  s *= VaderDampTwo;
731  }
732  }
733 
734  valence += s;
735 
736  Sentiment::negationCheck(valence, tokensLower, startIndex, index);
737 
738  if(startIndex == VaderTwo) {
739  Sentiment::specialIdiomsCheck(valence, tokensLower, index);
740  }
741  }
742  }
743  }
744 
745  this->leastCheck(valence, tokensLower, index);
746  }
747 
748  sentiments.push_back(valence);
749  }
750 
751  // calculate valence score
752  inline SentimentScores Sentiment::scoreValence(
753  const std::vector<float>& sentiments,
754  const Tokens& /*tokens*/
755  ) {
756  if(sentiments.empty()) {
757  return SentimentScores{};
758  }
759 
760  auto sum{std::accumulate(sentiments.begin(), sentiments.end(), 0.F)};
761 
762  /*
763  * (the following code is not used, because punctuation is removed by the tokenizer)
764  *
765  const auto punctEmphAmp{Sentiment::punctuationEmphasis(tokens)};
766 
767  // compute and add emphasis from punctuation in text
768  if(sum > 0.F) {
769  sum += punctEmphAmp;
770  }
771  else if(sum < 0.F) {
772  sum -= punctEmphAmp;
773  }
774  */
775 
776  SentimentScores result;
777  std::size_t neuCount{};
778 
779  result.compound = Sentiment::normalize(sum);
780 
781  Sentiment::siftSentimentScores(sentiments, result.positive, result.negative, neuCount);
782 
783  /*
784  * (the following code is not used, because punctuation is removed by the tokenizer)
785  *
786  if(result.positive > std::fabs(result.negative)) {
787  result.positive += punctEmphAmp;
788  }
789  else if(result.positive < std::fabs(result.negative)) {
790  result.negative -= punctEmphAmp;
791  }
792  */
793 
794  const auto total{result.positive + std::fabs(result.negative) + static_cast<float>(neuCount)};
795 
796  result.positive = std::fabs(result.positive / total);
797  result.negative = std::fabs(result.negative / total);
798  result.neutral = std::fabs(static_cast<float>(neuCount) / total);
799 
800  return result;
801  }
802 
803  // check for negation case using "least"
804  inline void Sentiment::leastCheck(float& valence, const Tokens& tokensLower, std::size_t index) const {
805  if(
806  index > VaderOne
807  && this->dictMap.find(tokensLower[index - VaderOne]) == this->dictMap.end()
808  && tokensLower[index - VaderOne] == "least"
809  ) {
810  if(
811  tokensLower[index - VaderTwo] != "at"
812  && tokensLower[index - VaderTwo] != "very"
813  ) {
814  valence *= VaderN_SCALAR;
815  }
816  }
817  else if(
818  index > VaderZero
819  && this->dictMap.find(tokensLower[index - VaderOne]) == this->dictMap.end()
820  && tokensLower[index - VaderOne] == "least"
821  ) {
822  valence *= VaderN_SCALAR;
823  }
824  }
825 
826  /*
827  * INTERNAL STATIC HELPER FUNCTIONS (private)
828  */
829 
830  // Create lower-case copies of given tokens
831  inline std::vector<std::string> Sentiment::toLower(const Tokens& tokens) {
832  std::vector<std::string> tokensLower;
833 
834  tokensLower.reserve(tokens.size());
835 
836  std::transform(
837  tokens.cbegin(),
838  tokens.cend(),
839  std::back_inserter(tokensLower),
840  [](const auto& token) {
841  std::string tokenLower;
842 
843  tokenLower.reserve(token.size());
844 
845  std::transform(
846  token.cbegin(),
847  token.cend(),
848  std::back_inserter(tokenLower),
849  [](const char c) {
850  return static_cast<char>(std::tolower(static_cast<unsigned char>(c)));
851  }
852  );
853 
854  return tokenLower;
855  }
856  );
857 
858  return tokensLower;
859  }
860 
861  // Return whether a token is a negation token.
862  inline bool Sentiment::isNegated(const std::string& tokenLower) {
863  if(Sentiment::NEGATE.find(tokenLower) != Sentiment::NEGATE.end()) {
864  return true;
865  }
866 
867  if(tokenLower.find("n't") != std::string::npos) {
868  return true;
869  }
870 
871  return false;
872  }
873 
874  // Determine if input contains negation tokens (NOTE: strings in vector need to be lowercase!)
875  inline bool Sentiment::isNegated(const Tokens& tokensLower) {
876  for(const auto& tokenLower : tokensLower) {
877  if(Sentiment::isNegated(tokenLower)) {
878  return true;
879  }
880  }
881 
882  return false;
883  }
884 
885  // Normalize the score to be between -1 and 1 using an alpha that approximates the max expected value
886  inline float Sentiment::normalize(float score) {
887  constexpr auto alpha{15};
888 
889  const float normScore{score / std::sqrt((score * score) + alpha)};
890 
891  if(normScore < -1.F) {
892  return -1.F;
893  }
894 
895  if(normScore > 1.F) {
896  return 1.F;
897  }
898 
899  return normScore;
900  }
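 /*
  * NOTE (editorial illustration): with alpha = 15, normalize() maps a raw valence sum x to
  * x / sqrt(x * x + 15). A sum of +3.0, for example, becomes roughly 3 / 4.9 ≈ 0.61, and
  * the result only approaches +1 or -1 asymptotically for very large sums, which gives the
  * compound score its range.
  */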
901 
902  // Check whether a token is ALL CAPS
903  inline bool Sentiment::isAllCaps(const std::string& token) {
904  return std::all_of(token.begin(), token.end(), [](const char c) {
905  return std::isupper(static_cast<unsigned char>(c)) != 0;
906  });
907  }
908 
909  // Check whether just some tokens in the input are ALL CAPS,
910  // return false if ALL or NONE of the tokens are ALL CAPS
911  inline bool Sentiment::isAllCapDifferential(const Tokens& tokens) {
912  std::size_t allCapTokens{};
913 
914  for(const auto& token : tokens) {
915  if(Sentiment::isAllCaps(token)) {
916  ++allCapTokens;
917  }
918  }
919 
920  return allCapTokens > 0 && allCapTokens < tokens.size();
921  }
922 
923  // Check if the preceding tokens increase, decrease, or negate/nullify the valence
924  inline float Sentiment::scalarIncDec(
925  const std::string& token,
926  const std::string& tokenLower,
927  float valence,
928  bool isCapDiff
929  ) {
930  float scalar{};
931 
932  const auto it{Sentiment::BOOSTER_DICT.find(tokenLower)};
933 
934  if(it != Sentiment::BOOSTER_DICT.end()) {
935  scalar = it->second;
936 
937  if(valence < 0.F) {
938  scalar *= -1.F;
939  }
940 
941  if(isAllCaps(token) && isCapDiff) {
942  if(valence > 0.F) {
943  scalar += VaderC_INCR;
944  }
945  else {
946  scalar -= VaderC_INCR;
947  }
948  }
949  }
950 
951  return scalar;
952  }
953 
954  /*
955  * (the following functions are not used, because punctuation is removed by the tokenizer)
956  *
957  // add emphasis from exclamation points and question marks
958  inline float Sentiment::punctuationEmphasis(const Tokens& tokens) {
959  return Sentiment::amplifyEP(tokens) + amplifyQM(tokens);
960  }
961 
962  inline float Sentiment::amplifyEP(const Tokens& tokens) {
963  auto epCount{
964  std::accumulate(
965  tokens.begin(),
966  tokens.end(),
967  std::uint64_t{},
968  [](auto count, const auto& token) {
969  return count + std::count(token.begin(), token.end(), '!');
970  }
971  )
972  };
973 
974  if(epCount > VaderFour) {
975  epCount = VaderFour;
976  }
977 
978  return VaderEPFactor * epCount;
979  }
980 
981  inline float Sentiment::amplifyQM(const Tokens& tokens) {
982  auto qmCount{
983  std::accumulate(
984  tokens.begin(),
985  tokens.end(),
986  std::uint64_t{},
987  [](auto count, const auto& token) {
988  return count + std::count(token.begin(), token.end(), '?');
989  }
990  )
991  };
992 
993  if(qmCount > VaderOne) {
994  if(qmCount <= VaderThree) {
995  return VaderQMFactor * qmCount;
996  }
997 
998  return VaderQMFactorMax;
999  }
1000 
1001  return 0.F;
1002  }
1003  */
1004 
1005  // check for modification in sentiment due to contrastive conjunction 'but'
1006  inline void Sentiment::butCheck(const Tokens& tokensLower, std::vector<float>& sentiments) {
1007  const auto it{std::find(tokensLower.cbegin(), tokensLower.cend(), "but")};
1008 
1009  if(it != tokensLower.cend()) {
1010  const auto butIndex{static_cast<std::size_t>(it - tokensLower.begin())};
1011 
1012  for(std::size_t index{}; index < sentiments.size(); ++index) {
1013  if(index < butIndex) {
1014  sentiments[index] *= VaderButFactorBefore;
1015  }
1016  else if(index > butIndex) {
1017  sentiments[index] *= VaderButFactorAfter;
1018  }
1019  }
1020  }
1021  }
1022 
1023  // check for negation (either by "never so/this" or by "without doubt")
1024  inline void Sentiment::negationCheck(float& valence, const Tokens& tokensLower, std::uint8_t startIndex, std::size_t index) {
1025  switch(startIndex) {
1026  case VaderZero:
1027  if(Sentiment::isNegated(tokensLower[index - (startIndex + VaderOne)])) {
1028  // 1 token preceding lexicon token (without stopwords)
1029  valence *= VaderN_SCALAR;
1030  }
1031  break;
1032 
1033  case VaderOne:
1034  if(
1035  tokensLower[index - VaderTwo] == "never"
1036  && (
1037  tokensLower[index - VaderOne] == "so"
1038  || tokensLower[index - VaderOne] == "this"
1039  )
1040  ) {
1041  valence *= VaderNeverFactor;
1042  }
1043  else if(
1044  tokensLower[index - VaderTwo] == "without"
1045  && tokensLower[index - VaderOne] == "doubt"
1046  ) {
1047  // (ignore)
1048  }
1049  else if(
1050  Sentiment::isNegated(tokensLower[index - (startIndex + VaderOne)])
1051  ) {
1052  // 2 tokens preceding the lexicon token position
1053  valence *= VaderN_SCALAR;
1054  }
1055 
1056  break;
1057 
1058  case VaderTwo:
1059  if(
1060  tokensLower[index - VaderThree] == "never"
1061  && (
1062  tokensLower[index - VaderTwo] == "so"
1063  || tokensLower[index - VaderTwo] == "this"
1064  || (tokensLower[index - VaderOne] == "so"
1065  || tokensLower[index - VaderOne] == "this")
1066  )
1067  ) {
1068  valence *= VaderNeverFactor;
1069  }
1070  else if(
1071  tokensLower[index - VaderThree] == "without"
1072  && (
1073  tokensLower[index - VaderTwo] == "doubt"
1074  || tokensLower[index - VaderOne] == "doubt"
1075  )
1076  ) {
1077  // (ignore)
1078  }
1079  else if(
1080  Sentiment::isNegated(tokensLower[index - (startIndex + VaderOne)])
1081  ) {
1082  // 3 tokens preceding the lexicon token position
1083  valence *= VaderN_SCALAR;
1084  }
1085 
1086  break;
1087 
1088  default:
1089  break;
1090  }
1091  }
1092 
1093  // check for special idioms
1094  inline void Sentiment::specialIdiomsCheck(float& valence, const Tokens& tokensLower, std::size_t index) {
1095  const auto oneZero{
1096  tokensLower[index - VaderOne]
1097  + " "
1098  + tokensLower[index]
1099  };
1100 
1101  const auto twoOneZero{
1102  tokensLower[index - VaderTwo]
1103  + " "
1104  + tokensLower[index - VaderOne]
1105  + " "
1106  + tokensLower[index]
1107  };
1108 
1109  const auto twoOne{
1110  tokensLower[index - VaderTwo]
1111  + " "
1112  + tokensLower[index - VaderOne]
1113  };
1114 
1115  const auto threeTwoOne{
1116  tokensLower[index - VaderThree]
1117  + " "
1118  + tokensLower[index - VaderTwo]
1119  + " "
1120  + tokensLower[index - VaderOne]
1121  };
1122 
1123  const auto threeTwo{
1124  tokensLower[index - VaderThree]
1125  + " "
1126  + tokensLower[index - VaderTwo]
1127  };
1128 
1129  const std::array sequences{oneZero, twoOneZero, twoOne, threeTwoOne, threeTwo};
1130 
1131  for(const auto& sequence : sequences) {
1132  const auto it{Sentiment::SPECIAL_CASES.find(sequence)};
1133 
1134  if(it != Sentiment::SPECIAL_CASES.end()) {
1135  valence = it->second;
1136 
1137  break;
1138  }
1139  }
1140 
1141  if(tokensLower.size() - VaderOne > index) {
1142  const auto zeroOne{
1143  tokensLower[index]
1144  + " "
1145  + tokensLower[index + VaderOne]
1146  };
1147 
1148  const auto it{Sentiment::SPECIAL_CASES.find(zeroOne)};
1149 
1150  if(it != Sentiment::SPECIAL_CASES.end()) {
1151  valence = it->second;
1152  }
1153  }
1154 
1155  if(tokensLower.size() - VaderOne > index + VaderOne) {
1156  const auto zeroOneTwo{
1157  tokensLower[index]
1158  + " "
1159  + tokensLower[index + VaderOne]
1160  + " "
1161  + tokensLower[index + VaderTwo]
1162  };
1163 
1164  const auto it{Sentiment::SPECIAL_CASES.find(zeroOneTwo)};
1165 
1166  if(it != Sentiment::SPECIAL_CASES.end()) {
1167  valence = it->second;
1168  }
1169  }
1170 
1171  // check for booster/dampener bi-grams such as 'sort of' or 'kind of'
1172  const std::array nGrams{threeTwoOne, threeTwo, twoOne};
1173 
1174  for(const auto& nGram : nGrams) {
1175  const auto it{Sentiment::BOOSTER_DICT.find(nGram)};
1176 
1177  if(it != Sentiment::BOOSTER_DICT.end()) {
1178  valence += it->second;
1179  }
1180  }
1181  }
1182 
1183  // calculate final sentiment scores
1184  inline void Sentiment::siftSentimentScores(
1185  const std::vector<float>& sentiments,
1186  float& positiveSumTo,
1187  float& negativeSumTo,
1188  std::size_t& neutralCountTo
1189  ) {
1190  for(const auto sentiment : sentiments) {
1191  if(sentiment > std::numeric_limits<float>::epsilon()) {
1192  /* compensate for neutral tokens that are counted as 1 */
1193  positiveSumTo += sentiment + VaderFOne;
1194  }
1195  else if(sentiment < -std::numeric_limits<float>::epsilon()) {
1196  /* when used with fabs(), compensate for neutrals */
1197  negativeSumTo += sentiment - VaderFOne;
1198  }
1199  else {
1200  ++neutralCountTo;
1201  }
1202  }
1203  }
1204 
1205 } /* namespace crawlservpp::Data */
1206 
1207 #endif /* DATA_SENTIMENT_HPP_ */
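Example usage (editorial sketch, not part of the original file): the snippet below shows how the class might be used. The include path and the two file names are placeholders; both files must follow the tab-separated format read by the constructor, and the input must already be tokenized English text.

#include "Sentiment.hpp" // placeholder include path

#include <iostream>
#include <string>
#include <vector>

int main() {
	// placeholder paths to the (tab-separated) VADER lexicon and emoji dictionary
	crawlservpp::Data::Sentiment analyzer("vader-lexicon.tsv", "emoji-lexicon.tsv");

	// already tokenized input, punctuation removed by the tokenizer
	const std::vector<std::string> tokens{
		"VADER", "is", "very", "smart", "and", "funny"
	};

	const auto scores{analyzer.analyze(tokens)};

	std::cout << "pos=" << scores.positive
		<< " neu=" << scores.neutral
		<< " neg=" << scores.negative
		<< " compound=" << scores.compound
		<< '\n';

	return 0;
}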