41 #ifndef DATA_SENTIMENT_HPP_ 42 #define DATA_SENTIMENT_HPP_ 56 #include <string_view> 57 #include <unordered_map> 58 #include <unordered_set> 214 using Tokens = std::vector<std::string>;
221 inline static const std::unordered_set<std::string_view> NEGATE{
286 inline static const std::unordered_map<std::string_view, float> BOOSTER_DICT{
375 inline static const std::unordered_map<std::string_view, float> SPECIAL_CASES{
381 {
"yeah right", -2.F },
382 {
"kiss of death", -1.5F },
383 {
"to die for", 3.F },
384 {
"beating heart", 3.1F },
385 {
"broken heart", -2.9F }
392 Sentiment(
const std::string& dictionaryFile,
const std::string& emojiFile);
398 [[nodiscard]] std::size_t getDictSize()
const;
399 [[nodiscard]] std::size_t getEmojiNum()
const;
411 std::unordered_map<std::string, float> dictMap;
412 std::unordered_map<std::string, std::string> emojiMap;
415 void sentimentValence(
417 const Tokens& tokens,
418 const Tokens& tokensLower,
420 std::vector<float>& sentiments,
423 static SentimentScores scoreValence(
const std::vector<float>& sentiments,
const Tokens& );
424 void leastCheck(
float& valence,
const Tokens& tokensLower, std::size_t index)
const;
427 [[nodiscard]]
static std::vector<std::string> toLower(
const Tokens& tokens);
428 [[nodiscard]]
static bool isNegated(
const std::string& tokenLower);
429 [[nodiscard]]
static bool isNegated(
const Tokens& tokensLower);
430 [[nodiscard]]
static float normalize(
float score);
431 [[nodiscard]]
static bool isAllCaps(
const std::string& token);
432 [[nodiscard]]
static bool isAllCapDifferential(
const Tokens& tokens);
433 [[nodiscard]]
static float scalarIncDec(
434 const std::string& token,
435 const std::string& tokenLower,
447 static void butCheck(
const Tokens& tokensLower, std::vector<float>& sentiments);
448 static void negationCheck(
float& valence,
const Tokens& tokensLower, std::uint8_t startIndex, std::size_t index);
449 static void specialIdiomsCheck(
float& valence,
const Tokens& tokensLower, std::size_t index);
450 static void siftSentimentScores(
451 const std::vector<float>& sentiments,
452 float& positiveSumTo,
453 float& negativeSumTo,
454 std::size_t& neutralCountTo
478 std::ifstream dictIn(dictionaryFile.c_str());
481 if(!dictIn.is_open()) {
482 throw std::runtime_error(
"Could not open dictionary file: '" + dictionaryFile +
"'");
485 while(std::getline(dictIn, line)) {
486 const auto firstTab{line.find(
'\t')};
488 if(firstTab != std::string::npos) {
489 const auto term{line.substr(0, firstTab)};
490 const auto secondTab{line.find(
'\t', firstTab + 1)};
491 const auto value{std::stof(line.substr(firstTab + 1, secondTab - firstTab))};
493 this->dictMap.emplace_hint(this->dictMap.end(), term, value);
499 std::ifstream emojiIn(emojiFile.c_str());
501 if(!emojiIn.is_open()) {
502 throw std::runtime_error(
"Could not open emoji file: '" + emojiFile +
"'");
505 while(std::getline(emojiIn, line)) {
506 const auto tab{line.find(
'\t')};
508 if(tab != std::string::npos) {
509 const auto emoji{line.substr(0, tab)};
510 const auto value{line.substr(tab + 1)};
512 this->emojiMap.emplace_hint(this->emojiMap.end(), emoji, value);
528 return this->dictMap.size();
536 return this->emojiMap.size();
554 const bool isCapDifference{
555 Sentiment::isAllCapDifferential(tokens)
559 std::vector<std::string> newTokens;
562 newTokens.reserve(tokens.size());
564 for(
const auto& token : tokens) {
565 std::string tokenCopy;
569 for(; beg < token.length(); ++beg) {
570 const auto c{token[beg]};
572 if(std::ispunct(c) == 0 && std::iscntrl(c) == 0 && c !=
' ') {
577 std::size_t len{token.length() - beg};
580 const auto c{token[beg + len -
VaderOne]};
582 if(std::ispunct(c) == 0 && std::iscntrl(c) == 0 && c !=
' ') {
589 tokenCopy = token.substr(beg, len);
591 const auto it = this->emojiMap.find(tokenCopy);
593 if(it != this->emojiMap.end()) {
594 std::string emojiToken;
596 emojiToken.reserve(it->second.length());
598 for(
auto c : it->second) {
600 if(!emojiToken.empty()) {
601 newTokens.emplace_back(emojiToken);
607 emojiToken.push_back(c);
611 if(!emojiToken.empty()) {
612 newTokens.emplace_back(emojiToken);
616 newTokens.emplace_back(tokenCopy);
621 std::vector<std::string> tokensLower{Sentiment::toLower(newTokens)};
624 std::vector<float> sentiments;
626 sentiments.reserve(newTokens.size());
628 for(std::size_t index{}; index < newTokens.size(); ++index) {
631 if(std::find_if(BOOSTER_DICT.begin(), BOOSTER_DICT.end(), [&tokensLower, &index](
const auto& booster) {
632 return booster.first == tokensLower[index];
633 }) != BOOSTER_DICT.end()) {
634 sentiments.push_back(valence);
639 if(index < newTokens.size() - 1 && tokensLower[index] ==
"kind" && tokensLower[index + 1] ==
"of") {
640 sentiments.push_back(valence);
645 this->sentimentValence(valence, newTokens, tokensLower, index, sentiments, isCapDifference);
648 Sentiment::butCheck(tokensLower, sentiments);
650 return Sentiment::scoreValence(sentiments, newTokens);
/// Determines the valence of the token at the given index and appends it to the list of sentiments.
///
/// The lower-cased token is looked up in the dictionary. On a hit, its stored value becomes the
/// valence, which is then passed through the capitalisation, preceding-token, negation, idiom
/// and least checks visible below. The resulting valence is pushed to the back of sentiments.
///
/// \param tokens The tokens in their original case.
/// \param tokensLower The same tokens, lower-cased.
/// \param sentiments The vector receiving the valence computed for this token.
// NOTE(review): this is a partial listing. Parts of the parameter list (valence, index, isCapDifference)
// and many statement lines, including all closing braces, are missing here.
658 inline void Sentiment::sentimentValence(
660 const Tokens& tokens,
661 const Tokens& tokensLower,
663 std::vector<float>& sentiments,
// dictionary lookup for the current token
667 const auto it{this->dictMap.find(tokensLower[index])};
669 if(it != this->dictMap.end()) {
670 valence = it->second;
// special handling around the token no, used as a negation of a following dictionary word
// NOTE(review): tokensLower[index + VaderOne] must be in range; any guard is on a line not shown here
674 tokensLower[index] ==
"no" 676 && this->dictMap.find(tokensLower[index +
VaderOne]) != this->dictMap.end()
683 (index > 0 && tokensLower[index -
VaderOne] ==
"no")
684 || (index > 1 && tokensLower[index -
VaderTwo] ==
"no")
// NOTE(review): index - VaderOne below wraps around when index is zero unless a guard on a missing line prevents it
689 tokensLower[index -
VaderOne] ==
"or" 690 || tokensLower[index -
VaderOne] ==
"nor" 694 valence = it->second;
// emphasis: the token is written in capitals while the input as a whole is not
698 if(Sentiment::isAllCaps(tokens[index]) && isCapDifference) {
// inspect up to VaderThree tokens preceding the current one
707 for(std::uint8_t startIndex{}; startIndex <
VaderThree; ++startIndex) {
// only look back if that many preceding tokens exist
711 if(index > startIndex) {
712 const auto& precToken{tokens[index - (startIndex +
VaderOne)]};
713 const auto& precTokenLower{tokensLower[index - (startIndex +
VaderOne)]};
// preceding tokens that carry a valence of their own are not used as modifiers
715 if(this->dictMap.find(precTokenLower) == this->dictMap.end()) {
717 Sentiment::scalarIncDec(
// a modifier of (almost) zero is detected here; its handling is on lines not shown
725 if(std::fabs(s) <= std::numeric_limits<float>::epsilon()) {
736 Sentiment::negationCheck(valence, tokensLower, startIndex, index);
739 Sentiment::specialIdiomsCheck(valence, tokensLower, index);
745 this->leastCheck(valence, tokensLower, index);
// record the final valence for this token
748 sentiments.push_back(valence);
753 const std::vector<float>& sentiments,
756 if(sentiments.empty()) {
760 auto sum{std::accumulate(sentiments.begin(), sentiments.end(), 0.F)};
777 std::size_t neuCount{};
779 result.
compound = Sentiment::normalize(sum);
781 Sentiment::siftSentimentScores(sentiments, result.positive, result.negative, neuCount);
794 const auto total{result.positive + std::fabs(result.negative) + neuCount};
796 result.positive = std::fabs(result.positive / total);
797 result.negative = std::fabs(result.negative / total);
798 result.neutral = std::fabs(static_cast<float>(neuCount) / total);
/// Checks whether the current token is preceded by the token least.
///
/// Two conditions are visible. The first requires the preceding token to be least and absent
/// from the dictionary, and the token two positions back to be neither at nor very. The second
/// requires only that the preceding token is least and absent from the dictionary.
///
/// \param valence The valence of the current token, passed by reference.
/// \param tokensLower The lower-cased tokens.
/// \param index Index of the current token.
// NOTE(review): partial listing. The index guards that open each condition and the statements
// that modify valence are on lines missing here.
// NOTE(review): the last line of this fragment is fused with the signature of toLower, which
// follows. toLower reserves space for one output string per input token and fills each one
// character by character via std::tolower; the std::transform calls themselves are not shown.
804 inline void Sentiment::leastCheck(
float& valence,
const Tokens& tokensLower, std::size_t index)
const {
807 && this->dictMap.find(tokensLower[index -
VaderOne]) == this->dictMap.end()
808 && tokensLower[index -
VaderOne] ==
"least" 811 tokensLower[index -
VaderTwo] !=
"at" 812 && tokensLower[index -
VaderTwo] !=
"very" 819 && this->dictMap.find(tokensLower[index -
VaderOne]) == this->dictMap.end()
820 && tokensLower[index -
VaderOne] ==
"least" 831 inline std::vector<std::string> Sentiment::toLower(
const Tokens& tokens) {
832 std::vector<std::string> tokensLower;
// one output token per input token
834 tokensLower.reserve(tokens.size());
839 std::back_inserter(tokensLower),
840 [](
const auto& token) {
841 std::string tokenLower;
843 tokenLower.reserve(token.size());
848 std::back_inserter(tokenLower),
// NOTE(review): if c is a plain char, negative values (bytes of multi-byte UTF-8 sequences on
// platforms with signed char) are undefined behaviour for std::tolower; confirm the type of c
850 return std::tolower(c);
/// Checks whether a single token is a negation.
///
/// The token is first looked up in the static NEGATE set and then tested for containing
/// the contraction substring given in the second condition.
///
/// \param tokenLower The token to test; going by its name it is expected to be lower-cased already (confirm with callers).
// NOTE(review): partial listing. The bodies of both conditions and the final return statement are missing here.
862 inline bool Sentiment::isNegated(
const std::string& tokenLower) {
// exact match against the list of negation words
863 if(Sentiment::NEGATE.find(tokenLower) != Sentiment::NEGATE.end()) {
// find() matches the substring anywhere in the token, not only at its end
867 if(tokenLower.find(
"n't") != std::string::npos) {
/// Checks a list of tokens for negations.
///
/// Every token is passed to the single-token overload of isNegated.
///
/// \param tokensLower The tokens to scan.
// NOTE(review): partial listing. The return statements are on lines missing here; presumably
// the function returns true on the first negated token and false otherwise (confirm).
875 inline bool Sentiment::isNegated(
const Tokens& tokensLower) {
876 for(
const auto& tokenLower : tokensLower) {
877 if(Sentiment::isNegated(tokenLower)) {
/// Normalizes a raw score.
///
/// Computes score / sqrt(score * score + alpha) with alpha fixed at 15, then compares the
/// result against -1 and 1.
///
/// \param score The score to normalize.
// NOTE(review): partial listing. The bodies of both comparisons and the final return are missing
// here; presumably they clamp the result to the range from -1 to 1 (confirm).
886 inline float Sentiment::normalize(
float score) {
887 constexpr
auto alpha{15};
889 const float normScore{score / std::sqrt((score * score) + alpha)};
// guards against results outside the target range
891 if(normScore < -1.F) {
895 if(normScore > 1.F) {
903 inline bool Sentiment::isAllCaps(
const std::string& token) {
904 return std::all_of(token.begin(), token.end(), [](
const char c) {
905 return std::isupper(c);
/// Checks whether some, but not all, of the given tokens are written in capitals.
///
/// \param tokens The tokens to inspect, in their original case.
/// \returns True if the counter of all-caps tokens is greater than zero and smaller than the number of tokens.
// NOTE(review): partial listing. The statement incrementing allCapTokens is on a line missing
// here; presumably it counts every token accepted by isAllCaps (confirm).
911 inline bool Sentiment::isAllCapDifferential(
const Tokens& tokens) {
912 std::size_t allCapTokens{};
914 for(
const auto& token : tokens) {
915 if(Sentiment::isAllCaps(token)) {
// neither no token nor every token may be capitalised
920 return allCapTokens > 0 && allCapTokens < tokens.size();
/// Looks up a token in the booster dictionary to derive a scalar modifier.
///
/// The lower-cased token is searched in BOOSTER_DICT. On a hit, a further check tests whether
/// the original token is written in capitals while isCapDiff is set.
///
/// \param token The token in its original case.
/// \param tokenLower The same token, lower-cased.
// NOTE(review): partial listing. The remaining parameters (isCapDiff is used below but not
// declared in the visible lines), the computation of the scalar and the return are missing here.
924 inline float Sentiment::scalarIncDec(
925 const std::string& token,
926 const std::string& tokenLower,
932 const auto it{Sentiment::BOOSTER_DICT.find(tokenLower)};
934 if(it != Sentiment::BOOSTER_DICT.end()) {
// capitalised boosters are treated separately when capitalisation differs across the input
941 if(isAllCaps(token) && isCapDiff) {
/// Treats sentiments differently depending on whether they come before or after the token but.
///
/// Searches for the first occurrence of the token but. If one is found, all sentiments are
/// visited by index, and entries before and after the position of that token are handled in
/// separate branches. The entry at the position itself matches neither branch.
///
/// \param tokensLower The lower-cased tokens.
/// \param sentiments The sentiments to adjust; iterated by index up to sentiments.size().
// NOTE(review): partial listing. The branch bodies are missing here; presumably they scale the
// entries by VaderButFactorBefore and VaderButFactorAfter respectively (confirm).
1006 inline void Sentiment::butCheck(
const Tokens& tokensLower, std::vector<float>& sentiments) {
// std::find returns the first match only
1007 const auto it{std::find(tokensLower.cbegin(), tokensLower.cend(),
"but")};
1009 if(it != tokensLower.cend()) {
// convert the iterator into an index comparable with the loop counter
1010 const auto butIndex{
static_cast<std::size_t
>(it - tokensLower.begin())};
1012 for(std::size_t index{}; index < sentiments.size(); ++index) {
1013 if(index < butIndex) {
1016 else if(index > butIndex) {
/// Checks the tokens preceding the current one for negation patterns.
///
/// Dispatches on startIndex. In every visible case the token at index - (startIndex + VaderOne)
/// is passed to isNegated. The later cases additionally test fixed word patterns in the one or
/// two tokens directly before the current one: never, followed by so or this; without, followed
/// by doubt; and doubt on its own.
///
/// \param valence The valence of the current token, passed by reference.
/// \param tokensLower The lower-cased tokens.
/// \param startIndex Zero-based distance selector for the preceding token under test.
/// \param index Index of the current token.
// NOTE(review): partial listing. The case labels, the statements modifying valence and all
// closing braces are missing here.
// NOTE(review): no bounds checks are visible for index - VaderOne and index - VaderTwo; this
// presumably relies on the caller only invoking the check when index > startIndex (confirm).
1024 inline void Sentiment::negationCheck(
float& valence,
const Tokens& tokensLower, std::uint8_t startIndex, std::size_t index) {
1025 switch(startIndex) {
// directly preceding token
1027 if(Sentiment::isNegated(tokensLower[index - (startIndex +
VaderOne)])) {
// patterns spanning the two preceding tokens
1035 tokensLower[index -
VaderTwo] ==
"never" 1037 tokensLower[index -
VaderOne] ==
"so" 1038 || tokensLower[index -
VaderOne] ==
"this" 1044 tokensLower[index -
VaderTwo] ==
"without" 1045 && tokensLower[index -
VaderOne] ==
"doubt" 1050 Sentiment::isNegated(tokensLower[index - (startIndex +
VaderOne)])
// patterns for the remaining case; its leading condition is on a line not shown
1062 tokensLower[index -
VaderTwo] ==
"so" 1063 || tokensLower[index -
VaderTwo] ==
"this" 1064 || (tokensLower[index -
VaderOne] ==
"so" 1065 || tokensLower[index -
VaderOne] ==
"this")
1073 tokensLower[index -
VaderTwo] ==
"doubt" 1074 || tokensLower[index -
VaderOne] ==
"doubt" 1080 Sentiment::isNegated(tokensLower[index - (startIndex +
VaderOne)])
/// Checks the tokens around the current one for special idioms and multi-word boosters.
///
/// Several word sequences (oneZero, twoOneZero, twoOne, threeTwoOne, threeTwo) are looked up in
/// SPECIAL_CASES; a hit replaces the valence with the stored value. If at least one token
/// follows the current one, the sequences zeroOne and zeroOneTwo are looked up as well.
/// Finally, threeTwoOne, threeTwo and twoOne are looked up in BOOSTER_DICT; a hit is added to
/// the valence.
///
/// \param valence The valence of the current token, passed by reference.
/// \param tokensLower The lower-cased tokens.
/// \param index Index of the current token.
// NOTE(review): partial listing. Most of the lines that build the sequences are missing here;
// going by their names they join the tokens at the given offsets before or after index (confirm).
// Whether the loops stop at the first hit is not visible either.
1094 inline void Sentiment::specialIdiomsCheck(
float& valence,
const Tokens& tokensLower, std::size_t index) {
1098 + tokensLower[index]
1101 const auto twoOneZero{
1106 + tokensLower[index]
1115 const auto threeTwoOne{
1123 const auto threeTwo{
// sequences that end at or before the current token
1129 const std::array sequences{oneZero, twoOneZero, twoOne, threeTwoOne, threeTwo};
1131 for(
const auto& sequence : sequences) {
1132 const auto it{Sentiment::SPECIAL_CASES.find(sequence)};
1134 if(it != Sentiment::SPECIAL_CASES.end()) {
// an idiom overrides the valence instead of modifying it
1135 valence = it->second;
// sequences that start at the current token need at least one following token
1141 if(tokensLower.size() -
VaderOne > index) {
1148 const auto it{Sentiment::SPECIAL_CASES.find(zeroOne)};
1150 if(it != Sentiment::SPECIAL_CASES.end()) {
1151 valence = it->second;
1156 const auto zeroOneTwo{
1164 const auto it{Sentiment::SPECIAL_CASES.find(zeroOneTwo)};
1166 if(it != Sentiment::SPECIAL_CASES.end()) {
1167 valence = it->second;
// multi-word boosters made of preceding tokens are added on top of the valence
1172 const std::array nGrams{threeTwoOne, threeTwo, twoOne};
1174 for(
const auto& nGram : nGrams) {
1175 const auto it{Sentiment::BOOSTER_DICT.find(nGram)};
1177 if(it != Sentiment::BOOSTER_DICT.end()) {
1178 valence += it->second;
/// Sorts sentiment values into positive, negative and remaining ones.
///
/// Each value is compared against the machine epsilon of float: values above epsilon take the
/// first branch, values below minus epsilon take the second, and all others take neither.
///
/// \param sentiments The sentiment values to classify.
/// \param positiveSumTo Output for positive values (passed by reference).
/// \param negativeSumTo Output for negative values (passed by reference).
/// \param neutralCountTo Output for the remaining values (passed by reference).
// NOTE(review): partial listing. The branch bodies are missing here; going by the parameter
// names, positive and negative values are summed and the remaining ones are counted (confirm,
// including whether any offset is applied while summing).
1184 inline void Sentiment::siftSentimentScores(
1185 const std::vector<float>& sentiments,
1186 float& positiveSumTo,
1187 float& negativeSumTo,
1188 std::size_t& neutralCountTo
1190 for(
const auto sentiment : sentiments) {
1191 if(sentiment > std::numeric_limits<float>::epsilon()) {
1195 else if(sentiment < -std::numeric_limits<float>::epsilon()) {
constexpr auto VaderButFactorBefore
Factor by which the modifier is dampened before a "but".
Definition: Sentiment.hpp:96
constexpr auto VaderButFactorAfter
Factor by which the modifier is heightened after a "but".
Definition: Sentiment.hpp:99
constexpr auto VaderNeverFactor
Factor by which the modifier is heightened after a "never".
Definition: Sentiment.hpp:102
constexpr auto VaderOne
One.
Definition: Sentiment.hpp:75
Sentiment(const std::string &dictionaryFile, const std::string &emojiFile)
Constructor.
Definition: Sentiment.hpp:477
SentimentScores analyze(const Tokens &tokens)
Get the sentiment strength in the given sentence.
Definition: Sentiment.hpp:553
constexpr auto VaderDampTwo
Factor by which the scalar modifier of previously preceding tokens is dampened.
Definition: Sentiment.hpp:93
std::size_t getEmojiNum() const
Gets the number of entries in the emoji dictionary.
Definition: Sentiment.hpp:535
constexpr auto VaderB_INCR
Empirically derived mean sentiment intensity rating increase for booster tokens.
Definition: Sentiment.hpp:116
float negative
Negative sentiment.
Definition: Sentiment.hpp:181
float neutral
Neutral sentiment.
Definition: Sentiment.hpp:165
constexpr auto VaderDampOne
Factor by which the scalar modifier of immediately preceding tokens is dampened.
Definition: Sentiment.hpp:90
Implementation of the VADER sentiment analysis algorithm.
Definition: Sentiment.hpp:212
constexpr auto VaderFOne
Factor of One.
Definition: Sentiment.hpp:87
constexpr auto VaderFour
Four.
Definition: Sentiment.hpp:84
constexpr auto VaderThree
Three.
Definition: Sentiment.hpp:81
constexpr auto VaderC_INCR
Empirically derived mean sentiment intensity rating increase for using ALLCAPs to emphasize a token...
Definition: Sentiment.hpp:122
float positive
Positive sentiment.
Definition: Sentiment.hpp:149
constexpr auto VaderTwo
Two.
Definition: Sentiment.hpp:78
constexpr auto VaderZero
Zero.
Definition: Sentiment.hpp:72
constexpr auto VaderB_DECR
Empirically derived mean sentiment intensity rating decrease for negative booster tokens...
Definition: Sentiment.hpp:119
std::size_t getDictSize() const
Gets the number of dictionary entries.
Definition: Sentiment.hpp:527
Namespace for different types of data.
float compound
Compound score.
Definition: Sentiment.hpp:199
Structure for VADER sentiment scores.
Definition: Sentiment.hpp:134
constexpr auto VaderN_SCALAR
Negation factor.
Definition: Sentiment.hpp:125