crawlserv/crawlservpp/Lemmatizer_8hpp_source.html

 /*
  *
  * ---
  *
  *  Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
  *
  *  This program is free software: you can redistribute it and/or modify
  *  it under the terms of the GNU General Public License as published by
  *  the Free Software Foundation, either version 3 of the License, or
  *  (at your option) any later version in addition to the terms of any
  *  licences already herein identified.
  *
  *  This program is distributed in the hope that it will be useful,
  *  but WITHOUT ANY WARRANTY; without even the implied warranty of
  *  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
  *  GNU General Public License for more details.
  *
  *  You should have received a copy of the GNU General Public License
  *  along with this program. If not, see <https://www.gnu.org/licenses/>.
  *
  * ---
  *
  * Lemmatizer.hpp
  *
  * Multilingual lemmatizer.
  *
  *  Created on: Sep 1, 2020
  *      Author: ans
  */

 #ifndef DATA_LEMMATIZER_HPP_
 #define DATA_LEMMATIZER_HPP_

 #include "Dictionary.hpp"

 #include "../Helper/FileSystem.hpp"
 #include "../Helper/Memory.hpp"
 #include "../Helper/Strings.hpp"

 #include <cstddef>          // std::size_t
 #include <cstdint>          // std::uint64_t
 #include <fstream>          // std::ifstream
 #include <string>           // std::getline, std::stoul, std::string
 #include <unordered_map>    // std::unordered_map
 #include <utility>          // std::move
 #include <vector>           // std::vector

 namespace crawlservpp::Data {

     /*
      * CONSTANTS
      */


     // Zero-based index of the column containing the lemma in a dictionary file.
     inline constexpr auto colLemma = 1;

     // Zero-based index of the column containing the tag in a dictionary file.
     inline constexpr auto colTag = 2;

     // Zero-based index of the column containing the number of occurrences in a dictionary file.
     inline constexpr auto colCount = 3;


     /*
      * DECLARATION
      */

     // Multilingual lemmatizer.
     //
     // Replaces tokens by their lemmas, using dictionaries that are read from
     //  tab-separated files on first use and then kept in memory until clear()
     //  is called.
     class Lemmatizer {
         // property of a dictionary entry (each entry, i.e. token, may have multiple such properties)
         struct DictionaryProperty {
             // tag of the entry, read from column colTag of the dictionary file
             std::string tag;

             // lemma of the entry, read from column colLemma of the dictionary file
             std::string lemma;

             // number of occurrences, read from column colCount of the dictionary file (zero if missing)
             std::uint64_t count{};
         };

         // for convenience
         //  Dictionary maps a token (first column of the dictionary file) to all of its properties,
         //  DictionaryIterator points to a dictionary stored under its name in the lemmatizer
         using Dictionary = std::unordered_map<std::string, std::vector<DictionaryProperty>>;
         using DictionaryIterator = std::unordered_map<std::string, Dictionary>::const_iterator;

     public:
         // Lemmatizes a token.
         //
         // The part of the token before its first space is looked up in the dictionary,
         //  the text after that space is compared against the tags of the matching entries.
         //  If the token is found, it is replaced by a lemma; otherwise it remains unchanged.
         //
         // token:      the token to lemmatize (in/out)
         // dictionary: name of the dictionary file to use, which is built on first use
         void lemmatize(std::string& token, const std::string& dictionary);

         // Clears the lemmatizer, freeing the memory used by all dictionaries.
         void clear();


     private:
         // dictionaries (already built), by their names
         std::unordered_map<std::string, Dictionary> dictionaries;

         // internal helper functions

         // reads the dictionary with the given name from its file and stores it in the lemmatizer
         DictionaryIterator build(const std::string& dictionary);

         // counts how many characters of the string, starting at pos, equal the beginning of the needle
         static std::size_t countEqualChars(
                 const std::string& string,
                 std::size_t pos,
                 const std::string& needle
         );
     };

     /*
      * IMPLEMENTATION
      */

     /*
      * LEMMATIZATION
      */


     inline void Lemmatizer::lemmatize(std::string& token, const std::string& dictionary) {
         // get dictionary or build it if necessary
         DictionaryIterator dict{
             this->dictionaries.find(dictionary)
         };

         if(dict == this->dictionaries.end()) {
             dict = build(dictionary);
         }

         // get length of token and look it up in dictionary
         std::size_t tokenLength{token.find(' ')};

         if(tokenLength > token.length()) {
             tokenLength = token.length();
         }

         const auto entry{
             dict->second.find(token.substr(0, tokenLength))
         };

         if(entry == dict->second.end() || entry->second.empty()) {
             /* token not in dictionary */
             return;
         }

         if(entry->second.size() == 1) {
             /* exactly one entry in dictionary */
             token = entry->second[0].lemma;

             return;
         }

         // compare tags
         std::vector<std::size_t> equalChars;
         std::size_t max{};

         for(const auto& property : entry->second) {
             const std::size_t count{
                 countEqualChars(token, tokenLength + 1, property.tag)
             };

             equalChars.push_back(count);

             if(count > max) {
                 max = count;
             }
         }

         // compare occurences
         max = 0;

         for(std::size_t i = 0; i < entry->second.size(); ++i) {
             if(
                     equalChars[i] == max
                     && entry->second[i].count > max
             ) {
                 max = entry->second[i].count;
             }
         }

         // return lemma with most equal characters in tag and most occurences
         for(const auto& property : entry->second) {
             if(property.count == max) {
                 token = property.lemma;

                 return;
             }
         }
     }

     /*
      * CLEANUP
      */

     // Clears the lemmatizer, freeing the memory used by all dictionaries.
     //
     // Dictionaries that are needed afterwards will be built again by lemmatize().
     inline void Lemmatizer::clear() {
         Helper::Memory::free(this->dictionaries);
     }

     /*
      * INTERNAL HELPER FUNCTIONS (private)
      */

     // build the dictionary for a specific language
     //
     // Reads the file with the given name from the dictionary directory (dictDir) line by line.
     //  Each non-empty line is split at tabulators: the first column contains the token, the
     //  columns colLemma, colTag, and colCount contain its lemma, tag, and number of occurrences.
     //  Missing columns leave the respective property empty (or zero). Multiple lines for the
     //  same token are collected as multiple properties of the token, in the order of the file.
     //
     // NOTE: If the file cannot be opened, no line is read and an empty dictionary is stored.
     //
     // dictionary: name of the dictionary file inside the dictionary directory
     //
     // Returns a (constant) iterator to the dictionary stored in the lemmatizer.
     //
     // Throws std::invalid_argument or std::out_of_range (from std::stoull) if the number of
     //  occurrences in a line cannot be converted to a number.
     inline Lemmatizer::DictionaryIterator Lemmatizer::build(const std::string& dictionary) {
         Dictionary newDictionary;

         // assemble the name of the dictionary file
         std::string dictFileName{dictDir};

         dictFileName.push_back(Helper::FileSystem::getPathSeparator());

         dictFileName += dictionary;

         // read dictionary file line by line
         std::ifstream in(dictFileName.c_str());
         std::string line;

         while(std::getline(in, line)) {
             if(line.empty()) {
                 continue;
             }

             const auto columns{
                 Helper::Strings::split(line, '\t')
             };

             if(columns.empty()) {
                 continue;
             }

             DictionaryProperty property;

             if(columns.size() > colLemma) {
                 property.lemma = columns[colLemma];
             }

             if(columns.size() > colTag) {
                 property.tag = columns[colTag];
             }

             if(columns.size() > colCount) {
                 // std::stoull, because unsigned long is not guaranteed to hold 64 bits
                 property.count = std::stoull(columns[colCount]);
             }

             // add the property to the token, creating the entry for the token if necessary
             newDictionary[columns[0]].emplace_back(std::move(property));
         }

         // move dictionary to the set and return (constant) iterator
         return this->dictionaries.emplace(dictionary, std::move(newDictionary)).first;
     }

     // count how many characters of the string, beginning at the given position, are equal
     //  to the characters at the beginning of the needle (stops at the first difference, at
     //  the end of the string, or at the end of the needle, whichever comes first)
     inline std::size_t Lemmatizer::countEqualChars(
             const std::string& string,
             std::size_t pos,
             const std::string& needle
     ) {
         std::size_t matched{};

         while(
                 matched < needle.length()
                 && pos + matched < string.length()
                 && string[pos + matched] == needle[matched]
         ) {
             ++matched;
         }

         return matched;
     }

 } /* namespace crawlservpp::Data */

 #endif /* DATA_LEMMATIZER_HPP_ */
crawlservpp::Data::colCount
constexpr auto colCount
Column containing the number of occurrences in a dictionary file.
Definition: Lemmatizer.hpp:76

crawlservpp::Data::Lemmatizer::clear
void clear()
Clears the lemmatizer, freeing the memory used by all dictionaries.
Definition: Lemmatizer.hpp:216

crawlservpp::Helper::FileSystem::getPathSeparator
char getPathSeparator()
Gets the preferred separator for file paths in the current operating system.
Definition: FileSystem.hpp:187

Dictionary.hpp

crawlservpp::Helper::Utf8::length
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327

crawlservpp::Data::colTag
constexpr auto colTag
Column containing the tag in a dictionary file.
Definition: Lemmatizer.hpp:69

crawlservpp::Data::colLemma
constexpr auto colLemma
Column containing the lemma in a dictionary file.
Definition: Lemmatizer.hpp:62

crawlservpp::Data::dictDir
constexpr auto dictDir
Directory for dictionaries.
Definition: Dictionary.hpp:48

crawlservpp::Data::Lemmatizer
Lemmatizer.
Definition: Lemmatizer.hpp:85

crawlservpp::Data
Namespace for different types of data.

crawlservpp::Helper::Memory::free
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42

crawlservpp::Data::Lemmatizer::lemmatize
void lemmatize(std::string &token, const std::string &dictionary)
Lemmatizes a token.
Definition: Lemmatizer.hpp:140

crawlservpp::Helper::Strings::split
std::vector< std::string > split(const std::string &str, char delimiter)
Splits a string into a vector of strings using the given delimiter.
Definition: Strings.hpp:739