31 #ifndef DATA_LEMMATIZER_HPP_ 32 #define DATA_LEMMATIZER_HPP_ 36 #include "../Helper/FileSystem.hpp" 37 #include "../Helper/Memory.hpp" 38 #include "../Helper/Strings.hpp" 44 #include <unordered_map> 87 struct DictionaryProperty {
90 std::uint64_t count{};
94 using Dictionary = std::unordered_map<std::string, std::vector<DictionaryProperty>>;
95 using DictionaryIterator = std::unordered_map<std::string, Dictionary>::const_iterator;
101 void lemmatize(std::string& token,
const std::string& dictionary);
113 std::unordered_map<std::string, Dictionary> dictionaries;
116 DictionaryIterator build(
const std::string& dictionary);
117 static std::size_t countEqualChars(
118 const std::string&
string,
120 const std::string& needle
// Body of Lemmatizer::lemmatize(token, dictionary): replaces `token` in place with a lemma
// taken from the named dictionary. NOTE(review): several original lines (closing braces,
// early exits, the declarations of `entry` and `max`) are not visible in this listing.
//
// Look up the requested dictionary among the ones already held in memory...
142 DictionaryIterator dict{
143 this->dictionaries.find(dictionary)
// ...and build it on first use if it is not there yet.
146 if(dict == this->dictionaries.end()) {
147 dict = build(dictionary);
// The dictionary key is the part of the token before the first space. If find() reports
// no space (a position beyond the token), the whole token is used as the key.
151 std::size_t tokenLength{token.find(
' ')};
153 if(tokenLength > token.length()) {
154 tokenLength = token.length();
158 dict->second.find(token.substr(0, tokenLength))
// No entry for this key, or an entry without properties: nothing to choose from
// (presumably the token is left as is; the branch body is not visible here).
161 if(entry == dict->second.end() || entry->second.empty()) {
// Exactly one candidate: take its lemma directly.
166 if(entry->second.size() == 1) {
168 token = entry->second[0].lemma;
// Several candidates: for each one, count how many leading characters of its tag match
// the part of the token starting one character after the key (tokenLength + 1).
174 std::vector<std::size_t> equalChars;
177 for(
const auto& property : entry->second) {
178 const std::size_t count{
179 countEqualChars(token, tokenLength + 1, property.tag)
182 equalChars.push_back(count);
// Track the highest occurrence count among the candidates; the first half of the
// condition (original lines 193-194) is not visible, presumably it filters by equalChars.
192 for(std::size_t i = 0; i < entry->second.size(); ++i) {
195 && entry->second[i].count > max
197 max = entry->second[i].count;
// Use the lemma of a candidate whose occurrence count equals that maximum.
202 for(
const auto& property : entry->second) {
203 if(property.count == max) {
204 token =
property.lemma;
// Reads the dictionary file for `dictionary`, stores the parsed result in
// this->dictionaries under that name, and returns the iterator produced by that emplace.
// NOTE(review): several original lines (declarations of `line`, `columns` and `added`,
// closing braces, some conditions) are not visible in this listing.
225 inline Lemmatizer::DictionaryIterator Lemmatizer::build(
const std::string& dictionary) {
226 Dictionary newDictionary;
// The file name starts with dictDir and ends with the dictionary name (original lines
// 230-232 are not visible; presumably a path separator is appended in between).
229 std::string dictFileName{
dictDir};
233 dictFileName += dictionary;
235 std::ifstream in(dictFileName.c_str());
// Process the file line by line; `columns` is presumably the current line split into fields.
238 while(std::getline(in, line)) {
247 if(columns.empty()) {
// One property per line: lemma, optional tag, and occurrence count.
251 std::vector<DictionaryProperty> properties(1);
254 properties.back().lemma = columns[
colLemma];
// The tag is only read when the line has enough columns for it.
257 if(columns.size() >
colTag) {
258 properties.back().tag = columns[
colTag];
// std::stoul throws std::invalid_argument / std::out_of_range on malformed numbers.
// NOTE(review): any size check guarding colCount/colLemma is not visible here - confirm.
262 properties.back().count = std::stoul(columns[
colCount]);
// The first column is the dictionary key; try to insert a new entry for it.
266 newDictionary.emplace(columns[0], properties)
// Append the property to the entry referenced by `added` (presumably only when the key
// already existed; the surrounding condition is not visible here).
270 added.first->second.emplace_back(std::move(properties.back()));
// Move the finished dictionary into the member map and hand back its iterator.
275 return this->dictionaries.emplace(dictionary, std::move(newDictionary)).first;
// Compares `needle` character by character against `string`, starting at offset `pos`
// in `string` (the `pos` parameter line is not visible in this listing).
// Returns needle.length() when every character of the needle matched.
279 inline std::size_t Lemmatizer::countEqualChars(
280 const std::string&
string,
282 const std::string& needle
284 for(std::size_t n{}; n < needle.length(); ++n) {
// Stop at the end of `string` or at the first differing character
// (presumably returning n, the number of characters matched so far; that line is not visible).
286 pos + n >=
string.
length()
287 ||
string[pos + n] != needle[n]
// Loop finished without a mismatch: the whole needle matched.
293 return needle.length();
constexpr auto colCount
Column containing the number of occurrences in a dictionary file.
Definition: Lemmatizer.hpp:76
void clear()
Clears the lemmatizer, freeing the memory used by all dictionaries.
Definition: Lemmatizer.hpp:216
char getPathSeparator()
Gets the preferred separator for file paths in the current operating system.
Definition: FileSystem.hpp:187
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
constexpr auto colTag
Column containing the tag in a dictionary file.
Definition: Lemmatizer.hpp:69
constexpr auto colLemma
Column containing the lemma in a dictionary file.
Definition: Lemmatizer.hpp:62
constexpr auto dictDir
Directory for dictionaries.
Definition: Dictionary.hpp:48
Lemmatizer.
Definition: Lemmatizer.hpp:85
Namespace for different types of data.
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
void lemmatize(std::string &token, const std::string &dictionary)
Lemmatizes a token.
Definition: Lemmatizer.hpp:140
std::vector< std::string > split(const std::string &str, char delimiter)
Splits a string into a vector of strings using the given delimiter.
Definition: Strings.hpp:739