crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Lemmatizer.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * Lemmatizer.hpp
24  *
25  * Multilingual lemmatizer.
26  *
27  * Created on: Sep 1, 2020
28  * Author: ans
29  */
30 
31 #ifndef DATA_LEMMATIZER_HPP_
32 #define DATA_LEMMATIZER_HPP_
33 
34 #include "Dictionary.hpp"
35 
36 #include "../Helper/FileSystem.hpp"
37 #include "../Helper/Memory.hpp"
38 #include "../Helper/Strings.hpp"
39 
40 #include <cstddef> // std::size_t
41 #include <cstdint> // std::uint64_t
42 #include <fstream> // std::ifstream
43 #include <string> // std::getline, std::stoul, std::string
44 #include <unordered_map> // std::unordered_map
45 #include <utility> // std::move
46 #include <vector> // std::vector
47 
48 namespace crawlservpp::Data {
49 
50  /*
51  * CONSTANTS
52  */
53 
56 
58 
/// Column containing the lemma in a dictionary file.
inline constexpr auto colLemma = 1;

/// Column containing the tag in a dictionary file.
inline constexpr auto colTag = 2;

/// Column containing the number of occurrences in a dictionary file.
inline constexpr auto colCount = 3;
77 
79 
80  /*
81  * DECLARATION
82  */
83 
/// Lemmatizer.
///
/// Keeps one in-memory dictionary per dictionary name and uses it to
/// replace tokens by their lemmas. A dictionary is built the first time
/// it is requested and stays cached until clear() is called.
class Lemmatizer {
public:
	/// Lemmatizes a token in place, using the dictionary with the given name.
	void lemmatize(std::string& token, const std::string& dictionary);

	/// Clears the lemmatizer, freeing the memory used by all dictionaries.
	void clear();

private:
	// One reading of a dictionary entry. A single entry (i.e. token)
	// may have several of these, one per dictionary line for that token.
	struct DictionaryProperty {
		std::string tag;
		std::string lemma;
		std::uint64_t count{};
	};

	// Shorthands: a dictionary maps a token to all of its properties;
	// the iterator points into the map of all loaded dictionaries.
	using Dictionary = std::unordered_map<std::string, std::vector<DictionaryProperty>>;
	using DictionaryIterator = std::unordered_map<std::string, Dictionary>::const_iterator;

	// All dictionaries loaded so far, keyed by dictionary name.
	std::unordered_map<std::string, Dictionary> dictionaries;

	// Internal helper functions.
	DictionaryIterator build(const std::string& dictionary);

	static std::size_t countEqualChars(
			const std::string& string,
			std::size_t pos,
			const std::string& needle
	);
};
123 
124  /*
125  * IMPLEMENTATION
126  */
127 
128  /*
129  * LEMMATIZATION
130  */
131 
133 
140  inline void Lemmatizer::lemmatize(std::string& token, const std::string& dictionary) {
141  // get dictionary or build it if necessary
142  DictionaryIterator dict{
143  this->dictionaries.find(dictionary)
144  };
145 
146  if(dict == this->dictionaries.end()) {
147  dict = build(dictionary);
148  }
149 
150  // get length of token and look it up in dictionary
151  std::size_t tokenLength{token.find(' ')};
152 
153  if(tokenLength > token.length()) {
154  tokenLength = token.length();
155  }
156 
157  const auto entry{
158  dict->second.find(token.substr(0, tokenLength))
159  };
160 
161  if(entry == dict->second.end() || entry->second.empty()) {
162  /* token not in dictionary */
163  return;
164  }
165 
166  if(entry->second.size() == 1) {
167  /* exactly one entry in dictionary */
168  token = entry->second[0].lemma;
169 
170  return;
171  }
172 
173  // compare tags
174  std::vector<std::size_t> equalChars;
175  std::size_t max{};
176 
177  for(const auto& property : entry->second) {
178  const std::size_t count{
179  countEqualChars(token, tokenLength + 1, property.tag)
180  };
181 
182  equalChars.push_back(count);
183 
184  if(count > max) {
185  max = count;
186  }
187  }
188 
189  // compare occurences
190  max = 0;
191 
192  for(std::size_t i = 0; i < entry->second.size(); ++i) {
193  if(
194  equalChars[i] == max
195  && entry->second[i].count > max
196  ) {
197  max = entry->second[i].count;
198  }
199  }
200 
201  // return lemma with most equal characters in tag and most occurences
202  for(const auto& property : entry->second) {
203  if(property.count == max) {
204  token = property.lemma;
205 
206  return;
207  }
208  }
209  }
210 
211  /*
212  * CLEANUP
213  */
214 
216  inline void Lemmatizer::clear() {
217  Helper::Memory::free(this->dictionaries);
218  }
219 
220  /*
221  * INTERNAL HELPER FUNCTIONS (private)
222  */
223 
224  // build the dictionary for a specific language
225  inline Lemmatizer::DictionaryIterator Lemmatizer::build(const std::string& dictionary) {
226  Dictionary newDictionary;
227 
228  // read dictionary file line by line
229  std::string dictFileName{dictDir};
230 
231  dictFileName.push_back(Helper::FileSystem::getPathSeparator());
232 
233  dictFileName += dictionary;
234 
235  std::ifstream in(dictFileName.c_str());
236  std::string line;
237 
238  while(std::getline(in, line)) {
239  if(line.empty()) {
240  continue;
241  }
242 
243  const auto columns{
244  Helper::Strings::split(line, '\t')
245  };
246 
247  if(columns.empty()) {
248  continue;
249  }
250 
251  std::vector<DictionaryProperty> properties(1);
252 
253  if(columns.size() > colLemma) {
254  properties.back().lemma = columns[colLemma];
255  }
256 
257  if(columns.size() > colTag) {
258  properties.back().tag = columns[colTag];
259  }
260 
261  if(columns.size() > colCount) {
262  properties.back().count = std::stoul(columns[colCount]);
263  }
264 
265  const auto added{
266  newDictionary.emplace(columns[0], properties)
267  };
268 
269  if(!added.second) {
270  added.first->second.emplace_back(std::move(properties.back()));
271  }
272  }
273 
274  // move dictionary to the set and return (constant) iterator
275  return this->dictionaries.emplace(dictionary, std::move(newDictionary)).first;
276  }
277 
278  // count number of equal characters (from specific position of string and from beginning of needle)
279  inline std::size_t Lemmatizer::countEqualChars(
280  const std::string& string,
281  std::size_t pos,
282  const std::string& needle
283  ) {
284  for(std::size_t n{}; n < needle.length(); ++n) {
285  if(
286  pos + n >= string.length()
287  || string[pos + n] != needle[n]
288  ) {
289  return n;
290  }
291  }
292 
293  return needle.length();
294  }
295 
296 } /* namespace crawlservpp::Data */
297 
298 #endif /* DATA_LEMMATIZER_HPP_ */
constexpr auto colCount
Column containing the number of occurrences in a dictionary file.
Definition: Lemmatizer.hpp:76
void clear()
Clears the lemmatizer, freeing the memory used by all dictionaries.
Definition: Lemmatizer.hpp:216
char getPathSeparator()
Gets the preferred separator for file paths in the current operating system.
Definition: FileSystem.hpp:187
std::size_t length(std::string_view str)
Definition: Utf8.hpp:327
constexpr auto colTag
Column containing the tag in a dictionary file.
Definition: Lemmatizer.hpp:69
constexpr auto colLemma
Column containing the lemma in a dictionary file.
Definition: Lemmatizer.hpp:62
constexpr auto dictDir
Directory for dictionaries.
Definition: Dictionary.hpp:48
Lemmatizer.
Definition: Lemmatizer.hpp:85
Namespace for different types of data.
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
void lemmatize(std::string &token, const std::string &dictionary)
Lemmatizes a token.
Definition: Lemmatizer.hpp:140
std::vector< std::string > split(const std::string &str, char delimiter)
Splits a string into a vector of strings using the given delimiter.
Definition: Strings.hpp:739