crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
TokenRemover.hpp
Go to the documentation of this file.
1 /*
2  *
3  * ---
4  *
5  * Copyright (C) 2021 Anselm Schmidt (ans[ät]ohai.su)
6  *
7  * This program is free software: you can redistribute it and/or modify
8  * it under the terms of the GNU General Public License as published by
9  * the Free Software Foundation, either version 3 of the License, or
10  * (at your option) any later version in addition to the terms of any
11  * licences already herein identified.
12  *
13  * This program is distributed in the hope that it will be useful,
14  * but WITHOUT ANY WARRANTY; without even the implied warranty of
15  * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16  * GNU General Public License for more details.
17  *
18  * You should have received a copy of the GNU General Public License
19  * along with this program. If not, see <https://www.gnu.org/licenses/>.
20  *
21  * ---
22  *
23  * TokenRemover.hpp
24  *
25  * Removes tokens found in a (pre-loaded) dictionary.
26  *
27  * Created on: Sep 1, 2020
28  * Author: ans
29  */
30 
31 #ifndef DATA_TOKENREMOVER_HPP_
32 #define DATA_TOKENREMOVER_HPP_
33 
34 #include "Dictionary.hpp"
35 
36 #include "../Helper/FileSystem.hpp"
37 #include "../Helper/Memory.hpp"
38 
39 #include <algorithm> // std::min
40 #include <cstddef> // std::size_t
41 #include <cstdint> // std::uint64_t
42 #include <fstream> // std::ifstream
43 #include <string> // std::getline, std::stoul, std::string
44 #include <unordered_map> // std::unordered_map
45 #include <unordered_set> // std::unordered_set
46 #include <utility> // std::move
47 
48 namespace crawlservpp::Data {
49 
50  /*
51  * DECLARATION
52  */
53 
/// Token remover and trimmer.
/**
 * Looks tokens up in named dictionaries and either removes a matching
 *  token entirely or trims dictionary entries from its beginning and
 *  end. A dictionary is built the first time it is requested and kept
 *  in memory afterwards.
 */
class TokenRemover {
    // shorthands for the container types used by this class
    using DictionarySet = std::unordered_set<std::string>;
    using DictionaryMap = std::unordered_map<std::string, DictionarySet>;
    using MaxLengthMap = std::unordered_map<std::string, std::size_t>;

    // iterator types used by the member function definitions
    using DictionaryIterator = DictionaryMap::const_iterator;
    using MaxLengthIterator = MaxLengthMap::const_iterator;

public:
    /// Removes the token if it is found in the given dictionary.
    void remove(std::string& token, const std::string& dictionary);

    /// Trims entries of the given dictionary from the token.
    void trim(std::string& token, const std::string& dictionary);

    /// Discards the dictionaries held in memory.
    void clear();

private:
    // entries of each dictionary, by the name of the dictionary
    DictionaryMap dictionaries;

    // length of the longest entry of each dictionary, by its name
    MaxLengthMap maxLengths;

    // reads a dictionary from its file and stores it in memory
    DictionaryIterator build(const std::string& dictionary);
};
84 
85  /*
86  * IMPLEMENTATION
87  */
88 
89  /*
90  * TOKEN REMOVAL
91  */
92 
94 
102  inline void TokenRemover::remove(std::string& token, const std::string& dictionary) {
103  // get dictionary or build it if necessary
104  DictionaryIterator dict{
105  this->dictionaries.find(dictionary)
106  };
107 
108  if(dict == this->dictionaries.end()) {
109  dict = build(dictionary);
110  }
111 
112  // get length of token and look it up in dictionary
113  auto tokenLength{token.find(' ')};
114 
115  if(tokenLength > token.length()) {
116  tokenLength = token.length();
117  }
118 
119  const auto entry{
120  dict->second.find(token.substr(0, tokenLength))
121  };
122 
123  if(entry == dict->second.end()) {
124  /* token not in dictionary */
125  return;
126  }
127 
128  // remove token
129  Helper::Memory::free(token);
130  }
131 
133 
142  inline void TokenRemover::trim(std::string& token, const std::string& dictionary) {
143  // get dictionary or build it if necessary
144  DictionaryIterator dict{
145  this->dictionaries.find(dictionary)
146  };
147 
148  if(dict == this->dictionaries.end()) {
149  dict = build(dictionary);
150  }
151 
152  // get maximum length in dictionary
153  MaxLengthIterator maxLengthIt{
154  this->maxLengths.find(dictionary)
155  };
156  auto maxLength{std::numeric_limits<std::size_t>::max()};
157 
158  if(maxLengthIt != this->maxLengths.end()) {
159  maxLength = maxLengthIt->second;
160  }
161 
162  // get length of token
163  auto tokenLength{token.find(' ')};
164 
165  if(tokenLength > token.length()) {
166  tokenLength = token.length();
167  }
168 
169  // get maximum length to check
170  auto max{std::min(tokenLength, maxLength)};
171 
172  // trim from beginning
173  for(std::size_t len{1}; len <= max; ++len) {
174  const auto entry{
175  dict->second.find(token.substr(0, len))
176  };
177 
178  if(entry == dict->second.end()) {
179  /* token not in dictionary */
180  continue;
181  }
182 
183  // trim token and update length
184  token = token.substr(len);
185 
186  tokenLength -= len;
187 
188  max = std::min(tokenLength, maxLength);
189 
190  len = 0;
191  }
192 
193  // trim from end
194  for(std::size_t len{1}; len < max; ++len) {
195  const auto entry{
196  dict->second.find(token.substr(token.length() - len))
197  };
198 
199  if(entry == dict->second.end()) {
200  /* token not in dictionary */
201  continue;
202  }
203 
204  // trim token and update length
205  token = token.substr(0, token.length() - len);
206 
207  tokenLength -= len;
208 
209  max = std::min(tokenLength, maxLength);
210 
211  len = 0;
212  }
213  }
214 
215  /*
216  * CLEANUP
217  */
218 
220  inline void TokenRemover::clear() {
221  std::unordered_map<std::string, std::unordered_set<std::string>>().swap(this->dictionaries);
222  }
223 
224  /*
225  * INTERNAL HELPER FUNCTIONS (private)
226  */
227 
228  // build the dictionary
229  inline TokenRemover::DictionaryIterator TokenRemover::build(const std::string& dictionary) {
230  std::unordered_set<std::string> newDictionary;
231  std::size_t maxLength{};
232 
233  // read dictionary file line by line
234  std::string dictFileName{dictDir};
235 
236  dictFileName.push_back(Helper::FileSystem::getPathSeparator());
237 
238  dictFileName += dictionary;
239 
240  std::ifstream in(dictFileName.c_str());
241  std::string line;
242 
243  while(std::getline(in, line)) {
244  if(line.empty()) {
245  continue;
246  }
247 
248  const auto end{
249  line.find('\t')
250  };
251 
252  if(end == 0) {
253  continue;
254  }
255 
256  newDictionary.emplace(line.substr(0, end));
257 
258  if(end > maxLength) {
259  maxLength = end;
260  }
261  }
262 
263  // add maximum word length in the dictionary
264  this->maxLengths.emplace(dictionary, maxLength);
265 
266  // move dictionary to the set and return (constant) iterator
267  return this->dictionaries.emplace(dictionary, std::move(newDictionary)).first;
268  }
269 
270 } /* namespace crawlservpp::Data */
271 
272 #endif /* DATA_TOKENREMOVER_HPP_ */
void trim(std::string &token, const std::string &dictionary)
Removes dictionary entries from the beginning and the end of a string.
Definition: TokenRemover.hpp:142
Token remover and trimmer.
Definition: TokenRemover.hpp:55
char getPathSeparator()
Gets the preferred separator for file paths in the current operating system.
Definition: FileSystem.hpp:187
constexpr auto dictDir
Directory for dictionaries.
Definition: Dictionary.hpp:48
void clear()
Clears the token remover, freeing the memory used by all dictionaries.
Definition: TokenRemover.hpp:220
Namespace for different types of data.
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
void remove(std::string &token, const std::string &dictionary)
Removes a token if found in the dictionary.
Definition: TokenRemover.hpp:102