31 #ifndef DATA_TOKENREMOVER_HPP_ 32 #define DATA_TOKENREMOVER_HPP_ 36 #include "../Helper/FileSystem.hpp" 37 #include "../Helper/Memory.hpp" 44 #include <unordered_map> 45 #include <unordered_set> 58 using DictionaryIterator = std::unordered_map<std::string, std::unordered_set<std::string>>::const_iterator;
59 using MaxLengthIterator = std::unordered_map<std::string, std::size_t>::const_iterator;
65 void remove(std::string& token,
const std::string& dictionary);
66 void trim(std::string& token,
const std::string& dictionary);
78 std::unordered_map<std::string, std::unordered_set<std::string>> dictionaries;
79 std::unordered_map<std::string, std::size_t> maxLengths;
82 DictionaryIterator build(
const std::string& dictionary);
104 DictionaryIterator dict{
105 this->dictionaries.find(dictionary)
108 if(dict == this->dictionaries.end()) {
109 dict = build(dictionary);
113 auto tokenLength{token.find(
' ')};
115 if(tokenLength > token.length()) {
116 tokenLength = token.length();
120 dict->second.find(token.substr(0, tokenLength))
123 if(entry == dict->second.end()) {
144 DictionaryIterator dict{
145 this->dictionaries.find(dictionary)
148 if(dict == this->dictionaries.end()) {
149 dict = build(dictionary);
153 MaxLengthIterator maxLengthIt{
154 this->maxLengths.find(dictionary)
156 auto maxLength{std::numeric_limits<std::size_t>::max()};
158 if(maxLengthIt != this->maxLengths.end()) {
159 maxLength = maxLengthIt->second;
163 auto tokenLength{token.find(
' ')};
165 if(tokenLength > token.length()) {
166 tokenLength = token.length();
170 auto max{std::min(tokenLength, maxLength)};
173 for(std::size_t len{1}; len <= max; ++len) {
175 dict->second.find(token.substr(0, len))
178 if(entry == dict->second.end()) {
184 token = token.substr(len);
188 max = std::min(tokenLength, maxLength);
194 for(std::size_t len{1}; len < max; ++len) {
196 dict->second.find(token.substr(token.length() - len))
199 if(entry == dict->second.end()) {
205 token = token.substr(0, token.length() - len);
209 max = std::min(tokenLength, maxLength);
221 std::unordered_map<std::string, std::unordered_set<std::string>>().swap(this->dictionaries);
229 inline TokenRemover::DictionaryIterator TokenRemover::build(
const std::string& dictionary) {
230 std::unordered_set<std::string> newDictionary;
231 std::size_t maxLength{};
234 std::string dictFileName{
dictDir};
238 dictFileName += dictionary;
240 std::ifstream in(dictFileName.c_str());
243 while(std::getline(in, line)) {
256 newDictionary.emplace(line.substr(0, end));
258 if(end > maxLength) {
264 this->maxLengths.emplace(dictionary, maxLength);
267 return this->dictionaries.emplace(dictionary, std::move(newDictionary)).first;
void trim(std::string &token, const std::string &dictionary)
Removes dictionary entries from the beginning and the end of a string.
Definition: TokenRemover.hpp:142
Token remover and trimmer.
Definition: TokenRemover.hpp:55
char getPathSeparator()
Gets the preferred separator for file paths in the current operating system.
Definition: FileSystem.hpp:187
constexpr auto dictDir
Directory for dictionaries.
Definition: Dictionary.hpp:48
void clear()
Clears the token remover, freeing the memory used by all dictionaries.
Definition: TokenRemover.hpp:220
Namespace for different types of data.
static void free(T &target)
Frees memory by swapping.
Definition: Memory.hpp:42
void remove(std::string &token, const std::string &dictionary)
Removes a token if found in the dictionary.
Definition: TokenRemover.hpp:102