crawlserv++  [under development]
Application for crawling and analyzing textual content of websites.
Tagger.hpp
Go to the documentation of this file.
1 /*
2  * Tagger.hpp
3  *
4  * Multilingual POS tagger using Wapiti by Thomas Lavergne.
5  *
6  * Use the original wapiti program to train models for the tagger.
7  *
8  * Source: https://github.com/Jekub/Wapiti
9  *
10  * Paper: Lavergne, Thomas / Cappe, Olivier / Yvon, François:
11  * Practical Very Large Scale CRFs, in: Proceedings of the
12  * 48th Annual Meeting of the Association for Computational
13  * Linguistics, Uppsala, 11–16 July 2010, pp. 504–513.
14  *
15  * Created on: Aug 4, 2020
16  * Author: ans
17  */
18 
19 #ifndef DATA_TAGGER_H_
20 #define DATA_TAGGER_H_
21 
#include "../Main/Exception.hpp"

#include "../_extern/wapiti/wapiti.h"

#include <cstddef>	// std::size_t
#include <cstdint>	// std::uint32_t
#include <cstdio>	// std::fclose, std::FILE, std::fopen
#include <cstdlib>	// std::free
#include <iterator>	// std::distance
#include <limits>	// std::numeric_limits
#include <memory>	// std::unique_ptr
#include <stdexcept>	// std::runtime_error
#include <string>	// std::string
#include <string_view>	// std::string_view
#include <vector>	// std::vector
35 
36 namespace crawlservpp::Data {
37  /*
38  * DECLARATION
39  */
40 
42 
59  class Tagger {
60  public:
63 
65  Tagger() = default;
66 
67  virtual ~Tagger();
68 
72 
73  [[nodiscard]] static constexpr std::string_view getVersion();
74 
78 
79  void setPureMaxEntMode(bool isPureMaxEntMode);
80  void setPosteriorDecoding(bool isPosteriorDecoding);
81  void setPartlyLabeledInput(bool isPartlyLabeledInput);
82 
86 
87  void loadModel(const std::string& modelFile);
88  void label(
89  std::vector<std::string>::iterator sentenceBegin,
90  std::vector<std::string>::iterator sentenceEnd
91  );
92 
94 
96 
104 
108 
111  Tagger(Tagger&) = delete;
112 
114  Tagger& operator=(Tagger&) = delete;
115 
117  Tagger(Tagger&&) = default;
118 
120  Tagger& operator=(Tagger&&) = default;
121 
123 
124  private:
125  wapiti::opt_t options{};
126  wapiti::mdl_t * model{nullptr};
127 
128  bool maxEnt{false};
129  bool posterior{false};
130  bool partlyLabeled{false};
131 
132  // general deleter for wapiti C types
133  struct wapitiDelete {
134  void operator()(void * ptr) {
135  //NOLINTNEXTLINE(cppcoreguidelines-no-malloc, cppcoreguidelines-owning-memory, hicpp-no-malloc)
136  std::free(ptr);
137  }
138  };
139 
140  // deleter for wapiti sequences
141  struct wapitiDeleteSeq {
142  void operator()(wapiti::seq_t * seq) {
143  //NOLINTNEXTLINE(cppcoreguidelines-no-malloc, cppcoreguidelines-owning-memory, hicpp-no-malloc)
144  std::free(seq->raw);
145 
146  //NOLINTNEXTLINE(cppcoreguidelines-no-malloc, cppcoreguidelines-owning-memory, hicpp-no-malloc)
147  std::free(seq);
148  }
149  };
150  };
151 
152  /*
153  * IMPLEMENTATION
154  */
155 
157  inline Tagger::~Tagger() {
158  if(this->model != nullptr) {
159  wapiti::mdl_free(this->model);
160 
161  this->model = nullptr;
162  }
163  }
164 
166 
171  inline constexpr std::string_view Tagger::getVersion() {
172  return wapiti::version;
173  }
174 
176 
184  inline void Tagger::setPureMaxEntMode(bool isPureMaxEntMode) {
185  this->maxEnt = isPureMaxEntMode;
186  }
187 
189 
195  inline void Tagger::setPosteriorDecoding(bool isPosteriorDecoding) {
196  this->posterior = isPosteriorDecoding;
197  }
198 
200 
212  inline void Tagger::setPartlyLabeledInput(bool isPartlyLabeledInput) {
213  this->partlyLabeled = isPartlyLabeledInput;
214  }
215 
217 
228  inline void Tagger::loadModel(const std::string& modelFile) {
229  // check argument
230  if(modelFile.empty()) {
231  throw Exception(
232  "Tagger::loadModel():"
233  " No POS-tagging model has been specified"
234  );
235  }
236 
237  // create model
238  this->model = wapiti::mdl_new(wapiti::rdr_new());
239 
240  // set wapiti options
241  //NOLINTNEXTLINE(cppcoreguidelines-pro-bounds-pointer-arithmetic)
242  std::vector<char> modelFileCopy(modelFile.c_str(), modelFile.c_str() + modelFile.size() + 1);
243 
244  this->options.maxent = this->maxEnt; // maxent mode
245  this->options.model = modelFileCopy.data(); // model file
246  this->options.lblpost = this->posterior; // posterior (forward-backward) or Viterbi decoding
247  this->options.force = this->partlyLabeled; // set whether input is partly (and correctly) labeled
248 
249  this->model->opt = &(this->options);
250 
251  //NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
252  auto * file{std::fopen(this->model->opt->model, "re")};
253 
254  if(file == nullptr) {
255  throw Exception(
256  "Tagger::loadModel():"
257  " Cannot open POS-tagging model: "
258  + modelFile
259  );
260  }
261 
262  try {
263  wapiti::mdl_load(this->model, file);
264  }
265  catch(const std::runtime_error& e) {
266  throw Exception(
267  "Tagger::loadModel():"
268  " Error while loading the POS-tagging model – "
269  + std::string(e.what())
270  );
271  }
272 
273  //NOLINTNEXTLINE(cppcoreguidelines-owning-memory)
274  std::fclose(file);
275  }
276 
278 
293  inline void Tagger::label(
294  std::vector<std::string>::iterator sentenceBegin,
295  std::vector<std::string>::iterator sentenceEnd
296  ) {
297  // check length of sentence and ignore final tokens in REALLY long sentences
298  const auto sentenceLength{std::distance(sentenceBegin, sentenceEnd)};
299 
300  if(sentenceLength == 0) {
301  return;
302  }
303 
304  if(sentenceLength > std::numeric_limits<std::uint32_t>::max()) {
305  sentenceEnd = sentenceBegin + std::numeric_limits<std::uint32_t>::max();
306  };
307 
308  // copy tokens into continous memory
309  std::vector<std::vector<char>> tokenCopies;
310  std::vector<char *> tokenPtrs;
311 
312  tokenCopies.reserve(sentenceLength);
313  tokenPtrs.reserve(sentenceLength);
314 
315  for(auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
316  tokenCopies.emplace_back(tokenIt->begin(), tokenIt->end());
317  tokenCopies.back().push_back('\0');
318  }
319 
320  for(auto& tokenCopy : tokenCopies) {
321  tokenPtrs.push_back(tokenCopy.data());
322  }
323 
324  // convert tokens into raw data
325  std::unique_ptr<wapiti::raw_t, wapitiDelete> rawData{
326  static_cast<wapiti::raw_t *>(
327  wapiti::xmalloc(
328  sizeof(wapiti::raw_t) + sizeof(char *) * tokenPtrs.size()
329  )
330  )
331  };
332 
333  rawData->len = tokenPtrs.size();
334 
335  std::uint32_t index{};
336 
337  for(auto * tokenPtr : tokenPtrs) {
338  rawData->lines[index] = tokenPtr;
339 
340  ++index;
341  }
342 
343  // create sequence
344  auto * lbls = this->model->reader->lbl;
345 
346  try {
347  std::unique_ptr<wapiti::seq_t, wapitiDeleteSeq> seq{
348  wapiti::rdr_raw2seq(
349  this->model->reader,
350  rawData.get(),
351  this->model->opt->force
352  )
353  };
354 
355  // label tokens
356  const auto T{seq->len};
357 
358  std::unique_ptr<uint32_t, wapitiDelete> out{
359  static_cast<uint32_t*>(wapiti::xmalloc(sizeof(uint32_t) * T))
360  };
361  std::unique_ptr<double, wapitiDelete> psc{
362  static_cast<double*>(wapiti::xmalloc(sizeof(double ) * T))
363  };
364  std::unique_ptr<double, wapitiDelete> scs{
365  static_cast<double*>(wapiti::xmalloc(sizeof(double )))
366  };
367 
368  wapiti::tag_viterbi(this->model, seq.get(), out.get(), scs.get(), psc.get());
369 
370  // append labels to strings
371  index = 0;
372 
373  for(auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
374  const std::string label{wapiti::qrk_id2str(lbls, out.get()[index])};
375 
376  tokenIt->reserve(tokenIt->size() + label.size() + 1);
377  tokenIt->push_back(' ');
378 
379  (*tokenIt) += label;
380 
381  ++index;
382  }
383  }
384  catch(const std::runtime_error& e) {
385  throw Exception(
386  "Tagger::label():"
387  " Error while loading POS-tagging a sentence – "
388  + std::string(e.what())
389  );
390  }
391  }
392 
393 } /* namespace crawlservpp::Data */
394 
395 #endif /* DATA_TAGGER_H_ */
void setPartlyLabeledInput(bool isPartlyLabeledInput)
Sets whether the input is already partly labelled.
Definition: Tagger.hpp:212
Tagger()=default
Default constructor.
POS (part of speech)-tagging exception.
Definition: Tagger.hpp:103
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
static constexpr std::string_view getVersion()
Gets the underlying version of wapiti.
Definition: Tagger.hpp:171
void setPosteriorDecoding(bool isPosteriorDecoding)
Sets whether posterior decoding is used instead of the classical Viterbi decoding ...
Definition: Tagger.hpp:195
void label(std::vector< std::string >::iterator sentenceBegin, std::vector< std::string >::iterator sentenceEnd)
POS (part of speech)-tags a sentence.
Definition: Tagger.hpp:293
virtual ~Tagger()
Destructor freeing the POS-tagging model, if one has been loaded.
Definition: Tagger.hpp:157
void setPureMaxEntMode(bool isPureMaxEntMode)
Sets whether the pure maxent mode of Wapiti is enabled.
Definition: Tagger.hpp:184
Tagger & operator=(Tagger &)=delete
Deleted copy assignment operator.
Namespace for different types of data.
void loadModel(const std::string &modelFile)
Loads a POS-tagging model trained by using Wapiti.
Definition: Tagger.hpp:228
Multilingual POS (part of speech) tagger using Wapiti by Thomas Lavergne.
Definition: Tagger.hpp:59
static void free(rapidjson::Document &target)
Frees memory by swapping.
Definition: Json.hpp:862