19 #ifndef DATA_TAGGER_H_ 20 #define DATA_TAGGER_H_ 22 #include "../Main/Exception.hpp" 24 #include "../_extern/wapiti/wapiti.h" 33 #include <string_view> 73 [[nodiscard]]
static constexpr std::string_view
getVersion();
87 void loadModel(
const std::string& modelFile);
89 std::vector<std::string>::iterator sentenceBegin,
90 std::vector<std::string>::iterator sentenceEnd
125 wapiti::opt_t options{};
126 wapiti::mdl_t * model{
nullptr};
129 bool posterior{
false};
130 bool partlyLabeled{
false};
133 struct wapitiDelete {
134 void operator()(
void * ptr) {
141 struct wapitiDeleteSeq {
142 void operator()(wapiti::seq_t * seq) {
158 if(this->model !=
nullptr) {
159 wapiti::mdl_free(this->model);
161 this->model =
nullptr;
172 return wapiti::version;
185 this->maxEnt = isPureMaxEntMode;
196 this->posterior = isPosteriorDecoding;
213 this->partlyLabeled = isPartlyLabeledInput;
230 if(modelFile.empty()) {
232 "Tagger::loadModel():" 233 " No POS-tagging model has been specified" 238 this->model = wapiti::mdl_new(wapiti::rdr_new());
242 std::vector<char> modelFileCopy(modelFile.c_str(), modelFile.c_str() + modelFile.size() + 1);
244 this->options.maxent = this->maxEnt;
245 this->options.model = modelFileCopy.data();
246 this->options.lblpost = this->posterior;
247 this->options.force = this->partlyLabeled;
249 this->model->opt = &(this->options);
252 auto * file{std::fopen(this->model->opt->model,
"re")};
254 if(file ==
nullptr) {
256 "Tagger::loadModel():" 257 " Cannot open POS-tagging model: " 263 wapiti::mdl_load(this->model, file);
265 catch(
const std::runtime_error& e) {
267 "Tagger::loadModel():" 268 " Error while loading the POS-tagging model – " 269 + std::string(e.what())
294 std::vector<std::string>::iterator sentenceBegin,
295 std::vector<std::string>::iterator sentenceEnd
298 const auto sentenceLength{std::distance(sentenceBegin, sentenceEnd)};
300 if(sentenceLength == 0) {
304 if(sentenceLength > std::numeric_limits<std::uint32_t>::max()) {
305 sentenceEnd = sentenceBegin + std::numeric_limits<std::uint32_t>::max();
309 std::vector<std::vector<char>> tokenCopies;
310 std::vector<char *> tokenPtrs;
312 tokenCopies.reserve(sentenceLength);
313 tokenPtrs.reserve(sentenceLength);
315 for(
auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
316 tokenCopies.emplace_back(tokenIt->begin(), tokenIt->end());
317 tokenCopies.back().push_back(
'\0');
320 for(
auto& tokenCopy : tokenCopies) {
321 tokenPtrs.push_back(tokenCopy.data());
325 std::unique_ptr<wapiti::raw_t, wapitiDelete> rawData{
326 static_cast<wapiti::raw_t *
>(
328 sizeof(wapiti::raw_t) +
sizeof(
char *) * tokenPtrs.size()
333 rawData->len = tokenPtrs.size();
335 std::uint32_t index{};
337 for(
auto * tokenPtr : tokenPtrs) {
338 rawData->lines[index] = tokenPtr;
344 auto * lbls = this->model->reader->lbl;
347 std::unique_ptr<wapiti::seq_t, wapitiDeleteSeq> seq{
351 this->model->opt->force
356 const auto T{seq->len};
358 std::unique_ptr<uint32_t, wapitiDelete> out{
359 static_cast<uint32_t*
>(wapiti::xmalloc(
sizeof(uint32_t) * T))
361 std::unique_ptr<double, wapitiDelete> psc{
362 static_cast<double*
>(wapiti::xmalloc(
sizeof(
double ) * T))
364 std::unique_ptr<double, wapitiDelete> scs{
365 static_cast<double*
>(wapiti::xmalloc(
sizeof(
double )))
368 wapiti::tag_viterbi(this->model, seq.get(), out.get(), scs.get(), psc.get());
373 for(
auto tokenIt{sentenceBegin}; tokenIt != sentenceEnd; ++tokenIt) {
374 const std::string
label{wapiti::qrk_id2str(lbls, out.get()[index])};
376 tokenIt->reserve(tokenIt->size() +
label.size() + 1);
377 tokenIt->push_back(
' ');
384 catch(
const std::runtime_error& e) {
387 " Error while loading POS-tagging a sentence – " 388 + std::string(e.what())
void setPartlyLabeledInput(bool isPartlyLabeledInput)
Sets whether the input is already partly labelled.
Definition: Tagger.hpp:212
Tagger()=default
Default constructor.
POS (part of speech)-tagging exception.
Definition: Tagger.hpp:103
#define MAIN_EXCEPTION_CLASS()
Macro used to easily define classes for general exceptions.
Definition: Exception.hpp:50
static constexpr std::string_view getVersion()
Gets the underlying version of wapiti.
Definition: Tagger.hpp:171
void setPosteriorDecoding(bool isPosteriorDecoding)
Sets whether posterior decoding is used instead of the classical Viterbi decoding ...
Definition: Tagger.hpp:195
void label(std::vector< std::string >::iterator sentenceBegin, std::vector< std::string >::iterator sentenceEnd)
POS (part of speech)-tags a sentence.
Definition: Tagger.hpp:293
virtual ~Tagger()
Destructor freeing the POS-tagging model, if one has been loaded.
Definition: Tagger.hpp:157
void setPureMaxEntMode(bool isPureMaxEntMode)
Sets whether the pure maxent mode of Wapiti is enabled.
Definition: Tagger.hpp:184
Tagger & operator=(Tagger &)=delete
Deleted copy assignment operator.
Namespace for different types of data.
void loadModel(const std::string &modelFile)
Loads a POS-tagging model trained by using Wapiti.
Definition: Tagger.hpp:228
Multilingual POS (part of speech) tagger using Wapiti by Thomas Lavergne.
Definition: Tagger.hpp:59
static void free(rapidjson::Document &target)
Frees memory by swapping.
Definition: Json.hpp:862