// NOTE(review): this listing is flattened -- the include directives and the
// opening of sentenceCodeLength() share the next line, and the statement that
// declares `node` is not shown.
// sentenceCodeLength(): casts the first element of sentence.sentence() to a
// const TableLatticeNode pointer and returns that node's codeLength().
// The [0] access has no visible emptiness guard, so callers are presumably
// expected to pass a sentence with at least one node -- confirm.
17 #include <string_view> 19 #include <unordered_map> 22 #include <fcitx-utils/inputbuffer.h> 23 #include <fcitx-utils/log.h> 24 #include <fcitx-utils/macros.h> 25 #include <fcitx-utils/utf8.h> 26 #include "libime/core/historybigram.h" 27 #include "libime/core/inputbuffer.h" 28 #include "libime/core/languagemodel.h" 29 #include "libime/core/lattice.h" 30 #include "libime/core/segmentgraph.h" 31 #include "libime/core/userlanguagemodel.h" 32 #include "libime/table/tablebaseddictionary.h" 33 #include "constants.h" 35 #include "tablebaseddictionary_p.h" 36 #include "tabledecoder.h" 37 #include "tableoptions.h" 43 size_t sentenceCodeLength(
const SentenceResult &sentence) {
45 static_cast<const TableLatticeNode *
>(sentence.sentence()[0]);
46 return node->codeLength();
// Reports whether `length` is below `limit`; the visible return compares
// `length` with `limit` converted to size_t.
// NOTE(review): the listing skips the lines between the signature and the
// return. A negative `limit` would wrap to a very large size_t in this cast,
// so presumably negative values are handled in the elided lines -- confirm.
51 bool lengthLessThanLimit(
size_t length,
int limit) {
55 return length < static_cast<size_t>(limit);
// Comparison functor for candidate sentences, specialised at compile time on
// the OrderPolicy template argument. It stores the two settings passed to the
// constructor (noSortInputLength, sortByCodeLength) as const members.
// NOTE(review): several original lines are missing from this listing (closing
// braces, the declarations of lShort/rShort, some branch bodies); the notes
// below describe only what is visible.
58 template <OrderPolicy policy>
59 struct TableCandidateCompare {
60 TableCandidateCompare(
int noSortInputLength,
bool sortByCodeLength)
61 : noSortInputLength_(noSortInputLength),
62 sortByCodeLength_(sortByCodeLength) {}
// Ordering key taken from the first lattice node of the sentence.
// Nodes flagged PhraseFlag::User take a branch whose body is elided; the
// visible return yields the node index negated as int64_t (presumably the
// non-User case -- confirm).
65 static int64_t index(
const SentenceResult &sentence) {
66 const auto *
const node =
67 static_cast<const TableLatticeNode *
>(sentence.sentence()[0]);
68 if (node->flag() == PhraseFlag::User) {
71 return -
static_cast<int64_t
>(node->index());
// Returns true when lhs should be ordered before rhs.
74 bool operator()(
const SentenceResult &lhs,
75 const SentenceResult &rhs)
const {
// Candidates for which isAuto() is false order ahead of those for which it
// is true (false < true).
76 const bool lIsAuto = TableContext::isAuto(lhs);
77 const bool rIsAuto = TableContext::isAuto(rhs);
78 if (lIsAuto != rIsAuto) {
79 return lIsAuto < rIsAuto;
83 const bool lIsPinyin = TableContext::isPinyin(lhs);
84 const bool rIsPinyin = TableContext::isPinyin(rhs);
85 const auto lLength = sentenceCodeLength(lhs);
86 const auto rLength = sentenceCodeLength(rhs);
// The next two expressions presumably initialise lShort / rShort (their
// declarations are elided): code length within noSortInputLength_ and not a
// pinyin candidate.
88 static_cast<int>(lLength) <= noSortInputLength_ && !lIsPinyin;
90 static_cast<int>(rLength) <= noSortInputLength_ && !rIsPinyin;
// A "short" candidate orders ahead of a non-short one.
91 if (lShort != rShort) {
92 return lShort > rShort;
// Optional tie-break: the shorter code first.
95 if (sortByCodeLength_ && lLength != rLength) {
96 return lLength < rLength;
// The condition guarding this return is elided; the larger index() wins.
100 return index(lhs) > index(rhs);
// Policy-specific ordering, resolved at compile time.
// No / Fast: larger index() first.
103 if constexpr (policy == OrderPolicy::No ||
104 policy == OrderPolicy::Fast) {
105 return index(lhs) > index(rhs);
106 }
else if constexpr (policy == OrderPolicy::Freq) {
// Freq: higher score first, index() breaks exact score ties.
107 float lScore = lhs.score();
108 float rScore = rhs.score();
109 if (lScore != rScore) {
110 return lScore > rScore;
112 return index(lhs) > index(rhs);
// Fallback comparison by score; the lines leading up to it are elided.
117 return lhs.score() > rhs.score();
121 const int noSortInputLength_;
122 const bool sortByCodeLength_;
// Record describing one selected word. The constructor stores the offset, the
// word node, its code, its phrase flag and a commit flag (defaults to true);
// `word` and `code` are taken by value and moved into the members.
// NOTE(review): the member declarations themselves are not shown in this
// listing.
125 struct SelectedCode {
126 SelectedCode(
size_t offset, WordNode word, std::string code,
127 PhraseFlag flag,
bool commit =
true)
128 : offset_(offset), word_(std::move(word)), code_(std::move(code)),
129 flag_(flag), commit_(commit) {}
// Decides whether `newSentence` should take the place of `oldSentence` when
// both produce the same candidate text (see the caller in update()).
// NOTE(review): the switch header, most case bodies and the final return are
// missing from this listing; only the visible conditions are described.
137 bool shouldReplaceCandidate(
const SentenceResult &oldSentence,
138 const SentenceResult &newSentence,
139 OrderPolicy policy) {
// Different node counts: replace only when the new sentence has fewer nodes.
140 if (newSentence.sentence().size() != oldSentence.sentence().size()) {
141 return newSentence.sentence().size() < oldSentence.sentence().size();
144 if (newSentence.sentence().size() == 1) {
// NOTE(review): the local names are swapped relative to their sources --
// `oldCode` is the code length of newSentence and `newCode` that of
// oldSentence. The net effect of the comparison below is "replace when the
// new sentence has the shorter code"; confirm that is the intent and
// consider renaming.
145 auto oldCode = sentenceCodeLength(newSentence);
146 auto newCode = sentenceCodeLength(oldSentence);
148 if (oldCode != newCode) {
149 return oldCode < newCode;
152 const auto *newNode =
153 static_cast<const TableLatticeNode *
>(newSentence.sentence()[0]);
// Per-policy rules; the body of each visible `if` is elided.
155 case OrderPolicy::No:
156 if (newNode->flag() != PhraseFlag::User) {
160 case OrderPolicy::Freq:
// With differing scores, replace when the new score is higher.
161 if (newSentence.score() != oldSentence.score()) {
162 return newSentence.score() > oldSentence.score();
165 case OrderPolicy::Fast:
166 if (newNode->flag() == PhraseFlag::User) {
// Member-initializer list and body of a constructor whose signature line is
// not shown in this listing (presumably TableContextPrivate's, given the
// make_unique<TableContextPrivate>(this, dict, model) call elsewhere in this
// file -- confirm).
// It forwards q to QPtrHolder, initialises dict_ and model_ from the
// arguments and constructs decoder_ from the addresses of dict and model.
181 : QPtrHolder(q), dict_(dict), model_(model), decoder_(&dict, &model) {
// Pre-size the candidate vector to 2048 entries.
183 candidates_.reserve(2048);
// Install a code extractor on the model; the lambda inspects word nodes that
// are TableLatticeNode instances (the rest of its body is elided).
184 model_.setCodeExtractor([](
const WordNode *word) -> std::string {
185 if (
const auto *node =
186 dynamic_cast<const TableLatticeNode *>(word)) {
// Reports whether the first candidate is eligible for auto selection: with a
// non-empty candidate list the result is true when candidates_[0] is not an
// auto candidate according to TableContext::isAuto().
// NOTE(review): the value returned for an empty list is elided.
194 bool canDoAutoSelect()
const {
195 if (candidates_.empty()) {
198 return !TableContext::isAuto(candidates_[0]);
// Checks for the "single unambiguous choice" situation: canDoAutoSelect()
// must hold, there must be exactly one candidate, and that candidate must be a
// single-node sentence (the bodies of these three guards are elided).
// The final expression requires the candidate's code to equal
// q->currentCode(), combined with a condition involving exactMatch() whose
// second operand is elided.
202 bool hasOnlyOneAutoselectChoice()
const {
203 if (!canDoAutoSelect()) {
206 if (candidates_.size() != 1) {
210 if (candidates_[0].sentence().size() != 1) {
214 return libime::TableContext::code(candidates_[0]) == q->
currentCode() &&
215 (!dict_.tableOptions().exactMatch() ||
// Computes the language-model state reached after all selected words.
// Starts from model_.nullState() and, for every selected item, scores the
// item's word against the running state and then adopts the resulting state.
// NOTE(review): the handling of an empty selection, the body of the
// empty-word check and the declaration of `temp` are elided.
219 State currentState() {
220 State state = model_.nullState();
221 if (selected_.empty()) {
225 for (
auto &s : selected_) {
226 for (
auto &item : s) {
227 if (item.word_.word().empty()) {
230 model_.score(state, item.word_, temp);
231 state = std::move(temp);
237 void resetMatchingState() {
// With at least one selection group, returns offset_ of the last item in the
// last group.
// NOTE(review): the return for the empty case is elided. back().back()
// assumes the last group is never empty -- confirm against select() /
// autoSelect().
243 size_t selectedLength()
const {
244 if (!selected_.empty()) {
245 return selected_.back().back().offset_;
251 if (!selected_.empty()) {
252 selected_.pop_back();
// Loops while selectedLength() is greater than `pos`. `cancelled` starts as
// false.
// NOTE(review): the loop body and the return statement are elided in this
// listing; presumably selections are popped and `cancelled` is returned.
256 bool cancelTill(
size_t pos) {
257 bool cancelled =
false;
258 while (selectedLength() > pos) {
// Records one selection group in the dictionary.
// NOTE(review): trailing arguments of the insert calls, the declaration of
// `word` and several returns are elided in this listing.
265 bool learnWord(
const std::vector<SelectedCode> &selection) {
// Single-item group: act on the item's phrase flag.
266 if (selection.size() == 1) {
267 const auto &select = selection[0];
// None / User: insert the (code, word) pair.
268 if (select.flag_ == PhraseFlag::None ||
269 select.flag_ == PhraseFlag::User) {
270 dict_.insert(select.code_, select.word_.word(),
272 }
else if (select.flag_ == PhraseFlag::Auto) {
// Auto: remove the existing entry first, then insert it again.
274 dict_.removeWord(select.code_, select.word_.word());
275 dict_.insert(select.code_, select.word_.word(),
// Otherwise the words of the items are concatenated; what happens for an
// item with commit_ == false is elided.
282 for (
const auto &selected : selection) {
283 if (!selected.commit_) {
286 word += selected.word_.word();
// The concatenated word is inserted as a user phrase.
288 return dict_.insert(word, PhraseFlag::User);
// Evaluates the configured auto-select rule against the current segment
// (graph_.data()).
// When autoSelectLength() is non-zero and the segment's UTF-8 length is not
// below it, a branch is taken whose body is elided. The visible final result
// is true only if an autoSelectRegex_ is set and matches the whole segment
// (std::regex_match).
291 bool checkAutoSelect()
const {
292 auto lastSegLength = fcitx::utf8::length(graph_.data());
294 if (dict_.tableOptions().autoSelectLength() &&
295 !lengthLessThanLimit(lastSegLength,
296 dict_.tableOptions().autoSelectLength())) {
301 return dict_.d_func()->autoSelectRegex_ &&
302 std::regex_match(graph_.data(),
303 *dict_.d_func()->autoSelectRegex_,
304 std::regex_constants::match_default);
// Same shape as checkAutoSelect(), but driven by noMatchAutoSelectLength()
// and noMatchAutoSelectRegex_.
// When the length option is non-zero and the limit check fails, a branch is
// taken whose body is elided; the first argument of that check is also elided.
// The visible final result is true only if the regex is set and matches the
// whole segment.
307 bool checkNoMatchAutoSelect()
const {
308 auto lastSegLength = fcitx::utf8::length(graph_.data());
310 if (dict_.tableOptions().noMatchAutoSelectLength() &&
311 !lengthLessThanLimit(
313 dict_.tableOptions().noMatchAutoSelectLength())) {
318 return dict_.d_func()->noMatchAutoSelectRegex_ &&
319 std::regex_match(graph_.data(),
320 *dict_.d_func()->noMatchAutoSelectRegex_,
321 std::regex_constants::match_default);
329 std::vector<SentenceResult> candidates_;
330 std::vector<std::vector<SelectedCode>> selected_;
331 size_t autoSelectIndex_ = 0;
335 :
InputBuffer(fcitx::InputBufferOption::FixedCursor),
336 d_ptr(std::make_unique<TableContextPrivate>(
this, dict, model)) {}
// Out-of-line destructor with an empty body.
338 TableContext::~TableContext() {}
// Accepts `c` when any of the following holds: the dictionary reports it as an
// input code; a matching key is configured (non-zero) and equals `c`; or the
// dictionary has pinyin and `c` lies in 'a'..'z'.
// NOTE(review): the line that obtains `d` is elided in this listing.
360 bool TableContext::isValidInput(uint32_t c)
const {
362 auto matchingKey = d->dict_.tableOptions().matchingKey();
363 return (d->dict_.isInputCode(c) || (matchingKey && matchingKey == c) ||
364 (d->dict_.hasPinyin() && (c <= 'z' && c >=
'a')));
// Types the bytes [s, s + length) into the context.
// The input is first validated as UTF-8; it is then walked character by
// character and each character's bytes are passed to typeOneChar().
// NOTE(review): the body of the invalid-length branch, the handling of a
// failed typeOneChar(), the updates to `changed` and the return are elided.
367 bool TableContext::typeImpl(
const char *s,
size_t length) {
368 std::string_view view(s, length);
369 auto utf8len = fcitx::utf8::lengthValidated(view);
370 if (utf8len == fcitx::utf8::INVALID_LENGTH) {
374 bool changed =
false;
375 auto range = fcitx::utf8::MakeUTF8CharRange(view);
376 for (
auto iter = range.begin(), end = range.end(); iter != end; iter++) {
// View over the bytes of the current character.
377 auto pair = iter.charRange();
378 std::string_view chr(&*pair.first,
379 std::distance(pair.first, pair.second));
380 if (!typeOneChar(chr)) {
// Erases [from, to) from the input buffer and brings the matching state back
// in sync.
// NOTE(review): the branch structure between the two InputBuffer::erase calls
// is partly elided in this listing.
388 void TableContext::erase(
size_t from,
size_t to) {
// Erasing everything: reset the matching state and drop all selections
// before clearing the buffer.
390 if (from == 0 && to >= size()) {
391 d->resetMatchingState();
392 d->selected_.clear();
393 InputBuffer::erase(from, to);
// Partial erase: after erasing, rebuild the segment graph from the user
// input that follows the selected prefix.
396 InputBuffer::erase(from, to);
398 auto lastSeg = userInput().substr(selectedLength());
399 d->graph_ = graphForCode(lastSeg, d->dict_);
// Selects candidate `idx` (asserted to be in range) and records it as a new
// selection group.
// Each node of the candidate sentence becomes one SelectedCode whose offset is
// the current selected length plus the index of the node's end position,
// together with the word (and its model index), the node's code and its flag.
// NOTE(review): the declaration of `node` is elided in this listing.
404 void TableContext::select(
size_t idx) {
406 assert(idx < d->candidates_.size());
407 auto offset = selectedLength();
408 d->selected_.emplace_back();
410 auto &selection = d->selected_.back();
411 for (
const auto &p : d->candidates_[idx].sentence()) {
413 selection.emplace_back(offset + p->to()->index(),
414 WordNode{p->word(), d->model_.index(p->word())},
415 node->code(), node->flag());
// Appends one character to the input and decides whether the pending segment
// should be auto-selected before the segment graph is rebuilt.
// NOTE(review): large parts of this function are elided in this listing; the
// notes below cover the visible expressions only.
421 bool TableContext::typeOneChar(std::string_view chr) {
// Unselected tail of the input as it was before `chr` is added.
423 auto lastSeg = userInput().substr(selectedLength());
424 auto lastSegLength = fcitx::utf8::length(lastSeg);
426 if (!InputBuffer::typeImpl(chr.data(), chr.size())) {
430 const auto &option = d->dict_.tableOptions();
// doAutoSelect starts from the autoSelect option and is combined with the
// conditions below (the connecting lines are partly elided).
436 bool doAutoSelect = option.autoSelect();
// No pinyin and the segment has already reached dict_.maxLength().
441 (!d->dict_.hasPinyin() &&
442 !lengthLessThanLimit(lastSegLength, d->dict_.maxLength()));
// The last character of the segment is an end key.
444 doAutoSelect = doAutoSelect ||
446 d->dict_.isEndKey(fcitx::utf8::getLastChar(lastSeg)));
// The no-match rule applies and the dictionary has no words matching
// lastSeg followed by chr.
451 doAutoSelect || (d->checkNoMatchAutoSelect() &&
452 !d->dict_.hasMatchingWords(lastSeg, chr));
// Graph for `chr` alone (presumably the auto-select path -- confirm) ...
457 d->graph_ = graphForCode(chr, d->dict_);
// ... or for the previous segment extended by `chr`.
459 lastSeg.append(chr.data(), chr.size());
460 d->graph_ = graphForCode(lastSeg, d->dict_);
469 d->autoSelectIndex_ = index;
// Performs automatic selection for the current segment.
// NOTE(review): the branch bodies and the structure connecting the two halves
// are elided in this listing.
472 void TableContext::autoSelect() {
// When a candidate may be auto-selected, the stored auto-select index is read
// and reset to 0; the handling of an out-of-range index is elided.
478 if (d->canDoAutoSelect()) {
479 auto selectIndex = d->autoSelectIndex_;
480 d->autoSelectIndex_ = 0;
481 if (selectIndex >= candidates().size()) {
486 if (currentCode().empty()) {
// Records the raw segment text as a selection: an unknown-word node carrying
// the segment string as both word and code, flagged PhraseFlag::Invalid, with
// the commit flag taken from commitRawInput().
491 auto offset = selectedLength();
492 d->selected_.emplace_back();
493 d->selected_.back().emplace_back(
494 offset + d->graph_.data().size(),
495 WordNode{d->graph_.data(), d->model_.unknown()}, d->graph_.data(),
496 PhraseFlag::Invalid, d->dict_.tableOptions().commitRawInput());
// Rebuilds the candidate list for the current unselected segment: decodes the
// segment graph, collects de-duplicated candidates, passes them to a
// TableCandidateCompare chosen by the configured OrderPolicy, and finally
// evaluates the auto-select condition.
// NOTE(review): many original lines are missing from this listing (the
// embedded numbering has gaps), so the notes below cover only what is visible.
502 void TableContext::update() {
// Any previously stored auto-select index is reset.
504 d->autoSelectIndex_ = 0;
510 d->resetMatchingState();
// Language-model state after everything selected so far.
515 State state = d->currentState();
// t0 / t1 feed the elapsed-millisecond debug output further down.
517 auto t0 = std::chrono::high_resolution_clock::now();
519 d->candidates_.clear();
// Decoder parameters: score bounds, beam size and frame size.
521 constexpr
float max = std::numeric_limits<float>::max();
522 constexpr
float min = -std::numeric_limits<float>::max();
523 constexpr
int beamSize = 20;
524 constexpr
int frameSize = 10;
525 auto lastSegLength = fcitx::utf8::length(d->graph_.data());
// Special case when the segment is exactly maxLength long and auto rules
// exist; the body is elided.
527 if (lastSegLength == d->dict_.maxLength() &&
528 !d->dict_.tableOptions().autoRuleSet().empty()) {
531 if (d->decoder_.decode(d->lattice_, d->graph_, nbest, state, max, min,
532 beamSize, frameSize)) {
533 t1 = std::chrono::high_resolution_clock::now();
536 << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0)
// Maps a sentence's string form to its slot in candidates_ so that the same
// text is kept only once.
539 std::unordered_map<std::string, size_t> dup;
// Presumably the body of the insertCandidate callable used below (its header
// is elided): an already-seen string is replaced only when
// shouldReplaceCandidate() approves; a new string is appended and indexed.
542 auto sentenceString = sentence.toString();
543 auto iter = dup.find(sentenceString);
544 if (iter != dup.end()) {
545 auto idx = iter->second;
546 if (shouldReplaceCandidate(
547 d->candidates_[idx], sentence,
548 d->dict_.tableOptions().orderPolicy())) {
549 d->candidates_[idx] = std::move(sentence);
552 d->candidates_.emplace_back(std::move(sentence));
553 dup[sentenceString] = d->candidates_.size() - 1;
557 auto &graph = d->graph_;
// Score adjustment applied to pinyin candidates.
560 constexpr
float pinyinPenalty = -0.5;
// Lattice nodes that span the whole segment (bos -> eos) become candidates.
561 for (
const auto &latticeNode : d->lattice_.nodes(eos)) {
562 if (latticeNode.from() == bos && latticeNode.to() == eos) {
563 auto sentence = latticeNode.toSentenceResult();
564 if (TableContext::isPinyin(sentence)) {
565 sentence.adjustScore(pinyinPenalty);
567 insertCandidate(std::move(sentence));
// Lowest score among the candidates collected so far.
// NOTE(review): this `min` is assigned, so it cannot be the constexpr `min`
// declared above; its declaration is elided.
572 for (
const auto &cand : d->candidates_) {
573 min = std::min(min, cand.score());
577 const float minDistance = TABLE_DEFAULT_MIN_DISTANCE;
// N-best sentences from the lattice: pinyin ones are penalised, the score used
// for filtering is that of the last node when the sentence is non-empty, and a
// sentence is inserted when `min - score` is below minDistance or no candidate
// exists yet.
578 for (
size_t i = 0, e = d->lattice_.sentenceSize(); i < e; i++) {
579 auto sentence = d->lattice_.sentence(i);
580 if (TableContext::isPinyin(sentence)) {
581 sentence.adjustScore(pinyinPenalty);
583 auto score = sentence.score();
584 if (!sentence.sentence().empty()) {
585 score = sentence.sentence().back()->score();
588 if (min - score < minDistance || candidates().empty()) {
589 insertCandidate(std::move(sentence));
592 t1 = std::chrono::high_resolution_clock::now();
594 <<
"Insert candidate: " 595 << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0)
// Conditional expression that presumably yields noSortLength (its declaration
// and first alternative are elided).
599 lastSegLength < d->dict_.tableOptions().noSortInputLength()
601 : d->dict_.tableOptions().noSortInputLength();
// One comparator instantiation per order policy; the call that consumes the
// range and comparator (presumably a sort) is elided in each case.
603 switch (d->dict_.tableOptions().orderPolicy()) {
604 case OrderPolicy::No:
606 d->candidates_.begin(), d->candidates_.end(),
607 TableCandidateCompare<OrderPolicy::No>(
608 noSortLength, d->dict_.tableOptions().sortByCodeLength()));
610 case OrderPolicy::Fast:
612 d->candidates_.begin(), d->candidates_.end(),
613 TableCandidateCompare<OrderPolicy::Fast>(
614 noSortLength, d->dict_.tableOptions().sortByCodeLength()));
616 case OrderPolicy::Freq:
618 d->candidates_.begin(), d->candidates_.end(),
619 TableCandidateCompare<OrderPolicy::Freq>(
620 noSortLength, d->dict_.tableOptions().sortByCodeLength()));
// If a pinyin candidate ended up first, the first candidate that is neither
// auto nor pinyin (if any) is rotated to the front; the others keep their
// relative order.
623 if (!d->candidates_.empty() && isPinyin(d->candidates_[0])) {
625 std::find_if(d->candidates_.begin(), d->candidates_.end(),
626 [](
const auto &cand) {
627 return !isAuto(cand) && !isPinyin(cand);
630 if (iter != d->candidates_.end()) {
631 std::rotate(d->candidates_.begin(), iter, std::next(iter));
635 t1 = std::chrono::high_resolution_clock::now();
638 << std::chrono::duration_cast<std::chrono::milliseconds>(t1 - t0)
640 LIBIME_TABLE_DEBUG() <<
"Number: " << d->candidates_.size();
// With autoSelect enabled: a single unambiguous choice, a segment no longer
// than maxLength and a passing checkAutoSelect() lead to a branch whose body is
// elided.
644 if (d->dict_.tableOptions().autoSelect()) {
645 if (d->hasOnlyOneAutoselectChoice() &&
646 lastSegLength <= d->dict_.maxLength() && d->checkAutoSelect()) {
// Returns the current candidate list held by the private object.
652 TableContext::CandidateRange TableContext::candidates()
const {
654 return d->candidates_;
// Forwards to the private object's selectedLength().
657 size_t TableContext::selectedLength()
const {
659 return d->selectedLength();
665 for (
const auto &s : d->selected_) {
666 for (
const auto &item : s) {
668 ss += item.word_.word();
677 return d->graph_.data();
// Reports whether the whole user input is covered by selections: with
// non-empty input and at least one selection group, the result is whether the
// offset of the last selected item equals userInput().size().
// NOTE(review): the value returned from the early-exit branch is elided.
680 bool TableContext::selected()
const {
682 if (userInput().empty() || d->selected_.empty()) {
685 return d->selected_.back().back().offset_ == userInput().size();
// Number of selection groups recorded so far.
688 size_t TableContext::selectedSize()
const {
690 return d->selected_.size();
// Returns the text of selection group `idx` (the concatenation of its items'
// words) together with a bool `commit`.
// NOTE(review): the declarations of `result` and `commit`, and how `commit`
// is derived inside the loop, are elided. `idx` is not range-checked in the
// visible code.
693 std::tuple<std::string, bool> TableContext::selectedSegment(
size_t idx)
const {
697 for (
const auto &item : d->selected_[idx]) {
701 result += item.word_.word();
703 return {std::move(result), commit};
// Concatenates the codes of all items in selection group `idx`.
// NOTE(review): the declaration of `result` and the return are elided; `idx`
// is not range-checked in the visible code.
706 std::string TableContext::selectedCode(
size_t idx)
const {
709 for (
const auto &item : d->selected_[idx]) {
710 result += item.code_;
// Length of selection group `idx`, computed as the group's end offset minus
// `prev`, where `prev` is taken from the end offset of group idx - 1.
// NOTE(review): the declaration of `prev` and the guard around the idx - 1
// access (presumably idx > 0) are elided -- confirm.
715 size_t TableContext::selectedSegmentLength(
size_t idx)
const {
719 prev = d->selected_[idx - 1].back().offset_;
721 return d->selected_[idx].back().offset_ - prev;
726 for (
size_t i = 0, e = selectedSize(); i < e; i++) {
727 auto seg = selectedSegment(i);
728 if (std::get<bool>(seg)) {
729 result += std::get<std::string>(seg);
732 result += std::get<std::string>(seg);
736 result += currentCode();
// NOTE(review): the signature line of this function is missing from the
// listing; presumably this is TableContext::learn() -- confirm.
// Visible flow: nothing is done unless learning is enabled and something is
// selected (the early-exit bodies are elided). Every selection group is first
// passed to learnWord(); then one (word, code) entry per group is collected and
// the resulting sentence is added to the model's history.
742 if (!d->dict_.tableOptions().learning()) {
746 if (d->selected_.empty()) {
750 for (
auto &s : d->selected_) {
751 if (!d->learnWord(s)) {
755 std::vector<libime::HistoryBigram::WordWithCode> newSentence;
756 for (
auto &s : d->selected_) {
// Groups containing an item with commit_ == false take a branch whose body
// is elided.
760 if (std::ranges::any_of(
761 s, [](
const auto &item) {
return !item.commit_; })) {
// The word is either the first item's word or the concatenation of all item
// words (the condition choosing between the two is elided); for the
// concatenated form a code is generated by the dictionary.
767 word = s[0].word_.word();
770 for (
auto &item : s) {
771 word += item.word_.word();
773 if (!d->dict_.generate(word, code)) {
778 newSentence.emplace_back(std::move(word), std::move(code));
781 if (!newSentence.empty()) {
782 d->model_.history().addWithCode(newSentence);
// NOTE(review): the signature line of this function is missing from the
// listing; presumably this is TableContext::learnLast() -- confirm.
// Visible flow: same steps as the multi-group learning code, restricted to the
// last selection group. Nothing is done unless learning is enabled and
// something is selected; the last group is passed to learnWord(), turned into
// a (word, code) entry and added to the model's history.
788 if (!d->dict_.tableOptions().learning() || d->selected_.empty()) {
792 if (!d->learnWord(d->selected_.back())) {
796 std::vector<libime::HistoryBigram::WordWithCode> newSentence;
797 const auto &s = d->selected_.back();
// A group containing an item with commit_ == false takes a branch whose body
// is elided.
798 if (std::ranges::any_of(s,
799 [](
const auto &item) {
return !item.commit_; })) {
// Word is the first item's word or the concatenation of all item words (the
// selecting condition is elided); a code is generated for the latter.
805 word = s[0].word_.word();
808 for (
const auto &item : s) {
809 word += item.word_.word();
811 if (!d->dict_.generate(word, code)) {
816 newSentence.emplace_back(std::move(word), std::move(code));
818 if (!newSentence.empty()) {
819 d->model_.history().addWithCode(newSentence);
824 learnAutoPhrase(history, {});
// Tail of the signature of the learnAutoPhrase overload that takes hints (the
// first signature line is missing from this listing).
// Visible flow: requires learning to be enabled, `history` to be valid UTF-8
// and autoPhraseLength() > 1. It then walks `history` character by character;
// for the suffix starting at each character it generates a code with the
// matching tail of `hints` and inserts the (code, word) pair into the
// dictionary with PhraseFlag::Auto.
// NOTE(review): the declarations of `i`, `word` and `code`, and the bodies of
// several branches, are elided.
828 const std::vector<std::string> &hints) {
830 if (!d->dict_.tableOptions().learning() ||
831 !fcitx::utf8::validate(history) ||
832 d->dict_.tableOptions().autoPhraseLength() <= 1) {
836 auto range = fcitx::utf8::MakeUTF8CharRange(history);
839 std::vector<std::string> currentHints;
841 for (
auto iter = std::begin(range); iter != std::end(range); iter++, i++) {
842 auto charBegin = iter.charRange();
// Number of characters from the current position to the end of `history`.
843 auto length = fcitx::utf8::length(charBegin.first, history.end());
// Suffixes longer than autoPhraseLength() take a branch whose body is elided.
845 length > static_cast<size_t>(
846 d->dict_.tableOptions().autoPhraseLength())) {
// Suffix of `history` beginning at the current character.
851 history.substr(std::distance(history.begin(), charBegin.first));
// Hints from position i onwards; empty when fewer than i + 1 hints exist.
852 auto begin = hints.end();
853 if (hints.size() > i) {
854 begin = std::next(hints.begin(), i);
856 currentHints.assign(begin, hints.end());
857 if (!d->dict_.generateWithHint(word, currentHints, code)) {
// Words that already exist with flag None or User take a branch whose body
// is elided.
860 auto wordFlag = d->dict_.wordExists(code, word);
861 if (wordFlag == PhraseFlag::None || wordFlag == PhraseFlag::User) {
864 auto insertResult = d->dict_.insert(code, word, PhraseFlag::Auto);
865 LIBIME_TABLE_DEBUG() <<
"learnAutoPhrase " << word <<
" " << code
866 <<
" AutoPhraseLength: " 867 << d->dict_.tableOptions().autoPhraseLength()
868 <<
" success: " << insertResult;
// Builds the hint string shown next to candidate `idx`. Only single-node
// candidates with a non-empty word are handled in the visible code.
// NOTE(review): the declaration of `node`, the `custom` checks and the
// fall-through return are elided in this listing.
872 std::string TableContext::candidateHint(
size_t idx,
bool custom)
const {
874 if (d->candidates_[idx].sentence().size() == 1) {
875 const auto *p = d->candidates_[idx].sentence()[0];
876 if (!p->word().empty()) {
// Pinyin candidate: for a single-character word the hint comes from the
// word's reverse-lookup code.
878 if (node->flag() == PhraseFlag::Pinyin) {
879 if (fcitx::utf8::length(p->word()) == 1) {
880 auto code = d->dict_.reverseLookup(node->word());
882 return d->dict_.hint(code);
// Table candidate: start from the node's full code.
887 std::string_view code = node->code();
888 auto matchingKey = d->dict_.tableOptions().matchingKey();
// When no matching key is configured, or the current code does not contain
// it, the already-typed prefix is dropped from the code.
// NOTE(review): string_view::remove_prefix requires
// currentCode().size() <= code.size(); no such check is visible here --
// confirm that the node's code is always at least as long as the typed code.
891 if (!matchingKey || (currentCode().find(fcitx::utf8::UCS4ToUTF8(
892 matchingKey)) == std::string::npos)) {
893 code.remove_prefix(currentCode().size());
896 return d->dict_.hint(code);
898 return std::string{code};
906 if (sentence.size() == 1) {
915 if (sentence.size() == 1) {
920 return PhraseFlag::Auto;
924 return sentence.size() == 1 && flag(sentence) == PhraseFlag::Pinyin;
928 return sentence.size() != 1 || flag(sentence) == PhraseFlag::Auto;
Class providing input method support for table-based input methods, such as Wubi.
const std::string & currentCode() const
Current unselected code.
void learn()
Save the current selected text.
void setAutoSelectIndex(size_t index)
Set the auto-select index; this is usually the candidate cursor index.
Input context for table input method.
void learnLast()
Save the last selected text.
std::string selectedSentence() const
The concatenation of every selectedSegment() result whose bool is true.
void learnAutoPhrase(std::string_view history)
Learn auto words from a string.
std::string preedit() const
A simple preedit implementation.