libime
pinyincontext.cpp
1 /*
2  * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  */
6 #include "pinyincontext.h"
7 #include <algorithm>
8 #include <cassert>
9 #include <cstddef>
10 #include <functional>
11 #include <iterator>
12 #include <limits>
13 #include <list>
14 #include <memory>
15 #include <span>
16 #include <stdexcept>
17 #include <string>
18 #include <string_view>
19 #include <tuple>
20 #include <unordered_map>
21 #include <unordered_set>
22 #include <utility>
23 #include <vector>
24 #include <boost/container_hash/hash.hpp>
25 #include <fcitx-utils/charutils.h>
26 #include <fcitx-utils/inputbuffer.h>
27 #include <fcitx-utils/keysym.h>
28 #include <fcitx-utils/macros.h>
29 #include <fcitx-utils/signals.h>
30 #include <fcitx-utils/stringutils.h>
31 #include <fcitx-utils/utf8.h>
32 #include "libime/core/historybigram.h"
33 #include "libime/core/inputbuffer.h"
34 #include "libime/core/languagemodel.h"
35 #include "libime/core/lattice.h"
36 #include "libime/core/segmentgraph.h"
37 #include "libime/core/userlanguagemodel.h"
38 #include "libime/pinyin/constants.h"
39 #include "pinyindecoder.h"
40 #include "pinyindecoder_p.h"
41 #include "pinyinencoder.h"
42 #include "pinyinime.h"
43 #include "pinyinmatchstate.h"
44 
45 namespace libime {
46 
47 namespace {
/// Outcome of trying to learn the current selection as a single word.
enum class LearnWordResult {
    /// The word consists entirely of regular words from the dictionary.
    Normal,
    /// The word contains a custom word (e.g. a symbol replacement).
    Custom,
    /// The selection was not learned as a word.
    Ignored,
};
53 
/// How a committed selection item was produced.
enum class SelectedPinyinType {
    /// Picked from a regular candidate.
    Normal,
    /// Supplied through a custom selection.
    Custom,
    /// Placeholder covering trailing pinyin separators.
    Separator,
};
59 
60 struct SelectedPinyin {
61  SelectedPinyin(size_t s, PinyinWordNode word, SelectedPinyinType type)
62  : offset_(s), word_(std::move(word)), type_(type) {}
63 
64  const std::string &encodedPinyin() const { return word_.encodedPinyin(); }
65 
66  size_t offset_;
67  PinyinWordNode word_;
68  SelectedPinyinType type_;
69 };
70 
/// Identity of a candidate for de-duplication: its text together with the
/// segment index at which it ends.
struct CandidateDedupKey {
    std::string text_;
    size_t end_ = 0;

    /// Two keys are equal when both the end index and the text agree.
    bool operator==(const CandidateDedupKey &other) const {
        if (end_ != other.end_) {
            return false;
        }
        return text_ == other.text_;
    }
};
79 
80 struct CandidateDedupKeyHash {
81  size_t operator()(const CandidateDedupKey &key) const {
82  size_t seed = std::hash<std::string>()(key.text_);
83  boost::hash_combine(seed, key.end_);
84  return seed;
85  }
86 };
87 
88 CandidateDedupKey candidateDedupKey(const SentenceResult &candidate) {
89  return {.text_ = candidate.toString(),
90  .end_ = candidate.sentence().empty()
91  ? 0
92  : candidate.sentence().back()->to()->index()};
93 }
94 
95 } // namespace
96 
97 class PinyinContextPrivate : public fcitx::QPtrHolder<PinyinContext> {
98 public:
100  : QPtrHolder(q), ime_(ime), matchState_(q) {}
101 
102  std::vector<std::vector<SelectedPinyin>> selected_;
103 
104  bool sp_ = false;
105  int maxSentenceLength_ = -1;
106  PinyinIME *ime_;
107  SegmentGraph segs_;
108  Lattice lattice_;
109  PinyinMatchState matchState_;
110  std::vector<SentenceResult> candidates_;
111  std::unordered_set<std::string> candidatesSet_;
112  mutable bool candidatesToCursorNeedUpdate_ = true;
113  mutable std::vector<SentenceResult> candidatesToCursor_;
114  mutable std::unordered_set<std::string> candidatesToCursorSet_;
115  std::vector<fcitx::ScopedConnection> conn_;
116  std::list<PinyinWordNode> contextWords_;
117 
118  size_t alignCursorToNextSegment() const {
119  FCITX_Q();
120  auto currentCursor = q->cursor();
121  auto start = q->selectedLength();
122  if (currentCursor < start) {
123  return start;
124  }
125  while (segs_.nodes(currentCursor - start).empty() &&
126  currentCursor < q->size()) {
127  currentCursor += 1;
128  }
129  return currentCursor;
130  }
131 
132  bool needCandidatesToCursor() const {
133  FCITX_Q();
134  if (q->cursor() == q->selectedLength()) {
135  return false;
136  }
137 
138  return alignCursorToNextSegment() != q->size();
139  }
140 
141  void clearCandidates() {
142  candidates_.clear();
143  candidatesToCursor_.clear();
144  candidatesToCursorNeedUpdate_ = false;
145  candidatesSet_.clear();
146  candidatesToCursorSet_.clear();
147  }
148 
149  void updateCandidatesToCursor() const {
150  FCITX_Q();
151  if (!candidatesToCursorNeedUpdate_) {
152  return;
153  }
154  candidatesToCursorNeedUpdate_ = false;
155  candidatesToCursor_.clear();
156  candidatesToCursorSet_.clear();
157 
158  std::unordered_map<CandidateDedupKey, size_t, CandidateDedupKeyHash>
159  duplicateCandidates;
160  auto insertCandidate = [this, &duplicateCandidates](
161  SentenceResult candidate) {
162  auto key = candidateDedupKey(candidate);
163  auto iter = duplicateCandidates.find(key);
164  if (iter != duplicateCandidates.end()) {
165  auto &oldCandidate = candidatesToCursor_[iter->second];
166  if (candidate.score() > oldCandidate.score()) {
167  oldCandidate = std::move(candidate);
168  }
169  return;
170  }
171 
172  candidatesToCursor_.push_back(std::move(candidate));
173  duplicateCandidates.emplace(key, candidatesToCursor_.size() - 1);
174  candidatesToCursorSet_.insert(std::move(key.text_));
175  };
176 
177  auto start = q->selectedLength();
178  auto currentCursor = alignCursorToNextSegment();
179  // Poke best sentence from lattice, ignore nbest option for now.
180  auto nodeRange = lattice_.nodes(&segs_.node(currentCursor - start));
181  if (!nodeRange.empty()) {
182  insertCandidate(nodeRange.front().toSentenceResult());
183  }
184  for (const auto &candidate : candidates_) {
185  const auto &sentence = candidate.sentence();
186  if (sentence.size() == 1) {
187  if (sentence.back()->to()->index() + start > currentCursor) {
188  continue;
189  }
190  insertCandidate(candidate);
191  } else if (sentence.size() > 1) {
192  auto newSentence = sentence;
193  while (!newSentence.empty() &&
194  newSentence.back()->to()->index() + start >
195  currentCursor) {
196  newSentence.pop_back();
197  }
198  if (!newSentence.empty()) {
199  SentenceResult partial(newSentence,
200  newSentence.back()->score());
201  insertCandidate(std::move(partial));
202  }
203  }
204  }
205  }
206 
207  template <typename FillSentence>
208  void selectHelper(const FillSentence &fillSentence) {
209  FCITX_Q();
210  selected_.emplace_back();
211 
212  auto &selection = selected_.back();
213  fillSentence(selection);
214  // add some special code for handling separator at the end
215  auto remain =
216  std::string_view(q->userInput()).substr(q->selectedLength());
217  if (!remain.empty()) {
218  if (std::all_of(remain.begin(), remain.end(),
219  [](char c) { return c == '\''; })) {
220  selection.emplace_back(q->size(), PinyinWordNode({}, 0),
221  SelectedPinyinType::Separator);
222  }
223  }
224 
225  q->update();
226  }
227 
228  void select(const SentenceResult &sentence) {
229  FCITX_Q();
230  auto offset = q->selectedLength();
231  selectHelper([offset, &sentence,
232  this](std::vector<SelectedPinyin> &selection) {
233  for (const auto &p : sentence.sentence()) {
234  selection.emplace_back(
235  offset + p->to()->index(),
236  PinyinWordNode{
237  {p->word(), p->as<PinyinLatticeNode>().encodedPinyin()},
238  ime_->model()->index(p->word())},
239 
240  SelectedPinyinType::Normal);
241  }
242  });
243  }
244 
245  void selectCustom(size_t inputLength, std::string_view segment,
246  std::string_view encodedPinyin) {
247  FCITX_Q();
248  auto offset = q->selectedLength();
249  selectHelper([this, offset, &segment, inputLength,
250  &encodedPinyin](std::vector<SelectedPinyin> &selection) {
251  auto index = ime_->model()->index(segment);
252  selection.emplace_back(
253  offset + inputLength,
254  PinyinWordNode{{segment, encodedPinyin}, index},
255  SelectedPinyinType::Custom);
256  });
257  }
258 
259  std::tuple<LearnWordResult, std::string> learnWord() {
260  std::string ss;
261  std::string pinyin;
262  if (selected_.empty()) {
263  return {LearnWordResult::Ignored, ""};
264  }
265  // don't learn existing word.
266  if (selected_.size() == 1 && selected_[0].size() == 1) {
267  return {LearnWordResult::Ignored, ""};
268  }
269  // Validate the learning word.
270  // All single || custom || length <= 4
271  bool hasCustom = false;
272  size_t totalPinyinLength = 0;
273  bool isAllSingleWord = true;
274  for (auto &s : selected_) {
275  isAllSingleWord =
276  isAllSingleWord &&
277  (s.empty() || (s.size() == 1 &&
278  (s[0].type_ == SelectedPinyinType::Separator ||
279  s[0].encodedPinyin().size() == 2)));
280  for (auto &item : s) {
281  if (item.type_ == SelectedPinyinType::Separator) {
282  continue;
283  }
284  if (item.type_ == SelectedPinyinType::Custom) {
285  hasCustom = true;
286  }
287  // We can't learn non pinyin word.
288  if (item.encodedPinyin().empty() ||
289  item.encodedPinyin().size() % 2 != 0) {
290  return {LearnWordResult::Ignored, ""};
291  }
292  totalPinyinLength += item.encodedPinyin().size() / 2;
293  }
294  }
295 
296  FCITX_Q();
297  if (!hasCustom) {
298  if ((!isAllSingleWord && totalPinyinLength > 4)) {
299  return {LearnWordResult::Ignored, ""};
300  }
301  if (ime_->model()->containsNonUnigram(q->selectedWords())) {
302  return {LearnWordResult::Ignored, ""};
303  }
304  }
305 
306  for (auto &s : selected_) {
307  for (auto &item : s) {
308  if (item.type_ == SelectedPinyinType::Separator) {
309  continue;
310  }
311  assert(!item.encodedPinyin().empty());
312  assert(item.encodedPinyin().size() % 2 == 0);
313  ss += item.word_.word();
314  if (!pinyin.empty()) {
315  pinyin.push_back('\'');
316  }
317  pinyin += PinyinEncoder::decodeFullPinyin(item.encodedPinyin());
318  }
319  }
320 
321  if (auto opt = ime_->dict()->lookupWord(PinyinDictionary::UserDict,
322  pinyin, ss)) {
323  return {LearnWordResult::Ignored, ""};
324  }
325 
326  ime_->dict()->addWord(PinyinDictionary::UserDict, pinyin, ss,
327  hasCustom ? -1 : 0);
328 
329  auto encodedPinyin = PinyinEncoder::encodeFullPinyinWithFlags(
330  pinyin, PinyinFuzzyFlag::VE_UE);
331 
332  return {hasCustom ? LearnWordResult::Custom : LearnWordResult::Normal,
333  std::string(encodedPinyin.data(), encodedPinyin.size())};
334  }
335 };
336 
337 void matchPinyinCase(std::string_view ref, std::string &actualPinyin) {
338  if (ref.size() != fcitx::utf8::length(actualPinyin)) {
339  return;
340  }
341 
342  auto iter = fcitx::utf8::MakeUTF8CharIterator(actualPinyin.begin(),
343  actualPinyin.end());
344  for (size_t i = 0; i < ref.size(); ++i, ++iter) {
345  if (fcitx::charutils::isupper(ref[i])) {
346  auto charRange = iter.charRange();
347  if (iter.charLength() == 1 &&
348  fcitx::charutils::islower(iter.view()[0])) {
349  *charRange.first = fcitx::charutils::toupper(*charRange.first);
350  } else if (*iter == 0x00fc) {
351  *charRange.first = 0xc3;
352  *std::next(charRange.first) = 0x9c;
353  }
354  }
355  }
356 }
357 
358 PinyinContext::PinyinContext(PinyinIME *ime)
359  : InputBuffer(fcitx::InputBufferOption::AsciiOnly),
360  d_ptr(std::make_unique<PinyinContextPrivate>(this, ime)) {
361  FCITX_D();
362  d->conn_.emplace_back(
363  ime->connect<PinyinIME::optionChanged>([this]() { clear(); }));
364  d->conn_.emplace_back(
365  ime->dict()->connect<PinyinDictionary::dictionaryChanged>(
366  [this](size_t) {
367  FCITX_D();
368  d->matchState_.clear();
369  }));
370 }
371 
372 PinyinContext::~PinyinContext() {}
373 
374 void PinyinContext::setUseShuangpin(bool sp) {
375  FCITX_D();
376  d->sp_ = sp;
377  d->matchState_.clear();
378 }
379 
380 bool PinyinContext::useShuangpin() const {
381  FCITX_D();
382  return d->sp_;
383 }
384 
385 void PinyinContext::setMaxSentenceLength(int length) {
386  FCITX_D();
387  d->maxSentenceLength_ = length;
388  d->matchState_.clear();
389 }
390 
391 int PinyinContext::maxSentenceLength() const {
392  FCITX_D();
393  return d->maxSentenceLength_;
394 }
395 
396 bool PinyinContext::typeImpl(const char *s, size_t length) {
397  FCITX_D();
398  if (d->maxSentenceLength_ > 0 && !d->candidates_.empty()) {
399  auto size = 0;
400  for (const auto &s : d->candidates_[0].sentence()) {
401  // This is pinyin length + 1
402  auto segLength = s->path().size();
403  size +=
404  std::max(static_cast<decltype(segLength)>(1), segLength) - 1;
405  }
406  if (size > d->maxSentenceLength_) {
407  return false;
408  }
409  }
410  auto changed = cancelTill(cursor());
411  changed = InputBuffer::typeImpl(s, length) || changed;
412  if (changed) {
413  update();
414  }
415  return changed;
416 }
417 
418 void PinyinContext::erase(size_t from, size_t to) {
419  if (from == to) {
420  return;
421  }
422 
423  // check if erase everything
424  if (from == 0 && to >= size()) {
425  FCITX_D();
426  d->clearCandidates();
427  d->selected_.clear();
428  d->lattice_.clear();
429  d->matchState_.clear();
430  d->segs_ = SegmentGraph();
431  } else {
432  cancelTill(from);
433  }
434  InputBuffer::erase(from, to);
435 
436  if (!empty()) {
437  update();
438  }
439 }
440 
441 void PinyinContext::setCursor(size_t pos) {
442  FCITX_D();
443  auto oldCursor = cursor();
444  auto cancelled = cancelTill(pos);
445  InputBuffer::setCursor(pos);
446  if (cancelled) {
447  update();
448  } else {
449  if (cursor() != oldCursor) {
450  d->candidatesToCursorNeedUpdate_ = true;
451  }
452  }
453 }
454 
456  FCITX_D();
457  auto len = selectedLength();
458  auto c = cursor();
459  if (c < len) {
460  return -1;
461  }
462  c -= len;
463  if (!d->candidates_.empty()) {
464  for (const auto &s : d->candidates_[0].sentence()) {
465  for (auto iter = s->path().begin(),
466  end = std::prev(s->path().end());
467  iter < end; iter++) {
468  auto from = (*iter)->index();
469  auto to = (*std::next(iter))->index();
470  if (to >= c) {
471  return from + len;
472  }
473  }
474  }
475  }
476  return -1;
477 }
478 
480  FCITX_D();
481  auto len = selectedLength();
482  auto c = cursor();
483  if (c < len) {
484  return -1;
485  }
486  c -= len;
487  if (!d->candidates_.empty()) {
488  for (const auto &s : d->candidates_[0].sentence()) {
489  for (auto iter = s->path().begin(),
490  end = std::prev(s->path().end());
491  iter < end; iter++) {
492  auto to = (*std::next(iter))->index();
493  if (to > c) {
494  return to + len;
495  }
496  }
497  }
498  }
499  return -1;
500 }
501 
502 const std::vector<SentenceResult> &PinyinContext::candidates() const {
503  FCITX_D();
504  return d->candidates_;
505 }
506 
507 const std::unordered_set<std::string> &PinyinContext::candidateSet() const {
508  FCITX_D();
509  return d->candidatesSet_;
510 }
511 
512 const std::vector<SentenceResult> &PinyinContext::candidatesToCursor() const {
513  FCITX_D();
514  if (!d->needCandidatesToCursor()) {
515  return d->candidates_;
516  }
517  d->updateCandidatesToCursor();
518  return d->candidatesToCursor_;
519 }
520 
521 const std::unordered_set<std::string> &
523  FCITX_D();
524  if (!d->needCandidatesToCursor()) {
525  return d->candidatesSet_;
526  }
527  d->updateCandidatesToCursor();
528  return d->candidatesToCursorSet_;
529 }
530 
531 void PinyinContext::select(size_t idx) {
532  FCITX_D();
533  const auto &candidates = this->candidates();
534  assert(idx < candidates.size());
535  d->select(candidates[idx]);
536 }
537 
538 void PinyinContext::selectCandidatesToCursor(size_t idx) {
539  FCITX_D();
540  const auto &candidates = this->candidatesToCursor();
541  assert(idx < candidates.size());
542  d->select(candidates[idx]);
543 }
544 
545 void PinyinContext::selectCustom(size_t inputLength, std::string_view segment,
546  std::string_view encodedPinyin) {
547  FCITX_D();
548  if (inputLength == 0 && selectedLength() + inputLength > size()) {
549  throw std::out_of_range("Invalid input length");
550  }
551  if (encodedPinyin.size() % 2 != 0) {
552  throw std::invalid_argument("Invalid encoded pinyin");
553  }
554  d->selectCustom(inputLength, segment, encodedPinyin);
555 }
556 
557 bool PinyinContext::cancelTill(size_t pos) {
558  bool cancelled = false;
559  while (selectedLength() > pos) {
560  cancel();
561  cancelled = true;
562  }
563  return cancelled;
564 }
565 
566 void PinyinContext::cancel() {
567  FCITX_D();
568  if (!d->selected_.empty()) {
569  d->selected_.pop_back();
570 
571  // There is no point to reuse any existing matching state.
572  // For example, cancel from (tao zi) tao zi to tao zi tao zi.
573  // Even if they share the same prefix, the start state won't be same.
574  d->lattice_.clear();
575  d->matchState_.clear();
576  d->segs_ = SegmentGraph();
577  }
578  update();
579 }
580 
581 State PinyinContext::state() const {
582  FCITX_D();
583  auto *model = d->ime_->model();
584  State state = model->nullState();
585  for (const auto &word : d->contextWords_) {
586  State temp;
587  model->score(state, word, temp);
588  state = std::move(temp);
589  }
590  for (const auto &s : d->selected_) {
591  for (const auto &item : s) {
592  if (item.word_.word().empty()) {
593  continue;
594  }
595  State temp;
596  model->score(state, item.word_, temp);
597  state = std::move(temp);
598  }
599  }
600  return state;
601 }
602 
603 void PinyinContext::update() {
604  FCITX_D();
605  if (empty()) {
606  clear();
607  return;
608  }
609 
610  if (selected()) {
611  d->clearCandidates();
612  } else {
613  size_t start = 0;
614  State state = this->state();
615  if (!d->selected_.empty()) {
616  start = d->selected_.back().back().offset_;
617  }
618  SegmentGraph newGraph;
619  if (auto spProfile = d->matchState_.shuangpinProfile()) {
620  newGraph = PinyinEncoder::parseUserShuangpin(
621  userInput().substr(start), *spProfile, d->ime_->fuzzyFlags());
622  } else {
623  newGraph = PinyinEncoder::parseUserPinyin(
624  userInput().substr(start), d->ime_->correctionProfile().get(),
625  d->ime_->fuzzyFlags());
626  }
627  d->segs_.merge(
628  newGraph,
629  [d](const std::unordered_set<const SegmentGraphNode *> &nodes) {
630  d->lattice_.discardNode(nodes);
631  d->matchState_.discardNode(nodes);
632  });
633  auto &graph = d->segs_;
634 
635  d->ime_->decoder()->decode(d->lattice_, d->segs_, d->ime_->nbest(),
636  state, d->ime_->maxDistance(),
637  d->ime_->minPath(), d->ime_->beamSize(),
638  d->ime_->frameSize(), &d->matchState_);
639 
640  d->clearCandidates();
641 
642  // Add n-best result.
643  for (size_t i = 0, e = d->lattice_.sentenceSize(); i < e; i++) {
644  d->candidates_.push_back(d->lattice_.sentence(i));
645  }
646 
647  const auto *bos = &graph.start();
648 
649  auto beginSize = d->candidates_.size();
650  for (size_t i = graph.size(); i > 0; i--) {
651  float min = 0;
652  float max = -std::numeric_limits<float>::max();
653  auto distancePenalty = d->ime_->model()->unknownPenalty() /
654  PINYIN_DISTANCE_PENALTY_FACTOR;
655 
656  // Enumerate over all the lattice node, if from == bos, this is
657  // a dictionary word match.
658  // Add all words that does not contain pinyin correction.
659  for (const auto &graphNode : graph.nodes(i)) {
660  auto distance = graph.distanceToEnd(graphNode);
661  auto adjust = static_cast<float>(distance) * distancePenalty;
662  for (const auto &latticeNode : d->lattice_.nodes(&graphNode)) {
663  if (latticeNode.from() == bos &&
664  !static_cast<const PinyinLatticeNode &>(latticeNode)
665  .isCorrection()) {
666  if (!d->ime_->model()->isNodeUnknown(latticeNode)) {
667  min = std::min(latticeNode.score(), min);
668  max = std::max(latticeNode.score(), max);
669  }
670  d->candidates_.push_back(
671  latticeNode.toSentenceResult(adjust));
672  }
673  }
674  }
675 
676  // Filter correction word based on score
677  for (const auto &graphNode : graph.nodes(i)) {
678  auto distance = graph.distanceToEnd(graphNode);
679  auto adjust = static_cast<float>(distance) * distancePenalty;
680  for (const auto &latticeNode : d->lattice_.nodes(&graphNode)) {
681  if (latticeNode.from() == bos &&
682  static_cast<const PinyinLatticeNode &>(latticeNode)
683  .isCorrection()) {
684  if ((latticeNode.score() > min &&
685  latticeNode.score() + d->ime_->maxDistance() >
686  max) ||
687  static_cast<const PinyinLatticeNode &>(latticeNode)
688  .encodedPinyin()
689  .size() <= 2) {
690  d->candidates_.push_back(
691  latticeNode.toSentenceResult(adjust));
692  }
693  }
694  }
695  }
696 
697  // This part is the phrase that's constructable from lattice.
698  for (const auto &graphNode : graph.nodes(i)) {
699  auto distance = graph.distanceToEnd(graphNode);
700  auto adjust = static_cast<float>(distance) * distancePenalty;
701  for (const auto &latticeNode : d->lattice_.nodes(&graphNode)) {
702  if (latticeNode.from() != bos &&
703  latticeNode.score() > min &&
704  latticeNode.score() + d->ime_->maxDistance() > max &&
705  !static_cast<const PinyinLatticeNode &>(latticeNode)
706  .anyCorrectionOnPath()) {
707  d->candidates_.push_back(
708  latticeNode.toSentenceResult(adjust));
709  }
710  }
711  }
712  }
713  std::sort(d->candidates_.begin() + beginSize, d->candidates_.end(),
714  std::greater<>());
715  {
716  size_t index = 0;
717  size_t count = 0;
718  const auto limit = d->ime_->wordCandidateLimit();
719  std::unordered_set<CandidateDedupKey, CandidateDedupKeyHash>
720  duplicateCandidates;
721  auto &candidatesSet = d->candidatesSet_;
722  candidatesSet.clear();
723  std::erase_if(d->candidates_,
724  [&candidatesSet, &duplicateCandidates, &index, &count,
725  beginSize, limit](const SentenceResult &candidate) {
726  bool beforeBeginSize = index++ < beginSize;
727  auto key = candidateDedupKey(candidate);
728  if (duplicateCandidates.contains(key)) {
729  return true;
730  }
731 
732  if (!beforeBeginSize && limit) {
733  const bool isSinglePinyinWord =
734  candidate.sentence().size() == 1 &&
735  candidate.sentence()
736  .front()
737  ->as<PinyinLatticeNode>()
738  .encodedPinyin()
739  .size() == 2;
740  if (!isSinglePinyinWord) {
741  if (count >= limit) {
742  return true;
743  }
744  count++;
745  }
746  }
747 
748  candidatesSet.insert(key.text_);
749  duplicateCandidates.insert(std::move(key));
750  return false;
751  });
752  }
753 
754  d->candidatesToCursorNeedUpdate_ = true;
755  }
756 
757  if (cursor() < selectedLength()) {
758  setCursor(selectedLength());
759  }
760 }
761 
763  FCITX_D();
764  if (userInput().empty()) {
765  return false;
766  }
767 
768  if (!d->selected_.empty()) {
769  if (d->selected_.back().back().offset_ == size()) {
770  return true;
771  }
772  }
773 
774  return false;
775 }
776 
777 std::string PinyinContext::selectedSentence() const {
778  FCITX_D();
779  std::string ss;
780  for (const auto &s : d->selected_) {
781  for (const auto &item : s) {
782  ss += item.word_.word();
783  }
784  }
785  return ss;
786 }
787 
789  FCITX_D();
790  if (!d->selected_.empty()) {
791  return d->selected_.back().back().offset_;
792  }
793  return 0;
794 }
795 
796 std::string PinyinContext::preedit() const {
797  return preedit(ime()->preeditMode());
798 }
799 
800 std::pair<std::string, size_t> PinyinContext::preeditWithCursor() const {
801  return preeditWithCursor(ime()->preeditMode());
802 }
803 
804 std::string PinyinContext::preedit(PinyinPreeditMode mode) const {
805  return preeditWithCursor(mode).first;
806 }
807 
808 std::pair<std::string, size_t>
809 PinyinContext::preeditWithCursor(PinyinPreeditMode mode) const {
810  FCITX_D();
811  std::string ss = selectedSentence();
812  const auto len = selectedLength();
813  auto c = cursor();
814  size_t actualCursor = ss.size();
815  // should not happen
816  c = std::max(c, len);
817 
818  auto resultSize = ss.size();
819 
820  if (!d->candidates_.empty()) {
821  bool first = true;
822  for (const auto &node : d->candidates_[0].sentence()) {
823  for (auto iter = node->path().begin(),
824  end = std::prev(node->path().end());
825  iter < end; iter++) {
826  if (!first) {
827  ss += " ";
828  resultSize += 1;
829  } else {
830  first = false;
831  }
832  auto from = (*iter)->index();
833  auto to = (*std::next(iter))->index();
834  size_t cursorInPinyin = c - from - len;
835  const size_t startPivot = resultSize;
836  auto pinyin = d->segs_.segment(from, to);
837  MatchedPinyinSyllables syls;
838  if (mode == PinyinPreeditMode::Pinyin) {
839  // The reason that we don't use fuzzy flag from option is
840  // that we'd like to keep the preedit as is. Otherwise,
841  // "qign" would be displayed as "qing", which would be
842  // confusing to user about what is actually being typed.
843  syls = useShuangpin()
844  ? PinyinEncoder::shuangpinToSyllables(
845  pinyin, *ime()->shuangpinProfile(),
846  PinyinFuzzyFlag::None)
847  : PinyinEncoder::stringToSyllables(
848  pinyin, PinyinFuzzyFlag::None);
849  }
850  std::string actualPinyin;
851  if (!syls.empty() && !syls.front().second.empty()) {
852  std::string_view candidatePinyin =
853  node->as<PinyinLatticeNode>().encodedPinyin();
854  auto nthPinyin = std::distance(node->path().begin(), iter);
855  PinyinInitial bestInitial = syls[0].first;
856  PinyinFinal bestFinal = syls[0].second[0].first;
857 
858  // Try to match the candidate syllables from all possible
859  // none-fuzzy possible syls.
860  if (static_cast<size_t>((nthPinyin * 2) + 2) <=
861  candidatePinyin.size()) {
862  auto candidateInitial = static_cast<PinyinInitial>(
863  candidatePinyin[nthPinyin * 2]);
864  auto candidateFinal = static_cast<PinyinFinal>(
865  candidatePinyin[(nthPinyin * 2) + 1]);
866 
867  bool found = false;
868  for (const auto &initial : syls) {
869  for (const auto &[final, fuzzy] : initial.second) {
870  if (fuzzy) {
871  continue;
872  }
873  if (candidateInitial == initial.first &&
874  (final == PinyinFinal::Invalid ||
875  candidateFinal == final)) {
876  bestInitial = initial.first;
877  if (final != PinyinFinal::Invalid) {
878  bestFinal = final;
879  }
880  found = true;
881  break;
882  }
883  }
884  if (found) {
885  break;
886  }
887  }
888  }
889 
890  actualPinyin = PinyinEncoder::initialFinalToPinyinString(
891  bestInitial, bestFinal);
892  if (!useShuangpin()) {
893  matchPinyinCase(pinyin, actualPinyin);
894  }
895  }
896  if (!actualPinyin.empty()) {
897  if (c > from + len && c <= to + len) {
898  if (useShuangpin()) {
899  switch (cursorInPinyin) {
900  case 0:
901  break;
902  case 1:
903  if (pinyin.size() == 2 &&
904  syls[0].first == PinyinInitial::Zero) {
905  actualPinyin = fcitx::stringutils::concat(
906  "_", actualPinyin);
907  }
908  // Zero case, we just append one.
909  if (syls[0].first != PinyinInitial::Zero) {
910  cursorInPinyin =
911  PinyinEncoder::initialToString(
912  syls[0].first)
913  .size();
914  }
915  break;
916  default:
917  cursorInPinyin = actualPinyin.size();
918  break;
919  }
920  } else {
921  cursorInPinyin =
922  std::min(actualPinyin.size(), cursorInPinyin);
923  cursorInPinyin = fcitx::utf8::ncharByteLength(
924  actualPinyin.begin(), cursorInPinyin);
925  }
926  }
927  ss.append(actualPinyin);
928  resultSize += actualPinyin.size();
929  } else {
930  ss.append(pinyin.data(), pinyin.size());
931  resultSize += pinyin.size();
932  }
933  if (c > from + len && c <= to + len) {
934  actualCursor = startPivot + cursorInPinyin;
935  }
936  }
937  }
938  }
939  if (c == size()) {
940  actualCursor = resultSize;
941  }
942  return {ss, actualCursor};
943 }
944 
945 std::vector<std::string> PinyinContext::selectedWords() const {
946  FCITX_D();
947  std::vector<std::string> newSentence;
948  for (const auto &s : d->selected_) {
949  for (const auto &item : s) {
950  if (item.type_ != SelectedPinyinType::Separator) {
951  newSentence.push_back(item.word_.word());
952  }
953  }
954  }
955  return newSentence;
956 }
957 
958 std::vector<HistoryBigram::WordWithCode>
960  FCITX_D();
961  std::vector<HistoryBigram::WordWithCode> newSentence;
962  for (const auto &s : d->selected_) {
963  for (const auto &item : s) {
964  if (item.type_ != SelectedPinyinType::Separator) {
965  newSentence.emplace_back(item.word_.word(),
966  item.encodedPinyin());
967  }
968  }
969  }
970  return newSentence;
971 }
972 
974  FCITX_D();
975  std::string pinyin;
976  for (const auto &s : d->selected_) {
977  for (const auto &item : s) {
978  if (!item.encodedPinyin().empty()) {
979  if (!pinyin.empty()) {
980  pinyin.push_back('\'');
981  }
982  pinyin += PinyinEncoder::decodeFullPinyin(item.encodedPinyin());
983  }
984  }
985  }
986  return pinyin;
987 }
988 
989 std::string PinyinContext::candidateFullPinyin(size_t idx) const {
990  FCITX_D();
991  return candidateFullPinyin(d->candidates_[idx]);
992 }
993 
994 std::string
996  std::string pinyin;
997  for (const auto &node : candidate.sentence()) {
998  if (!node->as<PinyinLatticeNode>().encodedPinyin().empty()) {
999  if (!pinyin.empty()) {
1000  pinyin.push_back('\'');
1001  }
1002  pinyin += PinyinEncoder::decodeFullPinyin(
1003  node->as<PinyinLatticeNode>().encodedPinyin());
1004  }
1005  }
1006  return pinyin;
1007 }
1008 
1010  FCITX_D();
1011  if (!selected()) {
1012  return;
1013  }
1014 
1015  std::vector<HistoryBigram::WordWithCode> newSentence;
1016  if (auto [result, encodedWordPinyin] = d->learnWord();
1017  result != LearnWordResult::Ignored) {
1018  // Do not insert custom to history for the first time.
1019  if (result == LearnWordResult::Normal) {
1020  // Create new sentence with the whole new learned word.
1021  newSentence.push_back({sentence(), encodedWordPinyin});
1022  } else {
1023  return;
1024  }
1025  } else {
1026  newSentence = selectedWordsWithPinyin();
1027  }
1028 
1029  if (std::ranges::any_of(newSentence, [](const auto &word) {
1030  return word.second.empty();
1031  })) {
1032  // Don't add to history if there is any non-pinyin word.
1033  return;
1034  }
1035 
1036  auto context = contextWordsWithPinyin();
1037  d->ime_->model()->history().addWithContext(contextWordsWithPinyin(),
1038  std::move(newSentence));
1039 }
1040 
1042  const std::vector<std::string> &contextWords) {
1043  FCITX_D();
1044  d->contextWords_.clear();
1045  appendContextWords(contextWords);
1046 }
1047 
1049  FCITX_D();
1050  d->contextWords_.clear();
1051 }
1052 
1054  const std::vector<std::string> &contextWords) {
1055  FCITX_D();
1056 
1057  size_t needed = LanguageModel::maxOrder() - 1;
1058 
1059  for (const auto &word :
1060  std::span{contextWords}.last(std::min(contextWords.size(), needed))) {
1061  d->contextWords_.push_back(
1062  PinyinWordNode({word, ""}, d->ime_->model()->index(word)));
1063  }
1064  while (d->contextWords_.size() > needed) {
1065  d->contextWords_.pop_front();
1066  }
1067 }
1068 
1069 std::vector<std::string> PinyinContext::contextWords() const {
1070  FCITX_D();
1071  std::vector<std::string> words;
1072  words.reserve(d->contextWords_.size());
1073  for (const auto &word : d->contextWords_) {
1074  words.push_back(word.word());
1075  }
1076  return words;
1077 }
1078 
1080  const std::vector<HistoryBigram::WordWithCode> &contextWordsWithPinyin) {
1081  FCITX_D();
1082  d->contextWords_.clear();
1083  appendContextWordsWithPinyin(contextWordsWithPinyin);
1084 }
1085 
1087  const std::vector<HistoryBigram::WordWithCode> &contextWordsWithPinyin) {
1088  FCITX_D();
1089 
1090  size_t needed = LanguageModel::maxOrder() - 1;
1091 
1092  for (const auto &word : std::span{contextWordsWithPinyin}.last(
1093  std::min(contextWordsWithPinyin.size(), needed))) {
1094  d->contextWords_.push_back(
1095  PinyinWordNode(word, d->ime_->model()->index(word.first)));
1096  }
1097  while (d->contextWords_.size() > needed) {
1098  d->contextWords_.pop_front();
1099  }
1100 }
1101 
1102 std::vector<HistoryBigram::WordWithCode>
1104  FCITX_D();
1105  std::vector<HistoryBigram::WordWithCode> words;
1106  words.reserve(d->contextWords_.size());
1107  for (const auto &word : d->contextWords_) {
1108  words.push_back({word.word(), word.encodedPinyin()});
1109  }
1110  return words;
1111 }
1112 
1113 bool PinyinContext::learnWord() { return false; }
1114 
1115 PinyinIME *PinyinContext::ime() const {
1116  FCITX_D();
1117  return d->ime_;
1118 }
1119 } // namespace libime
const std::unordered_set< std::string > & candidateSet() const
Return the set of candidates, useful for deduplication.
const std::unordered_set< std::string > & candidatesToCursorSet() const
Return the set of candidates to current cursor.
std::vector< HistoryBigram::WordWithCode > contextWordsWithPinyin() const
Get context words with pinyin for better prediction.
std::pair< std::string, size_t > preeditWithCursor() const
Mixed preedit (selected hanzi + pinyin).
State state() const
Opaque language model state.
void learn()
Add the selected part to history if selected() == true.
Provides shared data for PinyinContext.
Definition: pinyinime.h:28
size_t selectedLength() const
Selected pinyin length.
std::string candidateFullPinyin(size_t i) const
Get the full pinyin string of certain candidate.
void appendContextWordsWithPinyin(const std::vector< HistoryBigram::WordWithCode > &contextWordsWithPinyin)
Append context words with pinyin for better prediction.
static std::vector< char > encodeFullPinyinWithFlags(std::string_view pinyin, PinyinFuzzyFlags flags)
Encode a quote separated pinyin string.
bool selected() const
Whether the input is fully selected.
void appendContextWords(const std::vector< std::string > &contextWords)
Append context words for better prediction.
void selectCustom(size_t inputLength, std::string_view segment, std::string_view encodedPinyin="")
Create a custom selection.
void setContextWords(const std::vector< std::string > &contextWords)
Set context words for better prediction.
int pinyinAfterCursor() const
Return the end position of the first pinyin segment that ends after the cursor, or -1 if there is none.
std::string selectedSentence() const
Selected hanzi.
std::vector< std::string > selectedWords() const
Selected hanzi segments.
void setContextWordsWithPinyin(const std::vector< HistoryBigram::WordWithCode > &contextWordsWithPinyin)
Set context words with pinyin for better prediction.
void clearContextWords()
Clear context words.
std::vector< std::string > contextWords() const
Get context words for better prediction.
std::string selectedFullPinyin() const
Get the full pinyin string of the selected part.
std::vector< HistoryBigram::WordWithCode > selectedWordsWithPinyin() const
Selected hanzi with encoded pinyin.
int pinyinBeforeCursor() const
Return the start position of the first pinyin segment that ends at or after the cursor, or -1 if there is none.