libime
tablebaseddictionary.cpp
1 /*
2  * SPDX-FileCopyrightText: 2015-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  */
6 
7 #include "libime/table/tablebaseddictionary.h"
8 #include <algorithm>
9 #include <cassert>
10 #include <chrono>
11 #include <cstdint>
12 #include <cstdlib>
13 #include <cstring>
14 #include <exception>
15 #include <fstream>
16 #include <ios>
17 #include <iostream>
18 #include <istream>
19 #include <iterator>
20 #include <memory>
21 #include <optional>
22 #include <ostream>
23 #include <ranges>
24 #include <set>
25 #include <stdexcept>
26 #include <string>
27 #include <string_view>
28 #include <tuple>
29 #include <unordered_set>
30 #include <utility>
31 #include <vector>
32 #include <fcitx-utils/log.h>
33 #include <fcitx-utils/macros.h>
34 #include <fcitx-utils/stringutils.h>
35 #include <fcitx-utils/utf8.h>
36 #include "libime/core/datrie.h"
37 #include "libime/core/dictionary.h"
38 #include "libime/core/languagemodel.h"
39 #include "libime/core/lattice.h"
40 #include "libime/core/segmentgraph.h"
41 #include "libime/core/utils_p.h"
42 #include "libime/core/zstdfilter.h"
43 #include "autophrasedict.h"
44 #include "constants.h"
45 #include "log.h"
46 #include "tablebaseddictionary_p.h"
47 #include "tabledecoder_p.h"
48 #include "tableoptions.h"
49 #include "tablerule.h"
50 
51 namespace libime {
52 
53 namespace {
54 
// Byte placed between the two halves of a single trie entry.
constexpr char keyValueSeparator{'\x01'};
// The same separator as a NUL-terminated string, for string concatenation.
constexpr char keyValueSeparatorString[2]{keyValueSeparator, '\0'};

// Every magic number ends in "fcabe": "fc" t"ab"l"e". The high bits
// distinguish the three kinds of binary file.

// Header of the main table file.
constexpr uint32_t tableBinaryFormatMagic{0x000fcabe};
constexpr uint32_t tableBinaryFormatVersion{0x2};
// Header of the user dictionary file.
constexpr uint32_t userTableBinaryFormatMagic{0x356fcabe};
constexpr uint32_t userTableBinaryFormatVersion{0x3};
// Header of an extra dictionary file.
constexpr uint32_t extraTableBinaryFormatMagic{0x6b0fcabe};
constexpr uint32_t extraTableBinaryFormatVersion{0x1};
64 
// Indexes into each row of strConst. STR_LAST is the number of entries, not
// a real keyword.
enum {
    STR_KEYCODE,
    STR_CODELEN,
    STR_IGNORECHAR,
    STR_PINYIN,
    STR_PINYINLEN,
    STR_DATA,
    STR_RULE,
    STR_PROMPT,
    STR_CONSTRUCTPHRASE,
    STR_PHRASE,
    STR_LAST
};

// Section the text-format parser is currently reading.
enum class BuildPhase { PhaseConfig, PhaseRule, PhaseData, PhasePhrase };

// Keywords of the text table format: row 0 is the Chinese spelling, row 1 the
// English one. The table is never modified, so it is a compile-time constant
// rather than a mutable global.
constexpr std::string_view strConst[2][STR_LAST] = {
    {"键码=", "码长=", "规避字符=", "拼音=", "拼音长度=", "[数据]",
     "[组词规则]", "提示=", "构词=", "[词组]"},
    {"KeyCode=", "Length=", "InvalidChar=", "Pinyin=", "PinyinLength=",
     "[Data]", "[Rule]", "Prompt=", "ConstructPhrase=", "[Phrase]"}};

// Section headers used by the text form of the user dictionary.
constexpr std::string_view UserDictAutoMark = "[Auto]";
constexpr std::string_view UserDictDeleteMark = "[Delete]";
89 
90 // A better version of key + keyValueSeparator + value. It tries to avoid
91 // multiple allocation.
92 inline std::string generateTableEntry(std::string_view key,
93  std::string_view value) {
94  return fcitx::stringutils::concat(key, keyValueSeparatorString, value);
95 }
96 
97 inline std::string generateTableEntry(uint32_t pinyinKey, std::string_view key,
98  std::string_view value) {
99  return fcitx::stringutils::concat(fcitx::utf8::UCS4ToUTF8(pinyinKey), key,
100  keyValueSeparatorString, value);
101 }
102 
103 void maybeUnescapeValue(std::string &value) {
104  if (auto unescape = fcitx::stringutils::unescapeForValue(value)) {
105  value = unescape.value();
106  }
107 }
108 
109 std::string maybeEscapeValue(std::string_view value) {
110  auto escaped = fcitx::stringutils::escapeForValue(value);
111  if (escaped.size() != value.size()) {
112  if (escaped.starts_with("\"") && escaped.ends_with("\"")) {
113  return escaped;
114  }
115  return fcitx::stringutils::concat("\"", escaped, "\"");
116  }
117  return std::string{value};
118 }
119 
120 void updateReverseLookupEntry(DATrie<int32_t> &trie, std::string_view key,
121  std::string_view value,
122  DATrie<int32_t> *reverseTrie) {
123 
124  auto reverseEntry = generateTableEntry(value, "");
125  bool insert = true;
126  trie.foreach(reverseEntry,
127  [&trie, &key, &value, &insert, reverseTrie](
128  int32_t, size_t len, DATrie<int32_t>::position_type pos) {
129  if (key.length() > len) {
130  std::string oldKey;
131  trie.suffix(oldKey, len, pos);
132  trie.erase(pos);
133  if (reverseTrie) {
134  auto entry = generateTableEntry(oldKey, value);
135  reverseTrie->erase(entry);
136  }
137  } else {
138  insert = false;
139  }
140  return false;
141  });
142  if (insert) {
143  reverseEntry.append(key.begin(), key.end());
144  trie.set(reverseEntry, 1);
145  if (reverseTrie) {
146  auto entry = generateTableEntry(key, value);
147  reverseTrie->set(entry, 1);
148  }
149  }
150 }
151 
152 void saveTrieToText(const DATrie<uint32_t> &trie, std::ostream &out) {
153  std::string buf;
154  std::vector<std::tuple<std::string, std::string, uint32_t>> temp;
155  trie.foreach([&trie, &buf, &temp](uint32_t value, size_t _len,
156  DATrie<int32_t>::position_type pos) {
157  trie.suffix(buf, _len, pos);
158  auto sep = buf.find(keyValueSeparator);
159  std::string_view ref(buf);
160  temp.emplace_back(ref.substr(0, sep), ref.substr(sep + 1), value);
161  return true;
162  });
163  std::sort(temp.begin(), temp.end(), [](const auto &lhs, const auto &rhs) {
164  return std::get<uint32_t>(lhs) < std::get<uint32_t>(rhs);
165  });
166  for (auto &item : temp) {
167  out << std::get<0>(item) << " " << maybeEscapeValue(std::get<1>(item))
168  << '\n';
169  }
170 }
171 
172 uint32_t maxValue(const DATrie<uint32_t> &trie) {
173  uint32_t max = 0;
174  trie.foreach(
175  [&max](uint32_t value, size_t, DATrie<uint32_t>::position_type) {
176  max = std::max(value + 1, max);
177  return true;
178  });
179  return max;
180 }
181 
182 bool insertOrUpdateTrie(DATrie<uint32_t> &trie, uint32_t &index,
183  std::string_view entry, bool updateExisting) {
184  // Always insert to user even it is dup because we need to update the index.
185  if (trie.hasExactMatch(entry) && !updateExisting) {
186  return false;
187  }
188  trie.set(entry, index);
189  index += 1;
190  return true;
191 }
192 
193 } // namespace
194 
195 bool TableBasedDictionaryPrivate::validateKeyValue(std::string_view key,
196  std::string_view value,
197  PhraseFlag flag) const {
198  FCITX_Q();
199  auto keyLength = fcitx::utf8::lengthValidated(key);
200  auto valueLength = fcitx::utf8::lengthValidated(value);
201  if (keyLength == fcitx::utf8::INVALID_LENGTH ||
202  valueLength == fcitx::utf8::INVALID_LENGTH ||
203  (codeLength_ && flag != PhraseFlag::Pinyin &&
204  !q->isValidLength(keyLength))) {
205  return false;
206  }
207  if (!inputCode_.empty() && flag != PhraseFlag::Pinyin &&
208  !q->isAllInputCode(key)) {
209  return false;
210  }
211 
212  return true;
213 }
214 
215 std::pair<DATrie<uint32_t> *, uint32_t *>
216 TableBasedDictionaryPrivate::trieByFlag(PhraseFlag flag) {
217  switch (flag) {
218  case PhraseFlag::None:
219  case PhraseFlag::Pinyin:
220  return {&phraseTrie_, &phraseTrieIndex_};
221  break;
222  case PhraseFlag::User:
223  return {&userTrie_, &userTrieIndex_};
224  break;
225  default:
226  return {nullptr, nullptr};
227  }
228 }
229 
230 std::pair<const DATrie<uint32_t> *, const uint32_t *>
231 TableBasedDictionaryPrivate::trieByFlag(PhraseFlag flag) const {
232  switch (flag) {
233  case PhraseFlag::None:
234  case PhraseFlag::Pinyin:
235  return {&phraseTrie_, &phraseTrieIndex_};
236  break;
237  case PhraseFlag::User:
238  return {&userTrie_, &userTrieIndex_};
239  break;
240  default:
241  return {nullptr, nullptr};
242  }
243 }
244 
245 bool TableBasedDictionaryPrivate::insert(std::string_view key,
246  std::string_view value,
247  PhraseFlag flag) {
248  DATrie<uint32_t> *trie;
249  uint32_t *index;
250 
251  auto pair = trieByFlag(flag);
252  trie = pair.first;
253  index = pair.second;
254 
255  std::string entry;
256  if (flag == PhraseFlag::Pinyin) {
257  entry = generateTableEntry(pinyinKey_, key, value);
258  } else {
259  entry = generateTableEntry(key, value);
260  }
261 
262  if (flag == PhraseFlag::User) {
263  deletionTrie_.erase(entry);
264  }
265 
266  return insertOrUpdateTrie(*trie, *index, entry, flag == PhraseFlag::User);
267 }
268 
269 bool TableBasedDictionaryPrivate::matchTrie(
270  const DATrie<uint32_t> &trie, uint32_t indexOffset, std::string_view code,
271  TableMatchMode mode, PhraseFlag flag,
272  const TableMatchCallback &callback) const {
273  auto range = fcitx::utf8::MakeUTF8CharRange(code);
274  std::vector<DATrie<uint32_t>::position_type> positions;
275  positions.push_back(0);
276  // BFS on trie.
277  for (auto iter = std::begin(range), end = std::end(range); iter != end;
278  iter++) {
279  decltype(positions) newPositions;
280 
281  for (auto position : positions) {
282  if (flag != PhraseFlag::Pinyin && *iter == options_.matchingKey() &&
283  options_.matchingKey()) {
284  for (auto code : inputCode_) {
285  auto curPos = position;
286  auto strCode = fcitx::utf8::UCS4ToUTF8(code);
287  auto result = trie.traverse(strCode, curPos);
288  if (!DATrie<unsigned int>::isNoPath(result)) {
289  newPositions.push_back(curPos);
290  }
291  }
292  } else {
293  auto charRange = iter.charRange();
294  std::string_view chr(
295  &*charRange.first,
296  std::distance(charRange.first, charRange.second));
297  auto curPos = position;
298  auto result = trie.traverse(chr, curPos);
299  if (!DATrie<unsigned int>::isNoPath(result)) {
300  newPositions.push_back(curPos);
301  }
302  }
303  }
304 
305  positions = std::move(newPositions);
306  }
307 
308  auto matchWord = [&trie, &code, &callback, flag, mode,
309  indexOffset](uint32_t value, size_t len,
310  DATrie<int32_t>::position_type pos) {
311  std::string entry;
312  trie.suffix(entry, code.size() + len, pos);
313  auto sep = entry.find(keyValueSeparator, code.size());
314  if (sep == std::string::npos) {
315  return true;
316  }
317 
318  auto view = std::string_view(entry);
319  auto matchedCode = view.substr(0, sep);
320  if (mode == TableMatchMode::Prefix ||
321  (mode == TableMatchMode::Exact &&
322  fcitx::utf8::length(matchedCode) == fcitx::utf8::length(code))) {
323  // Remove pinyinKey.
324  if (flag == PhraseFlag::Pinyin) {
325  matchedCode.remove_prefix(
326  fcitx::utf8::ncharByteLength(matchedCode.begin(), 1));
327  }
328  return callback(matchedCode, view.substr(sep + 1),
329  value + indexOffset, flag);
330  }
331  return true;
332  };
333 
334  for (auto position : positions) {
335  if (!trie.foreach(matchWord, position)) {
336  return false;
337  }
338  }
339  return true;
340 }
341 
342 bool TableBasedDictionaryPrivate::matchTrie(
343  std::string_view code, TableMatchMode mode, PhraseFlag flag,
344  const TableMatchCallback &callback) const {
345  const auto &trie = *trieByFlag(flag).first;
346  if (!matchTrie(trie, 0, code, mode, flag, callback)) {
347  return false;
348  }
349 
350  if (flag == PhraseFlag::None) {
351  unsigned int accumulatedIndex = phraseTrieIndex_;
352  for (const auto &[trie, index] : extraTries_) {
353  if (!matchTrie(trie, accumulatedIndex, code, mode, flag,
354  callback)) {
355  return false;
356  }
357  accumulatedIndex += index;
358  }
359  }
360 
361  return true;
362 }
363 
364 void TableBasedDictionaryPrivate::reset() {
365  pinyinKey_ = promptKey_ = phraseKey_ = 0;
366  phraseTrieIndex_ = userTrieIndex_ = 0;
367  codeLength_ = 0;
368  inputCode_.clear();
369  ignoreChars_.clear();
370  rules_.clear();
371  rules_.shrink_to_fit();
372  phraseTrie_.clear();
373  singleCharTrie_.clear();
374  singleCharConstTrie_.clear();
375  singleCharLookupTrie_.clear();
376  promptTrie_.clear();
377  userTrie_.clear();
378  autoPhraseDict_.clear();
379  deletionTrie_.clear();
380 }
381 bool TableBasedDictionaryPrivate::validate() const {
382  if (inputCode_.empty()) {
383  return false;
384  }
385  if (inputCode_.contains(pinyinKey_)) {
386  return false;
387  }
388  if (inputCode_.contains(promptKey_)) {
389  return false;
390  }
391  if (inputCode_.contains(phraseKey_)) {
392  return false;
393  }
394  return true;
395 }
396 
397 std::optional<std::tuple<std::string, std::string, PhraseFlag>>
398 TableBasedDictionaryPrivate::parseDataLine(std::string_view buf, bool user) {
399  uint32_t special[3] = {pinyinKey_, phraseKey_, promptKey_};
400  PhraseFlag specialFlag[] = {PhraseFlag::Pinyin, PhraseFlag::ConstructPhrase,
401  PhraseFlag::Prompt};
402  auto spacePos = buf.find_first_of(FCITX_WHITESPACE);
403  if (spacePos == std::string::npos || spacePos + 1 == buf.size()) {
404  return {};
405  }
406  auto wordPos = buf.find_first_not_of(FCITX_WHITESPACE, spacePos);
407  if (spacePos == std::string::npos || spacePos + 1 == buf.size()) {
408  return {};
409  }
410 
411  auto key = std::string_view(buf).substr(0, spacePos);
412  std::string value{std::string_view(buf).substr(wordPos)};
413  maybeUnescapeValue(value);
414 
415  if (key.empty() || value.empty()) {
416  return {};
417  }
418 
419  uint32_t firstChar;
420  std::string_view::iterator next =
421  fcitx::utf8::getNextChar(key.begin(), key.end(), &firstChar);
422  auto *iter = std::find(std::begin(special), std::end(special), firstChar);
423  PhraseFlag flag = user ? PhraseFlag::User : PhraseFlag::None;
424  if (iter != std::end(special)) {
425  // Reject flag for user.
426  if (user) {
427  return {};
428  }
429  flag = specialFlag[iter - std::begin(special)];
430  key = key.substr(std::distance(key.begin(), next));
431  }
432 
433  return std::tuple<std::string, std::string, PhraseFlag>{
434  key, std::move(value), flag};
435 }
436 
437 void TableBasedDictionaryPrivate::insertDataLine(std::string_view buf,
438  bool user) {
439  if (auto data = parseDataLine(buf, user)) {
440  auto &[key, value, flag] = *data;
441 
442  q_func()->insert(key, value, flag);
443  }
444 }
445 
446 bool TableBasedDictionaryPrivate::matchWordsInternal(
447  std::string_view code, TableMatchMode mode, bool onlyChecking,
448  const TableMatchCallback &callback) const {
449  auto t0 = std::chrono::high_resolution_clock::now();
450 
451  // Match phrase trie.
452  if (!matchTrie(code, mode, PhraseFlag::None,
453  [&callback, this](std::string_view code,
454  std::string_view word, uint32_t index,
455  PhraseFlag flag) {
456  if (!deletionTrie_.empty()) {
457  auto entry = generateTableEntry(code, word);
458  if (deletionTrie_.hasExactMatch(entry)) {
459  return true;
460  }
461  }
462  return callback(code, word, index, flag);
463  })) {
464  return false;
465  }
466 
467  LIBIME_TABLE_DEBUG() << "Match trie: " << millisecondsTill(t0);
468 
469  // Match Pinyin in the trie
470  if (pinyinKey_) {
471  auto pinyinCode = fcitx::stringutils::concat(
472  fcitx::utf8::UCS4ToUTF8(pinyinKey_), code);
473  // Apply following heuristic for pinyin.
474  auto pinyinMode = TableMatchMode::Exact;
475  int codeLength = fcitx::utf8::length(code);
476  if (onlyChecking || codeLength >= options_.autoSelectLength() ||
477  static_cast<size_t>(codeLength) > codeLength_ ||
478  codeLength >= options_.noMatchAutoSelectLength()) {
479  pinyinMode = TableMatchMode::Prefix;
480  }
481  if (!matchTrie(pinyinCode, pinyinMode, PhraseFlag::Pinyin, callback)) {
482  return false;
483  }
484  }
485 
486  LIBIME_TABLE_DEBUG() << "Match pinyin: " << millisecondsTill(t0);
487 
488  // Match user phrase
489  if (!matchTrie(code, mode, PhraseFlag::User, callback)) {
490  return false;
491  }
492 
493  LIBIME_TABLE_DEBUG() << "Match user: " << millisecondsTill(t0);
494  auto matchAutoPhrase = [mode, code, &callback](std::string_view entry,
495  int32_t) {
496  auto sep = entry.find(keyValueSeparator, code.size());
497  if (sep == std::string::npos) {
498  return true;
499  }
500 
501  auto view = std::string_view(entry);
502  auto matchedCode = view.substr(0, sep);
503  if (mode == TableMatchMode::Prefix ||
504  (mode == TableMatchMode::Exact &&
505  fcitx::utf8::length(matchedCode) == fcitx::utf8::length(code))) {
506  return callback(matchedCode, view.substr(sep + 1), 0,
507  PhraseFlag::Auto);
508  }
509  return true;
510  };
511 
512  return autoPhraseDict_.search(code, matchAutoPhrase);
513 }
514 
515 bool TableBasedDictionaryPrivate::validateHints(std::vector<std::string> &hints,
516  const TableRule &rule) const {
517  if (hints.size() <= 1) {
518  return false;
519  }
520 
521  for (const auto &ruleEntry : rule.entries()) {
522  // skip rule entry like p00.
523  if (ruleEntry.isPlaceHolder()) {
524  continue;
525  }
526 
527  if (ruleEntry.character() > hints.size()) {
528  return false;
529  }
530 
531  size_t index;
532  if (ruleEntry.flag() == TableRuleEntryFlag::FromFront) {
533  index = ruleEntry.character() - 1;
534  } else {
535  index = hints.size() - ruleEntry.character();
536  }
537  assert(index < hints.size());
538 
539  // Don't use hint for table with phrase key, or the requested length
540  // longer.
541  if (phraseKey_ ||
542  fcitx::utf8::length(hints[index]) <
543  static_cast<size_t>(std::abs(ruleEntry.index()))) {
544  hints[index] = std::string();
545  }
546  }
547 
548  return true;
549 }
550 
551 bool TableBasedDictionaryPrivate::hasExactMatchInPhraseTrie(
552  std::string_view entry) const {
553  return phraseTrie_.hasExactMatch(entry) ||
554  std::any_of(extraTries_.begin(), extraTries_.end(),
555  [&entry](const auto &extraTrie) {
556  return extraTrie.first.hasExactMatch(entry);
557  });
558 }
559 
560 void TableBasedDictionaryPrivate::loadBinary(std::istream &in) {
561  FCITX_Q();
562  throw_if_io_fail(unmarshall(in, pinyinKey_));
563  throw_if_io_fail(unmarshall(in, promptKey_));
564  throw_if_io_fail(unmarshall(in, phraseKey_));
565  throw_if_io_fail(unmarshall(in, codeLength_));
566  uint32_t size = 0;
567 
568  throw_if_io_fail(unmarshall(in, size));
569  inputCode_.clear();
570  while (size--) {
571  uint32_t c;
572  throw_if_io_fail(unmarshall(in, c));
573  inputCode_.insert(c);
574  }
575 
576  throw_if_io_fail(unmarshall(in, size));
577  ignoreChars_.clear();
578  while (size--) {
579  uint32_t c;
580  throw_if_io_fail(unmarshall(in, c));
581  ignoreChars_.insert(c);
582  }
583 
584  throw_if_io_fail(unmarshall(in, size));
585  rules_.clear();
586  while (size--) {
587  rules_.emplace_back(in);
588  }
589  phraseTrie_ = decltype(phraseTrie_)(in);
590  phraseTrieIndex_ = maxValue(phraseTrie_);
591  singleCharTrie_ = decltype(singleCharTrie_)(in);
592  if (q->hasRule()) {
593  singleCharConstTrie_ = decltype(singleCharConstTrie_)(in);
594  singleCharLookupTrie_ = decltype(singleCharLookupTrie_)(in);
595  }
596  if (promptKey_) {
597  promptTrie_ = decltype(promptTrie_)(in);
598  }
599 }
600 
601 void TableBasedDictionaryPrivate::loadUserBinary(std::istream &in,
602  uint32_t version) {
603  userTrie_ = decltype(userTrie_)(in);
604  userTrieIndex_ = maxValue(userTrie_);
605  autoPhraseDict_ = decltype(autoPhraseDict_)(TABLE_AUTOPHRASE_SIZE, in);
606  // Version 2 introduced new deletion trie.
607  if (version >= 2) {
608  deletionTrie_ = decltype(deletionTrie_)(in);
609  } else {
610  deletionTrie_ = decltype(deletionTrie_)();
611  }
612 }
613 
614 TableBasedDictionary::TableBasedDictionary()
615  : d_ptr(std::make_unique<TableBasedDictionaryPrivate>(this)) {
616  FCITX_D();
617  d->reset();
618 }
619 
620 TableBasedDictionary::~TableBasedDictionary() = default;
621 
622 void TableBasedDictionary::load(const char *filename, TableFormat format) {
623  std::ifstream in(filename, std::ios::in | std::ios::binary);
624  throw_if_io_fail(in);
625  load(in, format);
626 }
627 
628 void TableBasedDictionary::load(std::istream &in, TableFormat format) {
629  switch (format) {
630  case TableFormat::Binary:
631  loadBinary(in);
632  break;
633  case TableFormat::Text:
634  loadText(in);
635  break;
636  default:
637  throw std::invalid_argument("unknown format type");
638  }
639 }
640 
// Loads a table from its text form, replacing any previously loaded data.
//
// The input is read line by line through a small state machine:
//   PhaseConfig - "Key=value" options, until a [Data] or [Rule] header;
//   PhaseRule   - one rule per line, until a [Data] header;
//   PhaseData   - "<code> <word>" lines, until an optional [Phrase] header;
//   PhasePhrase - one phrase per line.
// Every keyword is accepted in either spelling held by strConst.
// Lines that are not valid UTF-8 are skipped.
//
// Throws std::invalid_argument when the configuration fails validate() at
// the [Data] header, when a [Phrase] section appears without any rule, or
// when the input ends before a [Data] header was seen. In that last case a
// stream in the bad state is reported as std::ios_base::failure instead.
void TableBasedDictionary::loadText(std::istream &in) {
    FCITX_D();
    d->reset();

    std::string buf;

    // Strips the keyword number `index` from the front of buf, trying both
    // spellings. Returns whether a keyword was consumed.
    auto consumeOptionPrefix = [](std::string_view &buf, int index) {
        if (fcitx::stringutils::consumePrefix(buf, strConst[0][index])) {
            return true;
        }
        if (fcitx::stringutils::consumePrefix(buf, strConst[1][index])) {
            return true;
        }
        return false;
    };

    auto phase = BuildPhase::PhaseConfig;
    while (!in.eof()) {
        if (!std::getline(in, buf)) {
            break;
        }

        // Validate everything first, so it's easier to process.
        if (!fcitx::utf8::validate(buf)) {
            continue;
        }

        auto line = fcitx::stringutils::trimView(buf);

        switch (phase) {
        case BuildPhase::PhaseConfig: {
            // '#' starts a comment line in the configuration section.
            if (line.starts_with("#")) {
                continue;
            }

            if (consumeOptionPrefix(line, STR_KEYCODE)) {
                // Each character of the remainder is one input code.
                auto range = fcitx::utf8::MakeUTF8CharRange(line);
                d->inputCode_ = std::set<uint32_t>(range.begin(), range.end());
            } else if (consumeOptionPrefix(line, STR_CODELEN)) {
                // std::stoi throws if the remainder is not a number.
                d->codeLength_ = std::stoi(std::string(line));
            } else if (consumeOptionPrefix(line, STR_PINYINLEN)) {
                // Deprecated option.
            } else if (consumeOptionPrefix(line, STR_IGNORECHAR)) {
                auto range = fcitx::utf8::MakeUTF8CharRange(line);
                d->ignoreChars_ =
                    std::set<uint32_t>(range.begin(), range.end());
            } else if (consumeOptionPrefix(line, STR_PINYIN)) {
                // Only the first character of the remainder is used.
                const auto chr = fcitx::utf8::getChar(line);
                if (fcitx::utf8::isValidChar(chr)) {
                    d->pinyinKey_ = chr;
                }
            } else if (consumeOptionPrefix(line, STR_PROMPT)) {
                const auto chr = fcitx::utf8::getChar(line);
                if (fcitx::utf8::isValidChar(chr)) {
                    d->promptKey_ = chr;
                }
            } else if (consumeOptionPrefix(line, STR_CONSTRUCTPHRASE)) {
                const auto chr = fcitx::utf8::getChar(line);
                if (fcitx::utf8::isValidChar(chr)) {
                    d->phraseKey_ = chr;
                }
            } else if (consumeOptionPrefix(line, STR_DATA)) {
                // The configuration is complete; check it before any data
                // is accepted.
                phase = BuildPhase::PhaseData;
                if (!d->validate()) {
                    throw std::invalid_argument("file format is invalid");
                }
                break;
            } else if (consumeOptionPrefix(line, STR_RULE)) {
                phase = BuildPhase::PhaseRule;
                break;
            }
            // Unrecognized configuration lines are ignored.
            break;
        }
        case BuildPhase::PhaseRule: {
            if (line.starts_with("#")) {
                continue;
            }
            if (consumeOptionPrefix(line, STR_DATA)) {
                phase = BuildPhase::PhaseData;
                if (!d->validate()) {
                    throw std::invalid_argument("file format is invalid");
                }
                break;
            }

            if (line.empty()) {
                continue;
            }

            // The rule is constructed from its text and the code length.
            d->rules_.emplace_back(std::string(line), d->codeLength_);
            break;
        }
        case BuildPhase::PhaseData:
            if (consumeOptionPrefix(line, STR_PHRASE)) {
                phase = BuildPhase::PhasePhrase;
                if (!hasRule()) {
                    throw std::invalid_argument(
                        "file has phrase section but no rule");
                }
                break;
            }
            d->insertDataLine(line, false);
            break;
        case BuildPhase::PhasePhrase: {
            // A phrase line carries only the word, no code.
            std::string phrase{line};
            maybeUnescapeValue(phrase);
            insert(phrase, PhraseFlag::None);
            break;
        }
        }
    }

    // Reaching the end without ever seeing a [Data] header is an error.
    if (phase != BuildPhase::PhaseData && phase != BuildPhase::PhasePhrase) {
        throw_if_fail(in.bad(), std::ios_base::failure("io failed"));
        throw std::invalid_argument("file format is invalid");
    }
}
758 
759 void TableBasedDictionary::saveText(std::ostream &out) {
760  FCITX_D();
761  out << strConst[1][STR_KEYCODE];
762  for (auto c : d->inputCode_) {
763  out << fcitx::utf8::UCS4ToUTF8(c);
764  }
765  out << '\n';
766  out << strConst[1][STR_CODELEN] << d->codeLength_ << '\n';
767  if (!d->ignoreChars_.empty()) {
768  out << strConst[1][STR_IGNORECHAR];
769  for (auto c : d->ignoreChars_) {
770  out << fcitx::utf8::UCS4ToUTF8(c);
771  }
772  out << '\n';
773  }
774  if (d->pinyinKey_) {
775  out << strConst[1][STR_PINYIN] << fcitx::utf8::UCS4ToUTF8(d->pinyinKey_)
776  << '\n';
777  }
778  if (d->promptKey_) {
779  out << strConst[1][STR_PROMPT] << fcitx::utf8::UCS4ToUTF8(d->promptKey_)
780  << '\n';
781  }
782  if (d->phraseKey_) {
783  out << strConst[1][STR_CONSTRUCTPHRASE]
784  << fcitx::utf8::UCS4ToUTF8(d->phraseKey_) << '\n';
785  }
786 
787  if (hasRule()) {
788  out << strConst[1][STR_RULE] << '\n';
789  for (const auto &rule : d->rules_) {
790  out << rule.toString() << '\n';
791  }
792  }
793  out << strConst[1][STR_DATA] << '\n';
794  std::string buf;
795  if (d->promptKey_) {
796  auto promptString = fcitx::utf8::UCS4ToUTF8(d->promptKey_);
797  d->promptTrie_.foreach(
798  [&promptString, d, &buf,
799  &out](uint32_t, size_t _len, DATrie<uint32_t>::position_type pos) {
800  d->promptTrie_.suffix(buf, _len, pos);
801  auto sep = buf.find(keyValueSeparator);
802  if (sep == std::string::npos) {
803  return true;
804  }
805  std::string_view ref(buf);
806  out << promptString << ref.substr(sep + 1) << " "
807  << maybeEscapeValue(ref.substr(0, sep)) << '\n';
808  return true;
809  });
810  }
811  if (d->phraseKey_) {
812  auto phraseString = fcitx::utf8::UCS4ToUTF8(d->phraseKey_);
813  d->singleCharConstTrie_.foreach(
814  [&phraseString, d, &buf, &out](int32_t, size_t _len,
815  DATrie<int32_t>::position_type pos) {
816  d->singleCharConstTrie_.suffix(buf, _len, pos);
817  auto sep = buf.find(keyValueSeparator);
818  if (sep == std::string::npos) {
819  return true;
820  }
821  std::string_view ref(buf);
822  out << phraseString << ref.substr(sep + 1) << " "
823  << maybeEscapeValue(ref.substr(0, sep)) << '\n';
824  return true;
825  });
826  }
827 
828  saveTrieToText(d->phraseTrie_, out);
829 }
830 
831 void TableBasedDictionary::loadBinary(std::istream &in) {
832  FCITX_D();
833  uint32_t magic;
834  uint32_t version;
835  throw_if_io_fail(unmarshall(in, magic));
836  if (magic != tableBinaryFormatMagic) {
837  throw std::invalid_argument("Invalid table magic.");
838  }
839  throw_if_io_fail(unmarshall(in, version));
840  switch (version) {
841  case 1:
842  d->loadBinary(in);
843  break;
844  case tableBinaryFormatVersion:
845  readZSTDCompressed(
846  in, [d](std::istream &compressIn) { d->loadBinary(compressIn); });
847  break;
848 
849  default:
850  throw std::invalid_argument("Invalid table version.");
851  }
852 }
853 
854 void TableBasedDictionary::save(const char *filename, TableFormat format) {
855  std::ofstream fout(filename, std::ios::out | std::ios::binary);
856  throw_if_io_fail(fout);
857  save(fout, format);
858 }
859 
860 void TableBasedDictionary::save(std::ostream &out, TableFormat format) {
861  switch (format) {
862  case TableFormat::Binary:
863  saveBinary(out);
864  break;
865  case TableFormat::Text:
866  saveText(out);
867  break;
868  default:
869  throw std::invalid_argument("unknown format type");
870  }
871 }
872 
873 void TableBasedDictionary::saveBinary(std::ostream &origOut) {
874  throw_if_io_fail(marshall(origOut, tableBinaryFormatMagic));
875  throw_if_io_fail(marshall(origOut, tableBinaryFormatVersion));
876 
877  writeZSTDCompressed(origOut, [this](std::ostream &out) {
878  FCITX_D();
879  throw_if_io_fail(marshall(out, d->pinyinKey_));
880  throw_if_io_fail(marshall(out, d->promptKey_));
881  throw_if_io_fail(marshall(out, d->phraseKey_));
882  throw_if_io_fail(marshall(out, d->codeLength_));
883  throw_if_io_fail(
884  marshall(out, static_cast<uint32_t>(d->inputCode_.size())));
885  for (auto c : d->inputCode_) {
886  throw_if_io_fail(marshall(out, c));
887  }
888  throw_if_io_fail(
889  marshall(out, static_cast<uint32_t>(d->ignoreChars_.size())));
890  for (auto c : d->ignoreChars_) {
891  throw_if_io_fail(marshall(out, c));
892  }
893  throw_if_io_fail(
894  marshall(out, static_cast<uint32_t>(d->rules_.size())));
895  for (const auto &rule : d->rules_) {
896  throw_if_io_fail(out << rule);
897  }
898  d->phraseTrie_.save(out);
899  d->singleCharTrie_.save(out);
900  if (hasRule()) {
901  d->singleCharConstTrie_.save(out);
902  d->singleCharLookupTrie_.save(out);
903  }
904  if (d->promptKey_) {
905  d->promptTrie_.save(out);
906  }
907  });
908 }
909 
910 void TableBasedDictionary::loadUser(const char *filename, TableFormat format) {
911  std::ifstream in(filename, std::ios::in | std::ios::binary);
912  throw_if_io_fail(in);
913  loadUser(in, format);
914 }
915 
916 void TableBasedDictionary::loadUser(std::istream &in, TableFormat format) {
917  FCITX_D();
918  uint32_t magic = 0;
919  uint32_t version = 0;
920  switch (format) {
921  case TableFormat::Binary:
922  throw_if_io_fail(unmarshall(in, magic));
923  if (magic != userTableBinaryFormatMagic) {
924  throw std::invalid_argument("Invalid user table magic.");
925  }
926  throw_if_io_fail(unmarshall(in, version));
927  switch (version) {
928  case 1:
929  case 2:
930  d->loadUserBinary(in, version);
931  break;
932  case userTableBinaryFormatVersion:
933  readZSTDCompressed(in, [d, version](std::istream &compressIn) {
934  d->loadUserBinary(compressIn, version);
935  });
936  break;
937  default:
938  throw std::invalid_argument("Invalid user table version.");
939  }
940  break;
941  case TableFormat::Text: {
942  std::string lineBuf;
943  enum class UserDictState { Phrase, Auto, Delete };
944 
945  UserDictState state = UserDictState::Phrase;
946  while (!in.eof()) {
947  if (!std::getline(in, lineBuf)) {
948  break;
949  }
950 
951  // Validate everything first, so it's easier to process.
952  if (!fcitx::utf8::validate(lineBuf)) {
953  continue;
954  }
955  auto line = fcitx::stringutils::trimView(lineBuf);
956  if (line == UserDictAutoMark) {
957  state = UserDictState::Auto;
958  continue;
959  }
960  if (line == UserDictDeleteMark) {
961  state = UserDictState::Delete;
962  continue;
963  }
964 
965  switch (state) {
966  case UserDictState::Phrase:
967  d->insertDataLine(line, true);
968  break;
969  case UserDictState::Auto: {
970  auto tokens = fcitx::stringutils::split(line, FCITX_WHITESPACE);
971  if (tokens.size() != 3 || !isAllInputCode(tokens[0])) {
972  continue;
973  }
974  try {
975  maybeUnescapeValue(tokens[1]);
976  int32_t hit = std::stoi(tokens[2]);
977  d->autoPhraseDict_.insert(
978  generateTableEntry(tokens[0], tokens[1]), hit);
979  } catch (const std::exception &) {
980  continue;
981  }
982  } break;
983  case UserDictState::Delete: {
984  if (auto data = d->parseDataLine(line, true)) {
985  auto &[key, value, flag] = *data;
986  auto entry = generateTableEntry(key, value);
987  d->deletionTrie_.set(entry, 0);
988  }
989  } break;
990  }
991  }
992  break;
993  }
994  default:
995  throw std::invalid_argument("unknown format type");
996  }
997 }
998 
999 void TableBasedDictionary::saveUser(const char *filename, TableFormat format) {
1000  std::ofstream fout(filename, std::ios::out | std::ios::binary);
1001  throw_if_io_fail(fout);
1002  saveUser(fout, format);
1003 }
1004 
1005 void TableBasedDictionary::saveUser(std::ostream &out, TableFormat format) {
1006  FCITX_D();
1007  switch (format) {
1008  case TableFormat::Binary: {
1009  throw_if_io_fail(marshall(out, userTableBinaryFormatMagic));
1010  throw_if_io_fail(marshall(out, userTableBinaryFormatVersion));
1011 
1012  writeZSTDCompressed(out, [d](std::ostream &compressOut) {
1013  d->userTrie_.save(compressOut);
1014  throw_if_io_fail(compressOut);
1015  d->autoPhraseDict_.save(compressOut);
1016  throw_if_io_fail(compressOut);
1017  d->deletionTrie_.save(compressOut);
1018  throw_if_io_fail(compressOut);
1019  });
1020  break;
1021  }
1022  case TableFormat::Text: {
1023  saveTrieToText(d->userTrie_, out);
1024 
1025  if (!d->autoPhraseDict_.empty()) {
1026  out << UserDictAutoMark << '\n';
1027  std::vector<std::tuple<std::string, std::string, int32_t>>
1028  autoEntries;
1029  d->autoPhraseDict_.search(
1030  "", [&autoEntries](std::string_view entry, int hit) {
1031  auto sep = entry.find(keyValueSeparator);
1032  autoEntries.emplace_back(entry.substr(0, sep),
1033  entry.substr(sep + 1), hit);
1034  return true;
1035  });
1036  for (auto &t : autoEntries | std::views::reverse) {
1037  out << std::get<0>(t) << " " << maybeEscapeValue(std::get<1>(t))
1038  << " " << std::get<2>(t) << '\n';
1039  }
1040  }
1041  if (!d->deletionTrie_.empty()) {
1042  out << UserDictDeleteMark << '\n';
1043  saveTrieToText(d->deletionTrie_, out);
1044  }
1045  break;
1046  }
1047  default:
1048  throw std::invalid_argument("unknown format type");
1049  }
1050 }
1051 
1052 size_t TableBasedDictionary::loadExtra(const char *filename,
1053  TableFormat format) {
1054  std::ifstream in(filename, std::ios::in | std::ios::binary);
1055  throw_if_io_fail(in);
1056  return loadExtra(in, format);
1057 }
1058 
1059 size_t TableBasedDictionary::loadExtra(std::istream &in, TableFormat format) {
1060  FCITX_D();
1061  DATrie<uint32_t> trie;
1062  uint32_t index = 0;
1063  uint32_t magic = 0;
1064  uint32_t version = 0;
1065  switch (format) {
1066  case TableFormat::Binary:
1067  throw_if_io_fail(unmarshall(in, magic));
1068  if (magic != extraTableBinaryFormatMagic) {
1069  throw std::invalid_argument("Invalid user table magic.");
1070  }
1071  throw_if_io_fail(unmarshall(in, version));
1072  switch (version) {
1073  case extraTableBinaryFormatVersion:
1074  readZSTDCompressed(in, [&trie](std::istream &compressIn) {
1075  trie.load(compressIn);
1076  });
1077  index = maxValue(trie);
1078  break;
1079  default:
1080  throw std::invalid_argument("Invalid user table version.");
1081  }
1082  break;
1083  case TableFormat::Text: {
1084  std::string lineBuf;
1085  enum class ExtraDictState { Data, Phrase };
1086 
1087  ExtraDictState state = ExtraDictState::Data;
1088  while (!in.eof()) {
1089  if (!std::getline(in, lineBuf)) {
1090  break;
1091  }
1092 
1093  // Validate everything first, so it's easier to process.
1094  if (!fcitx::utf8::validate(lineBuf)) {
1095  continue;
1096  }
1097  const auto line = fcitx::stringutils::trimView(lineBuf);
1098  if (line == strConst[0][STR_PHRASE] ||
1099  line == strConst[1][STR_PHRASE]) {
1100  state = ExtraDictState::Phrase;
1101  continue;
1102  }
1103 
1104  std::string key;
1105  std::string value;
1106  PhraseFlag flag;
1107  switch (state) {
1108  case ExtraDictState::Data:
1109  if (auto data = d->parseDataLine(line, false)) {
1110  std::tie(key, value, flag) = *data;
1111  if (flag != PhraseFlag::None) {
1112  continue;
1113  }
1114  maybeUnescapeValue(value);
1115  }
1116  break;
1117  case ExtraDictState::Phrase:
1118  value = line;
1119  maybeUnescapeValue(value);
1120  if (!generate(value, key)) {
1121  continue;
1122  }
1123  break;
1124  }
1125  if (value.empty() || key.empty()) {
1126  continue;
1127  }
1128  auto entry = generateTableEntry(key, value);
1129  insertOrUpdateTrie(trie, index, entry, false);
1130  }
1131  break;
1132  }
1133  default:
1134  throw std::invalid_argument("unknown format type");
1135  }
1136 
1137  d->extraTries_.push_back(std::make_pair(std::move(trie), index));
1138  return d->extraTries_.size() - 1;
1139 }
1140 
1141 void TableBasedDictionary::saveExtra(size_t index, const char *filename,
1142  TableFormat format) {
1143  std::ofstream fout(filename, std::ios::out | std::ios::binary);
1144  throw_if_io_fail(fout);
1145  saveExtra(index, fout, format);
1146 }
1147 
1148 void TableBasedDictionary::saveExtra(size_t index, std::ostream &out,
1149  TableFormat format) {
1150  FCITX_D();
1151  if (index >= d->extraTries_.size()) {
1152  throw std::invalid_argument("Invalid extra dict index");
1153  }
1154  switch (format) {
1155  case TableFormat::Binary:
1156 
1157  throw_if_io_fail(marshall(out, extraTableBinaryFormatMagic));
1158  throw_if_io_fail(marshall(out, extraTableBinaryFormatVersion));
1159 
1160  writeZSTDCompressed(out, [d, index](std::ostream &compressOut) {
1161  d->extraTries_[index].first.save(compressOut);
1162  });
1163  break;
1164  case TableFormat::Text:
1165  saveTrieToText(d->extraTries_[index].first, out);
1166  break;
1167  default:
1168  throw std::invalid_argument("unknown format type");
1169  }
1170 }
1171 
1172 void TableBasedDictionary::removeAllExtra() {
1173  FCITX_D();
1174  d->extraTries_.clear();
1175 }
1176 
1177 bool TableBasedDictionary::hasRule() const noexcept {
1178  FCITX_D();
1179  return !d->rules_.empty();
1180 }
1181 
1182 bool TableBasedDictionary::hasCustomPrompt() const noexcept {
1183  FCITX_D();
1184  return !d->promptTrie_.empty();
1185 }
1186 
1187 const TableRule *TableBasedDictionary::findRule(std::string_view name) const {
1188  FCITX_D();
1189  for (const auto &rule : d->rules_) {
1190  if (rule.name() == name) {
1191  return &rule;
1192  }
1193  }
1194  return nullptr;
1195 }
1196 
1197 bool TableBasedDictionary::insert(std::string_view value,
1198  libime::PhraseFlag flag) {
1199  std::string key;
1200  if (flag != PhraseFlag::None && flag != PhraseFlag::User) {
1201  return false;
1202  }
1203 
1204  if (generate(value, key)) {
1205  return insert(key, value, flag);
1206  }
1207  return false;
1208 }
1209 
1210 bool TableBasedDictionary::insert(std::string_view key, std::string_view value,
1211  PhraseFlag flag, bool verifyWithRule) {
1212  FCITX_D();
1213 
1214  if (!d->validateKeyValue(key, value, flag)) {
1215  return false;
1216  }
1217 
1218  switch (flag) {
1219  case PhraseFlag::Pinyin: /* Falls through. */
1220  case PhraseFlag::User: /* Falls through. */
1221  case PhraseFlag::None: {
1222  if (flag != PhraseFlag::Pinyin && verifyWithRule && hasRule()) {
1223  std::string checkKey;
1224  if (!generate(value, checkKey)) {
1225  return false;
1226  }
1227  if (checkKey != key) {
1228  return false;
1229  }
1230  }
1231 
1232  if (!d->insert(key, value, flag)) {
1233  return false;
1234  }
1235 
1236  if (flag == PhraseFlag::None && fcitx::utf8::length(value) == 1 &&
1237  !d->ignoreChars_.contains(fcitx::utf8::getChar(value))) {
1238  updateReverseLookupEntry(d->singleCharTrie_, key, value, nullptr);
1239 
1240  if (hasRule() && !d->phraseKey_) {
1241  updateReverseLookupEntry(d->singleCharConstTrie_, key, value,
1242  &d->singleCharLookupTrie_);
1243  }
1244  }
1245  break;
1246  }
1247  case PhraseFlag::Prompt:
1248  if (!key.empty()) {
1249  d->promptTrie_.set(generateTableEntry(key, value), 0);
1250  } else {
1251  return false;
1252  }
1253  break;
1254  case PhraseFlag::ConstructPhrase:
1255  if (hasRule() && fcitx::utf8::length(value) == 1) {
1256  updateReverseLookupEntry(d->singleCharConstTrie_, key, value,
1257  &d->singleCharLookupTrie_);
1258  }
1259  break;
1260  case PhraseFlag::Auto: {
1261  const auto entry = generateTableEntry(key, value);
1262  auto hit = d->autoPhraseDict_.exactSearch(entry);
1263  if (tableOptions().saveAutoPhraseAfter() >= 1 &&
1264  static_cast<uint32_t>(tableOptions().saveAutoPhraseAfter()) <=
1265  hit + 1) {
1266  d->autoPhraseDict_.erase(entry);
1267  insert(key, value, PhraseFlag::User, false);
1268  } else {
1269  d->autoPhraseDict_.insert(entry);
1270  }
1271  } break;
1272  case PhraseFlag::Invalid:
1273  break;
1274  }
1275  return true;
1276 }
1277 
1278 bool TableBasedDictionary::generate(std::string_view value,
1279  std::string &key) const {
1280  return generateWithHint(value, {}, key);
1281 }
1282 
1283 bool TableBasedDictionary::generateWithHint(
1284  std::string_view value, const std::vector<std::string> &codeHints,
1285  std::string &key) const {
1286  FCITX_D();
1287  if (!hasRule() || value.empty()) {
1288  return false;
1289  }
1290 
1291  // Check word is valid utf8.
1292  auto valueLen = fcitx::utf8::lengthValidated(value);
1293  if (valueLen == fcitx::utf8::INVALID_LENGTH) {
1294  return false;
1295  }
1296 
1297  for (const auto &code : codeHints) {
1298  if (!fcitx::utf8::validate(code)) {
1299  return false;
1300  }
1301  }
1302 
1303  auto hints = codeHints;
1304  hints.resize(valueLen);
1305 
1306  std::string newKey;
1307  for (const auto &rule : d->rules_) {
1308  // check rule can be applied
1309  const bool canApplyRule =
1310  ((rule.flag() == TableRuleFlag::LengthEqual &&
1311  valueLen == rule.phraseLength()) ||
1312  (rule.flag() == TableRuleFlag::LengthLongerThan &&
1313  valueLen >= rule.phraseLength()));
1314  if (!canApplyRule) {
1315  continue;
1316  }
1317 
1318  auto hints = codeHints;
1319  hints.resize(valueLen);
1320  // Fill hints first.
1321  if (!d->validateHints(hints, rule)) {
1322  continue;
1323  }
1324 
1325  bool success = true;
1326  std::set<std::pair<size_t, int>> usedChar;
1327  for (const auto &ruleEntry : rule.entries()) {
1328  std::string_view::const_iterator iter;
1329  // skip rule entry like p00.
1330  if (ruleEntry.isPlaceHolder()) {
1331  continue;
1332  }
1333 
1334  if (ruleEntry.character() > valueLen) {
1335  success = false;
1336  break;
1337  }
1338 
1339  size_t index;
1340  if (ruleEntry.flag() == TableRuleEntryFlag::FromFront) {
1341  index = ruleEntry.character() - 1;
1342  } else {
1343  index = valueLen - ruleEntry.character();
1344  }
1345  iter = fcitx::utf8::nextNChar(value.begin(), index);
1346  std::string_view::iterator prev = iter;
1347  iter = fcitx::utf8::nextChar(iter);
1348  std::string_view chr(&*prev, std::distance(prev, iter));
1349 
1350  std::string entry;
1351  if (!hints[index].empty()) {
1352  entry = hints[index];
1353  } else {
1354  entry = reverseLookup(chr, PhraseFlag::ConstructPhrase);
1355  }
1356  if (entry.empty()) {
1357  success = false;
1358  break;
1359  }
1360 
1361  auto length = fcitx::utf8::lengthValidated(entry);
1362  auto codeIndex = ruleEntry.index();
1363  if (length == fcitx::utf8::INVALID_LENGTH ||
1364  length < static_cast<size_t>(std::abs(codeIndex))) {
1365  continue;
1366  }
1367 
1368  if (codeIndex > 0) {
1369  // code index starts with 1.
1370  codeIndex -= 1;
1371  } else {
1372  codeIndex = static_cast<int>(length) + codeIndex;
1373  }
1374 
1375  auto charIndex = std::make_pair(index, codeIndex);
1376  // Avoid same code being referenced twice.
1377  // This helps for the case like: p11 and p1z point to the same code
1378  // character.
1379  if (usedChar.contains(charIndex)) {
1380  continue;
1381  }
1382  usedChar.insert(charIndex);
1383  auto entryStart = fcitx::utf8::nextNChar(entry.begin(), codeIndex);
1384  auto entryEnd = fcitx::utf8::nextChar(entryStart);
1385 
1386  newKey.append(entryStart, entryEnd);
1387  }
1388 
1389  if (success && !newKey.empty()) {
1390  key = newKey;
1391  return true;
1392  }
1393  }
1394 
1395  return false;
1396 }
1397 
1398 bool TableBasedDictionary::isInputCode(uint32_t c) const {
1399  FCITX_D();
1400  return !!(d->inputCode_.count(c));
1401 }
1402 
1403 bool TableBasedDictionary::isAllInputCode(std::string_view code) const {
1404  std::string_view::iterator iter = code.begin();
1405  std::string_view::iterator end = code.end();
1406  while (iter != end) {
1407  uint32_t chr;
1408  iter = fcitx::utf8::getNextChar(iter, end, &chr);
1409  if (!fcitx::utf8::isValidChar(chr) || !isInputCode(chr)) {
1410  return false;
1411  }
1412  }
1413  return true;
1414 }
1415 
1416 bool TableBasedDictionary::isEndKey(uint32_t c) const {
1417  FCITX_D();
1418  return d->options_.endKey().contains(c);
1419 }
1420 
1421 void TableBasedDictionary::statistic() const {
1422  FCITX_D();
1423  std::cout << "Phrase Trie: " << d->phraseTrie_.mem_size() << '\n'
1424  << "Single Char Trie: " << d->singleCharTrie_.mem_size() << '\n'
1425  << "Single char const trie: "
1426  << d->singleCharConstTrie_.mem_size() << " + "
1427  << d->singleCharLookupTrie_.mem_size() << '\n'
1428  << "Prompt Trie: " << d->promptTrie_.mem_size() << '\n';
1429 }
1430 
1431 void TableBasedDictionary::setTableOptions(TableOptions option) {
1432  FCITX_D();
1433  d->options_ = std::move(option);
1434  if (d->options_.autoSelectLength() < 0) {
1435  d->options_.setAutoSelectLength(maxLength());
1436  }
1437  if (d->options_.noMatchAutoSelectLength() < 0) {
1438  d->options_.setNoMatchAutoSelectLength(maxLength());
1439  }
1440  if (d->options_.autoPhraseLength() < 0) {
1441  d->options_.setAutoPhraseLength(maxLength());
1442  }
1443  d->autoSelectRegex_.reset();
1444  d->noMatchAutoSelectRegex_.reset();
1445  try {
1446  if (!d->options_.autoSelectRegex().empty()) {
1447  d->autoSelectRegex_.emplace(d->options_.autoSelectRegex());
1448  }
1449  } catch (...) {
1450  }
1451  try {
1452  if (!d->options_.noMatchAutoSelectRegex().empty()) {
1453  d->noMatchAutoSelectRegex_.emplace(
1454  d->options_.noMatchAutoSelectRegex());
1455  }
1456  } catch (...) {
1457  }
1458 }
1459 
1460 const TableOptions &TableBasedDictionary::tableOptions() const {
1461  FCITX_D();
1462  return d->options_;
1463 }
1464 
1465 bool TableBasedDictionary::hasPinyin() const {
1466  FCITX_D();
1467  return d->pinyinKey_;
1468 }
1469 
1470 uint32_t TableBasedDictionary::maxLength() const {
1471  FCITX_D();
1472  return d->codeLength_;
1473 }
1474 
1475 bool TableBasedDictionary::isValidLength(size_t length) const {
1476  FCITX_D();
1477  return length <= d->codeLength_;
1478 }
1479 bool TableBasedDictionary::matchWords(
1480  std::string_view code, TableMatchMode mode,
1481  const TableMatchCallback &callback) const {
1482  FCITX_D();
1483  return d->matchWordsInternal(code, mode, false, callback);
1484 }
1485 
1486 bool TableBasedDictionary::hasMatchingWords(std::string_view code,
1487  std::string_view next) const {
1488  std::string str{code};
1489  str.append(next.data(), next.size());
1490  return hasMatchingWords(str);
1491 }
1492 
1493 bool TableBasedDictionary::hasMatchingWords(std::string_view code) const {
1494  FCITX_D();
1495  bool hasMatch = false;
1496  d->matchWordsInternal(
1497  code, TableMatchMode::Prefix, true,
1498  [&hasMatch](std::string_view, std::string_view, uint32_t, PhraseFlag) {
1499  hasMatch = true;
1500  return false;
1501  });
1502  return hasMatch;
1503 }
1504 
1505 bool TableBasedDictionary::hasOneMatchingWord(std::string_view code) const {
1506  // User dict may have the same entry, so we need to check if it is the same.
1507  std::optional<std::tuple<std::string, std::string>> previousMatch;
1508  matchWords(code, TableMatchMode::Prefix,
1509  [&previousMatch](std::string_view code, std::string_view word,
1510  uint32_t, PhraseFlag) {
1511  if (previousMatch) {
1512  if (std::get<0>(*previousMatch) == code &&
1513  std::get<1>(*previousMatch) == word) {
1514  return true;
1515  }
1516  previousMatch.reset();
1517  return false;
1518  }
1519  previousMatch.emplace(code, word);
1520  return true;
1521  });
1522  return previousMatch.has_value();
1523 }
1524 
1525 PhraseFlag TableBasedDictionary::wordExists(std::string_view code,
1526  std::string_view word) const {
1527  FCITX_D();
1528  auto entry = generateTableEntry(code, word);
1529 
1530  if (d->userTrie_.hasExactMatch(entry)) {
1531  return PhraseFlag::User;
1532  }
1533  if (d->hasExactMatchInPhraseTrie(entry) &&
1534  !d->deletionTrie_.hasExactMatch(entry)) {
1535  return PhraseFlag::None;
1536  }
1537 
1538  if (d->autoPhraseDict_.exactSearch(entry)) {
1539  return PhraseFlag::Auto;
1540  }
1541  return PhraseFlag::Invalid;
1542 }
1543 
1544 void TableBasedDictionary::removeWord(std::string_view code,
1545  std::string_view word) {
1546  FCITX_D();
1547  auto entry = generateTableEntry(code, word);
1548  d->autoPhraseDict_.erase(entry);
1549  d->userTrie_.erase(entry);
1550  if (d->hasExactMatchInPhraseTrie(entry) &&
1551  !d->deletionTrie_.hasExactMatch(entry)) {
1552  d->deletionTrie_.set(entry, 0);
1553  }
1554 }
1555 
1556 std::string TableBasedDictionary::reverseLookup(std::string_view word,
1557  PhraseFlag flag) const {
1558  FCITX_D();
1559  if (flag != PhraseFlag::ConstructPhrase && flag != PhraseFlag::None) {
1560  throw std::runtime_error("Invalid flag.");
1561  }
1562  std::string reverseEntry{word};
1563  reverseEntry.push_back(keyValueSeparator);
1564  std::string key;
1565  const auto &trie =
1566  (flag == PhraseFlag::ConstructPhrase ? d->singleCharConstTrie_
1567  : d->singleCharTrie_);
1568  trie.foreach(
1569  reverseEntry,
1570  [&trie, &key](int32_t, size_t len, DATrie<int32_t>::position_type pos) {
1571  trie.suffix(key, len, pos);
1572  return false;
1573  });
1574  return key;
1575 }
1576 
1577 std::string TableBasedDictionary::hint(std::string_view key) const {
1578  FCITX_D();
1579  if (!d->promptKey_) {
1580  return std::string{key};
1581  }
1582 
1583  std::string result;
1584  auto range = fcitx::utf8::MakeUTF8CharRange(key);
1585  for (auto iter = std::begin(range); iter != std::end(range); iter++) {
1586  auto charRange = iter.charRange();
1587  std::string_view search(
1588  &*charRange.first,
1589  std::distance(charRange.first, charRange.second));
1590  std::string entry;
1591  d->promptTrie_.foreach(
1592  generateTableEntry(search, ""),
1593  [&entry, d](uint32_t, size_t len,
1594  DATrie<uint32_t>::position_type pos) {
1595  d->promptTrie_.suffix(entry, len, pos);
1596  return false;
1597  });
1598  if (!entry.empty()) {
1599  result.append(entry);
1600  } else {
1601  result.append(charRange.first, charRange.second);
1602  }
1603  }
1604  return result;
1605 }
1606 
1607 void TableBasedDictionary::matchPrefixImpl(
1608  const SegmentGraph &graph, const GraphMatchCallback &callback,
1609  const std::unordered_set<const SegmentGraphNode *> &ignore,
1610  void * /*helper*/) const {
1611  FCITX_D();
1612  auto range = fcitx::utf8::MakeUTF8CharRange(graph.data());
1613  auto hasWildcard =
1614  d->options_.matchingKey() &&
1615  std::any_of(std::begin(range), std::end(range),
1616  [d](uint32_t c) { return d->options_.matchingKey() == c; });
1617 
1618  const TableMatchMode mode = tableOptions().exactMatch() || hasWildcard
1619  ? TableMatchMode::Exact
1620  : TableMatchMode::Prefix;
1621  SegmentGraphPath path;
1622  path.reserve(2);
1623  graph.bfs(&graph.start(), [this, &ignore, &path, &callback, hasWildcard,
1624  mode](const SegmentGraphBase &graph,
1625  const SegmentGraphNode *node) {
1626  if (!node->prevSize() || ignore.contains(node)) {
1627  return true;
1628  }
1629  for (const auto &prev : node->prevs()) {
1630  path.clear();
1631  path.push_back(&prev);
1632  path.push_back(node);
1633 
1634  auto code = graph.segment(*path[0], *path[1]);
1635  if (code.size() == graph.size()) {
1636  matchWords(
1637  code, mode,
1638  [&](std::string_view code, std::string_view word,
1639  uint32_t index, PhraseFlag flag) {
1640  // Do not return user for noSortInputLength, so code
1641  // shorter than noSortInputLength is always in stable
1642  // order.
1643  if (flag == PhraseFlag::User &&
1644  code.size() <= tableOptions().noSortInputLength()) {
1645  return true;
1646  }
1647 
1648  WordNode wordNode(word, InvalidWordIndex);
1649 
1650  // for length 1 "pinyin", skip long pinyin as an
1651  // optimization.
1652  if (flag == PhraseFlag::Pinyin && graph.size() == 1 &&
1653  code.size() != 1) {
1654  return true;
1655  }
1656  callback(path, wordNode, 0,
1657  std::make_unique<TableLatticeNodePrivate>(
1658  code, index, flag));
1659  return true;
1660  });
1661  } else if (!hasWildcard) {
1662  // use it as a buffer.
1663  std::string entry;
1664  FCITX_D();
1665  d->singleCharLookupTrie_.foreach(
1666  code, [&](uint32_t, size_t len,
1667  DATrie<uint32_t>::position_type pos) {
1668  d->singleCharLookupTrie_.suffix(entry,
1669  code.size() + len, pos);
1670 
1671  auto sep = entry.find(keyValueSeparator);
1672  if (sep == std::string::npos) {
1673  return true;
1674  }
1675 
1676  std::string_view ref(entry);
1677  auto code = ref.substr(0, sep);
1678  auto word = ref.substr(sep + 1);
1679 
1680  WordNode wordNode(word, InvalidWordIndex);
1681  callback(path, wordNode, 0,
1682  std::make_unique<TableLatticeNodePrivate>(
1683  code, 0, PhraseFlag::ConstructPhrase));
1684  return true;
1685  });
1686  }
1687  }
1688  return true;
1689  });
1690 }
1691 } // namespace libime
void insert(const std::string &entry, uint32_t value=0)
Insert a word into dictionary and refresh the MRU.
Provide a DATrie implementation.
bool search(std::string_view s, const std::function< bool(std::string_view, uint32_t)> &callback) const
Check if any word starting with s exists in the dictionary.