// Includes, followed by the file-level constants: the separator byte placed
// between key and value inside trie entries, and the magic/version numbers of
// the three binary formats handled in this file.
7 #include "libime/table/tablebaseddictionary.h" 27 #include <string_view> 29 #include <unordered_set> 32 #include <fcitx-utils/log.h> 33 #include <fcitx-utils/macros.h> 34 #include <fcitx-utils/stringutils.h> 35 #include <fcitx-utils/utf8.h> 37 #include "libime/core/dictionary.h" 38 #include "libime/core/languagemodel.h" 39 #include "libime/core/lattice.h" 40 #include "libime/core/segmentgraph.h" 41 #include "libime/core/utils_p.h" 42 #include "libime/core/zstdfilter.h" 43 #include "autophrasedict.h" 44 #include "constants.h" 46 #include "tablebaseddictionary_p.h" 47 #include "tabledecoder_p.h" 48 #include "tableoptions.h" 49 #include "tablerule.h" 55 constexpr
char keyValueSeparator =
'\x01';
// NUL-terminated string form of keyValueSeparator, usable in string
// concatenation.
56 constexpr
char keyValueSeparatorString[] = {keyValueSeparator,
'\0'};
// Magic number and version of the main table binary format.
58 constexpr uint32_t tableBinaryFormatMagic = 0x000fcabe;
59 constexpr uint32_t tableBinaryFormatVersion = 0x2;
// Magic number and version of the user table binary format.
60 constexpr uint32_t userTableBinaryFormatMagic = 0x356fcabe;
61 constexpr uint32_t userTableBinaryFormatVersion = 0x3;
// Magic number and version of the extra table binary format.
62 constexpr uint32_t extraTableBinaryFormatMagic = 0x6b0fcabe;
63 constexpr uint32_t extraTableBinaryFormatVersion = 0x1;
// Section of a text-format table that loadText() is currently parsing.
enum class BuildPhase {
    PhaseConfig, // option lines before any section marker
    PhaseRule,   // lines after the rule section marker
    PhaseData,   // lines after the data section marker
    PhasePhrase, // lines after the phrase section marker
};
81 std::string_view strConst[2][STR_LAST] = {
82 {
"键码=",
"码长=",
"规避字符=",
"拼音=",
"拼音长度=",
"[数据]",
83 "[组词规则]",
"提示=",
"构词=",
"[词组]"},
84 {
"KeyCode=",
"Length=",
"InvalidChar=",
"Pinyin=",
"PinyinLength=",
85 "[Data]",
"[Rule]",
"Prompt=",
"ConstructPhrase=",
"[Phrase]"}};
// Line that switches the text user dictionary parser to auto-phrase entries.
constexpr std::string_view UserDictAutoMark{"[Auto]"};
// Line that switches the text user dictionary parser to deleted entries.
constexpr std::string_view UserDictDeleteMark{"[Delete]"};
92 inline std::string generateTableEntry(std::string_view key,
93 std::string_view value) {
94 return fcitx::stringutils::concat(key, keyValueSeparatorString, value);
97 inline std::string generateTableEntry(uint32_t pinyinKey, std::string_view key,
98 std::string_view value) {
99 return fcitx::stringutils::concat(fcitx::utf8::UCS4ToUTF8(pinyinKey), key,
100 keyValueSeparatorString, value);
103 void maybeUnescapeValue(std::string &value) {
104 if (
auto unescape = fcitx::stringutils::unescapeForValue(value)) {
105 value = unescape.value();
// Returns value in the form written to text dictionaries. escapeForValue() is
// applied first; when its result has the same size as the input, the input is
// returned unchanged. Otherwise the escaped text is used, and it is wrapped
// in double quotes by the concat() call below.
// NOTE(review): the inner check tests whether the escaped text is already
// quoted; presumably that case returns it without adding quotes - confirm.
109 std::string maybeEscapeValue(std::string_view value) {
110 auto escaped = fcitx::stringutils::escapeForValue(value);
111 if (escaped.size() != value.size()) {
112 if (escaped.starts_with(
"\"") && escaped.ends_with(
"\"")) {
115 return fcitx::stringutils::concat(
"\"", escaped,
"\"");
117 return std::string{value};
// Records key as a code for value in the reverse-lookup trie `trie`, whose
// entries have the form value + separator + key. The matching forward entry
// (key + separator + value) is mirrored into reverseTrie.
// NOTE(review): callers in this file pass nullptr as reverseTrie for
// singleCharTrie_; verify that the reverseTrie-> uses below are guarded
// against a null pointer.
120 void updateReverseLookupEntry(DATrie<int32_t> &trie, std::string_view key,
121 std::string_view value,
122 DATrie<int32_t> *reverseTrie) {
// Prefix (value + separator) shared by every code stored for this value.
124 auto reverseEntry = generateTableEntry(value,
"");
// Visit the codes already stored for value; len/pos identify the stored
// suffix, which is read into oldKey and compared in length with the new key.
126 trie.foreach(reverseEntry,
127 [&trie, &key, &value, &insert, reverseTrie](
128 int32_t,
size_t len, DATrie<int32_t>::position_type pos) {
129 if (key.length() > len) {
131 trie.suffix(oldKey, len, pos);
134 auto entry = generateTableEntry(oldKey, value);
135 reverseTrie->erase(entry);
// Store the new reverse entry and its forward counterpart.
143 reverseEntry.append(key.begin(), key.end());
144 trie.set(reverseEntry, 1);
146 auto entry = generateTableEntry(key, value);
147 reverseTrie->set(entry, 1);
// Writes the entries of trie to out as text, one "key value" pair per entry
// with the value passed through maybeEscapeValue(). Entries are collected
// first and then sorted by their stored uint32_t value in ascending order.
// NOTE(review): the callback spells the position type as
// DATrie<int32_t>::position_type although trie is a DATrie<uint32_t>;
// confirm both name the same type.
152 void saveTrieToText(
const DATrie<uint32_t> &trie, std::ostream &out) {
// (key, value, stored index) for every trie entry.
154 std::vector<std::tuple<std::string, std::string, uint32_t>> temp;
155 trie.foreach([&trie, &buf, &temp](uint32_t value,
size_t _len,
156 DATrie<int32_t>::position_type pos) {
157 trie.suffix(buf, _len, pos);
// Split the stored entry at the separator into key and value.
158 auto sep = buf.find(keyValueSeparator);
159 std::string_view ref(buf);
160 temp.emplace_back(ref.substr(0, sep), ref.substr(sep + 1), value);
// Order by the stored index so output follows ascending index order.
163 std::sort(temp.begin(), temp.end(), [](
const auto &lhs,
const auto &rhs) {
164 return std::get<uint32_t>(lhs) < std::get<uint32_t>(rhs);
166 for (
auto &item : temp) {
167 out << std::get<0>(item) <<
" " << maybeEscapeValue(std::get<1>(item))
// Scans the values stored in trie and tracks the largest (value + 1) seen,
// i.e. one past the highest stored value.
172 uint32_t maxValue(
const DATrie<uint32_t> &trie) {
175 [&max](uint32_t value,
size_t, DATrie<uint32_t>::position_type) {
176 max = std::max(value + 1, max);
// Stores entry in trie with the current value of index. The guard detects an
// entry that already exists while updateExisting is false.
// NOTE(review): index is taken by reference, presumably so it can be advanced
// after a successful insert - confirm.
182 bool insertOrUpdateTrie(DATrie<uint32_t> &trie, uint32_t &index,
183 std::string_view entry,
bool updateExisting) {
185 if (trie.hasExactMatch(entry) && !updateExisting) {
188 trie.set(entry, index);
// Checks a (key, value) pair before insertion. The first condition detects
// invalid UTF-8 in either string, or - when codeLength_ is set and the flag is
// not Pinyin - a key length rejected by isValidLength(). The second condition
// detects a non-Pinyin key containing characters outside inputCode_ (only
// when inputCode_ is non-empty).
195 bool TableBasedDictionaryPrivate::validateKeyValue(std::string_view key,
196 std::string_view value,
197 PhraseFlag flag)
const {
// Lengths in UTF-8 characters; INVALID_LENGTH marks malformed input.
199 auto keyLength = fcitx::utf8::lengthValidated(key);
200 auto valueLength = fcitx::utf8::lengthValidated(value);
201 if (keyLength == fcitx::utf8::INVALID_LENGTH ||
202 valueLength == fcitx::utf8::INVALID_LENGTH ||
203 (codeLength_ && flag != PhraseFlag::Pinyin &&
204 !q->isValidLength(keyLength))) {
207 if (!inputCode_.empty() && flag != PhraseFlag::Pinyin &&
208 !q->isAllInputCode(key)) {
// Maps a phrase flag to the trie that stores it and that trie's index
// counter: None and Pinyin use phraseTrie_/phraseTrieIndex_, User uses
// userTrie_/userTrieIndex_. The final return yields a {nullptr, nullptr} pair.
215 std::pair<DATrie<uint32_t> *, uint32_t *>
216 TableBasedDictionaryPrivate::trieByFlag(PhraseFlag flag) {
218 case PhraseFlag::None:
219 case PhraseFlag::Pinyin:
220 return {&phraseTrie_, &phraseTrieIndex_};
222 case PhraseFlag::User:
223 return {&userTrie_, &userTrieIndex_};
226 return {
nullptr,
nullptr};
// Const overload of trieByFlag(): same mapping from flag to trie and index
// counter, returning pointers to const. The final return yields a
// {nullptr, nullptr} pair.
230 std::pair<const DATrie<uint32_t> *,
const uint32_t *>
231 TableBasedDictionaryPrivate::trieByFlag(PhraseFlag flag)
const {
233 case PhraseFlag::None:
234 case PhraseFlag::Pinyin:
235 return {&phraseTrie_, &phraseTrieIndex_};
237 case PhraseFlag::User:
238 return {&userTrie_, &userTrieIndex_};
241 return {
nullptr,
nullptr};
// Stores (key, value) in the trie selected by trieByFlag(flag). Pinyin
// entries are prefixed with pinyinKey_. Inserting a User entry also removes
// the same entry from deletionTrie_, and only User entries are allowed to
// overwrite an existing entry (updateExisting == (flag == User)).
245 bool TableBasedDictionaryPrivate::insert(std::string_view key,
246 std::string_view value,
248 DATrie<uint32_t> *trie;
251 auto pair = trieByFlag(flag);
256 if (flag == PhraseFlag::Pinyin) {
257 entry = generateTableEntry(pinyinKey_, key, value);
259 entry = generateTableEntry(key, value);
// A re-added user phrase must no longer be listed as deleted.
262 if (flag == PhraseFlag::User) {
263 deletionTrie_.erase(entry);
266 return insertOrUpdateTrie(*trie, *index, entry, flag == PhraseFlag::User);
// Looks up code in trie and reports matching entries through callback.
//
// The code is consumed one UTF-8 character at a time while a set of reachable
// trie positions is tracked. For non-Pinyin lookups, a character equal to a
// non-zero options_.matchingKey() is treated as a wildcard: every character in
// inputCode_ is tried in its place. Afterwards the entries below each
// surviving position are enumerated, split at keyValueSeparator, and passed to
// callback as (matched code, word, value + indexOffset, flag). In Exact mode
// only entries whose code has the same character count as `code` are
// reported; in Prefix mode all are.
269 bool TableBasedDictionaryPrivate::matchTrie(
270 const DATrie<uint32_t> &trie, uint32_t indexOffset, std::string_view code,
271 TableMatchMode mode, PhraseFlag flag,
272 const TableMatchCallback &callback)
const {
273 auto range = fcitx::utf8::MakeUTF8CharRange(code);
// Start from the trie root (position 0).
274 std::vector<DATrie<uint32_t>::position_type> positions;
275 positions.push_back(0);
277 for (
auto iter = std::begin(range), end = std::end(range); iter != end;
279 decltype(positions) newPositions;
281 for (
auto position : positions) {
// Wildcard character: branch into every configured input code.
282 if (flag != PhraseFlag::Pinyin && *iter == options_.matchingKey() &&
283 options_.matchingKey()) {
284 for (
auto code : inputCode_) {
285 auto curPos = position;
286 auto strCode = fcitx::utf8::UCS4ToUTF8(code);
287 auto result = trie.traverse(strCode, curPos);
288 if (!DATrie<unsigned int>::isNoPath(result)) {
289 newPositions.push_back(curPos);
// Regular character: follow its UTF-8 bytes from the current position.
293 auto charRange = iter.charRange();
294 std::string_view chr(
296 std::distance(charRange.first, charRange.second));
297 auto curPos = position;
298 auto result = trie.traverse(chr, curPos);
299 if (!DATrie<unsigned int>::isNoPath(result)) {
300 newPositions.push_back(curPos);
305 positions = std::move(newPositions);
// Callback applied to each entry below a surviving position.
308 auto matchWord = [&trie, &code, &callback, flag, mode,
309 indexOffset](uint32_t value,
size_t len,
310 DATrie<int32_t>::position_type pos) {
// Rebuild the full entry: the consumed code bytes plus the remaining suffix.
312 trie.suffix(entry, code.size() + len, pos);
313 auto sep = entry.find(keyValueSeparator, code.size());
314 if (sep == std::string::npos) {
318 auto view = std::string_view(entry);
319 auto matchedCode = view.substr(0, sep);
320 if (mode == TableMatchMode::Prefix ||
321 (mode == TableMatchMode::Exact &&
322 fcitx::utf8::length(matchedCode) == fcitx::utf8::length(code))) {
// Pinyin entries carry a one-character prefix that is not part of the code.
324 if (flag == PhraseFlag::Pinyin) {
325 matchedCode.remove_prefix(
326 fcitx::utf8::ncharByteLength(matchedCode.begin(), 1));
328 return callback(matchedCode, view.substr(sep + 1),
329 value + indexOffset, flag);
334 for (
auto position : positions) {
335 if (!trie.foreach(matchWord, position)) {
// Matches code against the trie belonging to flag. For PhraseFlag::None the
// extra tries are searched as well; their reported indices are offset by
// phraseTrieIndex_ plus the index counts of the extra tries before them.
// NOTE(review): trieByFlag() can return a null trie pointer, which is
// dereferenced here; callers presumably pass only None, Pinyin or User -
// confirm.
342 bool TableBasedDictionaryPrivate::matchTrie(
343 std::string_view code, TableMatchMode mode, PhraseFlag flag,
344 const TableMatchCallback &callback)
const {
345 const auto &trie = *trieByFlag(flag).first;
346 if (!matchTrie(trie, 0, code, mode, flag, callback)) {
350 if (flag == PhraseFlag::None) {
351 unsigned int accumulatedIndex = phraseTrieIndex_;
352 for (
const auto &[trie, index] : extraTries_) {
353 if (!matchTrie(trie, accumulatedIndex, code, mode, flag,
357 accumulatedIndex += index;
// Returns the dictionary state to empty: special keys and trie index counters
// are zeroed and the containers, tries and auto-phrase dictionary listed
// below are cleared.
364 void TableBasedDictionaryPrivate::reset() {
365 pinyinKey_ = promptKey_ = phraseKey_ = 0;
366 phraseTrieIndex_ = userTrieIndex_ = 0;
369 ignoreChars_.clear();
371 rules_.shrink_to_fit();
373 singleCharTrie_.clear();
374 singleCharConstTrie_.clear();
375 singleCharLookupTrie_.clear();
378 autoPhraseDict_.clear();
379 deletionTrie_.clear();
// Consistency check of the loaded configuration. The conditions detect an
// empty inputCode_ and a pinyin, prompt or phrase key that is also an input
// code.
// NOTE(review): presumably each detected condition makes validation fail -
// confirm.
381 bool TableBasedDictionaryPrivate::validate()
const {
382 if (inputCode_.empty()) {
385 if (inputCode_.contains(pinyinKey_)) {
388 if (inputCode_.contains(promptKey_)) {
391 if (inputCode_.contains(phraseKey_)) {
397 std::optional<std::tuple<std::string, std::string, PhraseFlag>>
398 TableBasedDictionaryPrivate::parseDataLine(std::string_view buf,
bool user) {
399 uint32_t special[3] = {pinyinKey_, phraseKey_, promptKey_};
400 PhraseFlag specialFlag[] = {PhraseFlag::Pinyin, PhraseFlag::ConstructPhrase,
402 auto spacePos = buf.find_first_of(FCITX_WHITESPACE);
403 if (spacePos == std::string::npos || spacePos + 1 == buf.size()) {
406 auto wordPos = buf.find_first_not_of(FCITX_WHITESPACE, spacePos);
407 if (spacePos == std::string::npos || spacePos + 1 == buf.size()) {
411 auto key = std::string_view(buf).substr(0, spacePos);
412 std::string value{std::string_view(buf).substr(wordPos)};
413 maybeUnescapeValue(value);
415 if (key.empty() || value.empty()) {
420 std::string_view::iterator next =
421 fcitx::utf8::getNextChar(key.begin(), key.end(), &firstChar);
422 auto *iter = std::find(std::begin(special), std::end(special), firstChar);
423 PhraseFlag flag = user ? PhraseFlag::User : PhraseFlag::None;
424 if (iter != std::end(special)) {
429 flag = specialFlag[iter - std::begin(special)];
430 key = key.substr(std::distance(key.begin(), next));
433 return std::tuple<std::string, std::string, PhraseFlag>{
434 key, std::move(value), flag};
// Parses buf with parseDataLine() and, when parsing succeeds, inserts the
// resulting (key, value, flag) through the public dictionary's insert().
437 void TableBasedDictionaryPrivate::insertDataLine(std::string_view buf,
439 if (
auto data = parseDataLine(buf, user)) {
440 auto &[key, value, flag] = *data;
442 q_func()->insert(key, value, flag);
// Runs a lookup for code across the dictionary's sources, reporting results
// through callback and logging the elapsed time after each stage:
//   1. the main phrase tries (PhraseFlag::None), with entries listed in
//      deletionTrie_ filtered before reaching callback;
//   2. pinyin entries, looked up as pinyinKey_ + code;
//   3. the user trie;
//   4. the auto-phrase dictionary.
446 bool TableBasedDictionaryPrivate::matchWordsInternal(
447 std::string_view code, TableMatchMode mode,
bool onlyChecking,
448 const TableMatchCallback &callback)
const {
449 auto t0 = std::chrono::high_resolution_clock::now();
// Stage 1: main tries, wrapped so deleted entries can be detected.
452 if (!matchTrie(code, mode, PhraseFlag::None,
453 [&callback,
this](std::string_view code,
454 std::string_view word, uint32_t index,
456 if (!deletionTrie_.empty()) {
457 auto entry = generateTableEntry(code, word);
458 if (deletionTrie_.hasExactMatch(entry)) {
462 return callback(code, word, index, flag);
467 LIBIME_TABLE_DEBUG() <<
"Match trie: " << millisecondsTill(t0);
// Stage 2: pinyin entries are stored under pinyinKey_ followed by the code.
471 auto pinyinCode = fcitx::stringutils::concat(
472 fcitx::utf8::UCS4ToUTF8(pinyinKey_), code);
// Pinyin matching is exact by default and switches to prefix matching when
// only checking, or when the code length reaches one of the thresholds below.
474 auto pinyinMode = TableMatchMode::Exact;
475 int codeLength = fcitx::utf8::length(code);
476 if (onlyChecking || codeLength >= options_.autoSelectLength() ||
477 static_cast<size_t>(codeLength) > codeLength_ ||
478 codeLength >= options_.noMatchAutoSelectLength()) {
479 pinyinMode = TableMatchMode::Prefix;
481 if (!matchTrie(pinyinCode, pinyinMode, PhraseFlag::Pinyin, callback)) {
486 LIBIME_TABLE_DEBUG() <<
"Match pinyin: " << millisecondsTill(t0);
// Stage 3: user trie.
489 if (!matchTrie(code, mode, PhraseFlag::User, callback)) {
493 LIBIME_TABLE_DEBUG() <<
"Match user: " << millisecondsTill(t0);
// Stage 4: auto-phrase entries, split at the separator and filtered by mode in
// the same way as trie entries; they are reported with index 0.
494 auto matchAutoPhrase = [mode, code, &callback](std::string_view entry,
496 auto sep = entry.find(keyValueSeparator, code.size());
497 if (sep == std::string::npos) {
501 auto view = std::string_view(entry);
502 auto matchedCode = view.substr(0, sep);
503 if (mode == TableMatchMode::Prefix ||
504 (mode == TableMatchMode::Exact &&
505 fcitx::utf8::length(matchedCode) == fcitx::utf8::length(code))) {
506 return callback(matchedCode, view.substr(sep + 1), 0,
512 return autoPhraseDict_.search(code, matchAutoPhrase);
// Checks the per-character code hints against rule. For every
// non-placeholder rule entry, the hint of the character it refers to (counted
// from the front or from the back) is located; a hint with fewer characters
// than the rule entry's code index requires is replaced by an empty string.
// NOTE(review): the first condition of the length test is on a line not shown
// here; the description above covers only the visible part.
515 bool TableBasedDictionaryPrivate::validateHints(std::vector<std::string> &hints,
516 const TableRule &rule)
const {
517 if (hints.size() <= 1) {
521 for (
const auto &ruleEntry : rule.entries()) {
523 if (ruleEntry.isPlaceHolder()) {
// The rule refers to a character position beyond the available hints.
527 if (ruleEntry.character() > hints.size()) {
// character() is 1-based; translate it to a 0-based index from either end.
532 if (ruleEntry.flag() == TableRuleEntryFlag::FromFront) {
533 index = ruleEntry.character() - 1;
535 index = hints.size() - ruleEntry.character();
537 assert(index < hints.size());
542 fcitx::utf8::length(hints[index]) <
543 static_cast<size_t>(std::abs(ruleEntry.index()))) {
544 hints[index] = std::string();
551 bool TableBasedDictionaryPrivate::hasExactMatchInPhraseTrie(
552 std::string_view entry)
const {
553 return phraseTrie_.hasExactMatch(entry) ||
554 std::any_of(extraTries_.begin(), extraTries_.end(),
555 [&entry](
const auto &extraTrie) {
556 return extraTrie.first.hasExactMatch(entry);
// Reads the body of a binary table from in, in the same field order that
// saveBinary() writes: the three special keys, the code length, the
// size-prefixed input code and ignore-character sets, the size-prefixed rule
// list, and then the tries. phraseTrieIndex_ is recomputed from the loaded
// phrase trie. I/O failures are raised by throw_if_io_fail().
560 void TableBasedDictionaryPrivate::loadBinary(std::istream &in) {
562 throw_if_io_fail(unmarshall(in, pinyinKey_));
563 throw_if_io_fail(unmarshall(in, promptKey_));
564 throw_if_io_fail(unmarshall(in, phraseKey_));
565 throw_if_io_fail(unmarshall(in, codeLength_));
// Input codes: element count followed by the elements.
568 throw_if_io_fail(unmarshall(in, size));
572 throw_if_io_fail(unmarshall(in, c));
573 inputCode_.insert(c);
// Ignored characters: element count followed by the elements.
576 throw_if_io_fail(unmarshall(in, size));
577 ignoreChars_.clear();
580 throw_if_io_fail(unmarshall(in, c));
581 ignoreChars_.insert(c);
// Rules: element count followed by the serialized rules.
584 throw_if_io_fail(unmarshall(in, size));
587 rules_.emplace_back(in);
589 phraseTrie_ = decltype(phraseTrie_)(in);
590 phraseTrieIndex_ = maxValue(phraseTrie_);
591 singleCharTrie_ = decltype(singleCharTrie_)(in);
593 singleCharConstTrie_ = decltype(singleCharConstTrie_)(in);
594 singleCharLookupTrie_ = decltype(singleCharLookupTrie_)(in);
597 promptTrie_ = decltype(promptTrie_)(in);
// Reads the body of a binary user dictionary from in: the user trie (with
// userTrieIndex_ recomputed from it), the auto-phrase dictionary, and the
// deletion trie, which is either read from the stream or reset to empty.
// NOTE(review): which branch applies presumably depends on the format
// version - confirm.
601 void TableBasedDictionaryPrivate::loadUserBinary(std::istream &in,
603 userTrie_ = decltype(userTrie_)(in);
604 userTrieIndex_ = maxValue(userTrie_);
605 autoPhraseDict_ = decltype(autoPhraseDict_)(TABLE_AUTOPHRASE_SIZE, in);
608 deletionTrie_ = decltype(deletionTrie_)(in);
610 deletionTrie_ = decltype(deletionTrie_)();
// Constructs the dictionary together with its private implementation object,
// which receives a pointer back to this instance.
614 TableBasedDictionary::TableBasedDictionary()
615 : d_ptr(std::make_unique<TableBasedDictionaryPrivate>(this)) {
620 TableBasedDictionary::~TableBasedDictionary() =
default;
// Loads the main table from the file at filename. The file is opened in
// binary mode and a failure to open is raised by throw_if_io_fail().
622 void TableBasedDictionary::load(
const char *filename, TableFormat format) {
623 std::ifstream in(filename, std::ios::in | std::ios::binary);
624 throw_if_io_fail(in);
// Loads the main table from in, dispatching on format (Binary or Text).
// Throws std::invalid_argument for any other format value.
628 void TableBasedDictionary::load(std::istream &in, TableFormat format) {
630 case TableFormat::Binary:
633 case TableFormat::Text:
637 throw std::invalid_argument(
"unknown format type");
// Parses a text-format table from in, line by line.
//
// Parsing is a small state machine over BuildPhase: option lines come first,
// a rule marker switches to rule parsing, a data marker switches to data
// lines, and a phrase marker switches to phrase lines. Keywords are accepted
// in both spellings held by strConst. Throws std::invalid_argument when
// validate() fails at the data marker, when a phrase section appears without
// rules, or when input ends before a data/phrase section was reached.
641 void TableBasedDictionary::loadText(std::istream &in) {
// Strips the keyword with the given STR_* index from the front of buf, trying
// the strConst[0] spelling first and then the strConst[1] spelling.
647 auto consumeOptionPrefix = [](std::string_view &buf,
int index) {
648 if (fcitx::stringutils::consumePrefix(buf, strConst[0][index])) {
651 if (fcitx::stringutils::consumePrefix(buf, strConst[1][index])) {
657 auto phase = BuildPhase::PhaseConfig;
659 if (!std::getline(in, buf)) {
// Lines are checked for valid UTF-8 before being interpreted.
664 if (!fcitx::utf8::validate(buf)) {
668 auto line = fcitx::stringutils::trimView(buf);
// Option section: "#" starts a comment line; otherwise dispatch on keyword.
671 case BuildPhase::PhaseConfig: {
672 if (line.starts_with(
"#")) {
676 if (consumeOptionPrefix(line, STR_KEYCODE)) {
677 auto range = fcitx::utf8::MakeUTF8CharRange(line);
678 d->inputCode_ = std::set<uint32_t>(range.begin(), range.end());
679 }
else if (consumeOptionPrefix(line, STR_CODELEN)) {
680 d->codeLength_ = std::stoi(std::string(line));
681 }
else if (consumeOptionPrefix(line, STR_PINYINLEN)) {
683 }
else if (consumeOptionPrefix(line, STR_IGNORECHAR)) {
684 auto range = fcitx::utf8::MakeUTF8CharRange(line);
686 std::set<uint32_t>(range.begin(), range.end());
687 }
else if (consumeOptionPrefix(line, STR_PINYIN)) {
688 const auto chr = fcitx::utf8::getChar(line);
689 if (fcitx::utf8::isValidChar(chr)) {
692 }
else if (consumeOptionPrefix(line, STR_PROMPT)) {
693 const auto chr = fcitx::utf8::getChar(line);
694 if (fcitx::utf8::isValidChar(chr)) {
697 }
else if (consumeOptionPrefix(line, STR_CONSTRUCTPHRASE)) {
698 const auto chr = fcitx::utf8::getChar(line);
699 if (fcitx::utf8::isValidChar(chr)) {
702 }
else if (consumeOptionPrefix(line, STR_DATA)) {
// Entering the data section: the configuration must be consistent by now.
703 phase = BuildPhase::PhaseData;
704 if (!d->validate()) {
705 throw std::invalid_argument(
"file format is invalid");
708 }
else if (consumeOptionPrefix(line, STR_RULE)) {
709 phase = BuildPhase::PhaseRule;
// Rule section: each non-comment line is a rule until the data marker.
714 case BuildPhase::PhaseRule: {
715 if (line.starts_with(
"#")) {
718 if (consumeOptionPrefix(line, STR_DATA)) {
719 phase = BuildPhase::PhaseData;
720 if (!d->validate()) {
721 throw std::invalid_argument(
"file format is invalid");
730 d->rules_.emplace_back(std::string(line), d->codeLength_);
// Data section: "key value" lines until the phrase marker.
733 case BuildPhase::PhaseData:
734 if (consumeOptionPrefix(line, STR_PHRASE)) {
735 phase = BuildPhase::PhasePhrase;
737 throw std::invalid_argument(
738 "file has phrase section but no rule");
742 d->insertDataLine(line,
false);
// Phrase section: each line is a phrase whose code is generated from rules.
744 case BuildPhase::PhasePhrase: {
745 std::string phrase{line};
746 maybeUnescapeValue(phrase);
747 insert(phrase, PhraseFlag::None);
// Input ended without reaching a data or phrase section.
753 if (phase != BuildPhase::PhaseData && phase != BuildPhase::PhasePhrase) {
754 throw_if_fail(in.bad(), std::ios_base::failure(
"io failed"));
755 throw std::invalid_argument(
"file format is invalid");
// Writes the main table to out in text format, using the strConst[1] keyword
// spellings. Output order: key codes, code length, ignored characters (only
// when non-empty), the pinyin / prompt / construct-phrase keys, the rule
// section, then the data section with prompt entries, construct-phrase
// entries and finally the phrase trie.
759 void TableBasedDictionary::saveText(std::ostream &out) {
761 out << strConst[1][STR_KEYCODE];
762 for (
auto c : d->inputCode_) {
763 out << fcitx::utf8::UCS4ToUTF8(c);
766 out << strConst[1][STR_CODELEN] << d->codeLength_ <<
'\n';
767 if (!d->ignoreChars_.empty()) {
768 out << strConst[1][STR_IGNORECHAR];
769 for (
auto c : d->ignoreChars_) {
770 out << fcitx::utf8::UCS4ToUTF8(c);
775 out << strConst[1][STR_PINYIN] << fcitx::utf8::UCS4ToUTF8(d->pinyinKey_)
779 out << strConst[1][STR_PROMPT] << fcitx::utf8::UCS4ToUTF8(d->promptKey_)
783 out << strConst[1][STR_CONSTRUCTPHRASE]
784 << fcitx::utf8::UCS4ToUTF8(d->phraseKey_) <<
'\n';
788 out << strConst[1][STR_RULE] <<
'\n';
789 for (
const auto &rule : d->rules_) {
790 out << rule.toString() <<
'\n';
793 out << strConst[1][STR_DATA] <<
'\n';
// Prompt entries: each line is the prompt key, the part after the separator,
// a space, and the escaped part before the separator.
796 auto promptString = fcitx::utf8::UCS4ToUTF8(d->promptKey_);
797 d->promptTrie_.foreach(
798 [&promptString, d, &buf,
799 &out](uint32_t,
size_t _len, DATrie<uint32_t>::position_type pos) {
800 d->promptTrie_.suffix(buf, _len, pos);
801 auto sep = buf.find(keyValueSeparator);
802 if (sep == std::string::npos) {
805 std::string_view ref(buf);
806 out << promptString << ref.substr(sep + 1) <<
" " 807 << maybeEscapeValue(ref.substr(0, sep)) <<
'\n';
// Construct-phrase entries, written in the same layout with the phrase key.
812 auto phraseString = fcitx::utf8::UCS4ToUTF8(d->phraseKey_);
813 d->singleCharConstTrie_.foreach(
814 [&phraseString, d, &buf, &out](int32_t,
size_t _len,
815 DATrie<int32_t>::position_type pos) {
816 d->singleCharConstTrie_.suffix(buf, _len, pos);
817 auto sep = buf.find(keyValueSeparator);
818 if (sep == std::string::npos) {
821 std::string_view ref(buf);
822 out << phraseString << ref.substr(sep + 1) <<
" " 823 << maybeEscapeValue(ref.substr(0, sep)) <<
'\n';
828 saveTrieToText(d->phraseTrie_, out);
// Loads the main table from a binary stream: checks the magic number, reads
// the version, and for tableBinaryFormatVersion hands the payload stream to
// the private loadBinary() through a callback. Throws std::invalid_argument
// on a wrong magic number or an unsupported version.
831 void TableBasedDictionary::loadBinary(std::istream &in) {
835 throw_if_io_fail(unmarshall(in, magic));
836 if (magic != tableBinaryFormatMagic) {
837 throw std::invalid_argument(
"Invalid table magic.");
839 throw_if_io_fail(unmarshall(in, version));
844 case tableBinaryFormatVersion:
846 in, [d](std::istream &compressIn) { d->loadBinary(compressIn); });
850 throw std::invalid_argument(
"Invalid table version.");
// Saves the main table to the file at filename. The file is opened in binary
// mode and a failure to open is raised by throw_if_io_fail().
854 void TableBasedDictionary::save(
const char *filename, TableFormat format) {
855 std::ofstream fout(filename, std::ios::out | std::ios::binary);
856 throw_if_io_fail(fout);
// Saves the main table to out, dispatching on format (Binary or Text).
// Throws std::invalid_argument for any other format value.
860 void TableBasedDictionary::save(std::ostream &out, TableFormat format) {
862 case TableFormat::Binary:
865 case TableFormat::Text:
869 throw std::invalid_argument(
"unknown format type");
// Writes the main table in binary format. The magic number and version go to
// origOut directly; the remaining fields are written through
// writeZSTDCompressed(): special keys, code length, the size-prefixed input
// code and ignore-character sets, the size-prefixed rule list, and the tries.
873 void TableBasedDictionary::saveBinary(std::ostream &origOut) {
874 throw_if_io_fail(marshall(origOut, tableBinaryFormatMagic));
875 throw_if_io_fail(marshall(origOut, tableBinaryFormatVersion));
877 writeZSTDCompressed(origOut, [
this](std::ostream &out) {
879 throw_if_io_fail(marshall(out, d->pinyinKey_));
880 throw_if_io_fail(marshall(out, d->promptKey_));
881 throw_if_io_fail(marshall(out, d->phraseKey_));
882 throw_if_io_fail(marshall(out, d->codeLength_));
// Container sizes are stored as uint32_t ahead of their elements.
884 marshall(out, static_cast<uint32_t>(d->inputCode_.size())));
885 for (
auto c : d->inputCode_) {
886 throw_if_io_fail(marshall(out, c));
889 marshall(out, static_cast<uint32_t>(d->ignoreChars_.size())));
890 for (
auto c : d->ignoreChars_) {
891 throw_if_io_fail(marshall(out, c));
894 marshall(out, static_cast<uint32_t>(d->rules_.size())));
895 for (
const auto &rule : d->rules_) {
896 throw_if_io_fail(out << rule);
898 d->phraseTrie_.save(out);
899 d->singleCharTrie_.save(out);
901 d->singleCharConstTrie_.save(out);
902 d->singleCharLookupTrie_.save(out);
905 d->promptTrie_.save(out);
910 void TableBasedDictionary::loadUser(
const char *filename, TableFormat format) {
911 std::ifstream in(filename, std::ios::in | std::ios::binary);
912 throw_if_io_fail(in);
913 loadUser(in, format);
// Loads the user dictionary from in.
//
// Binary: checks the user-table magic number, reads the version and loads
// the payload through loadUserBinary(), either directly from in or, for
// userTableBinaryFormatVersion, through readZSTDCompressed(). Throws
// std::invalid_argument on a wrong magic number or unsupported version.
//
// Text: reads lines in three sections. Lines before any marker are user
// phrases; the "[Auto]" marker starts auto-phrase lines and the "[Delete]"
// marker starts deleted entries.
//
// Any other format value throws std::invalid_argument.
916 void TableBasedDictionary::loadUser(std::istream &in, TableFormat format) {
919 uint32_t version = 0;
921 case TableFormat::Binary:
922 throw_if_io_fail(unmarshall(in, magic));
923 if (magic != userTableBinaryFormatMagic) {
924 throw std::invalid_argument(
"Invalid user table magic.");
926 throw_if_io_fail(unmarshall(in, version));
930 d->loadUserBinary(in, version);
932 case userTableBinaryFormatVersion:
933 readZSTDCompressed(in, [d, version](std::istream &compressIn) {
934 d->loadUserBinary(compressIn, version);
938 throw std::invalid_argument(
"Invalid user table version.");
941 case TableFormat::Text: {
943 enum class UserDictState { Phrase, Auto, Delete };
945 UserDictState state = UserDictState::Phrase;
947 if (!std::getline(in, lineBuf)) {
952 if (!fcitx::utf8::validate(lineBuf)) {
955 auto line = fcitx::stringutils::trimView(lineBuf);
// Section markers switch the parser state.
956 if (line == UserDictAutoMark) {
957 state = UserDictState::Auto;
960 if (line == UserDictDeleteMark) {
961 state = UserDictState::Delete;
966 case UserDictState::Phrase:
967 d->insertDataLine(line,
true);
// Auto-phrase line: exactly three whitespace-separated tokens (code, word,
// hit count); the code must consist of input codes only.
969 case UserDictState::Auto: {
970 auto tokens = fcitx::stringutils::split(line, FCITX_WHITESPACE);
971 if (tokens.size() != 3 || !isAllInputCode(tokens[0])) {
975 maybeUnescapeValue(tokens[1]);
976 int32_t hit = std::stoi(tokens[2]);
977 d->autoPhraseDict_.insert(
978 generateTableEntry(tokens[0], tokens[1]), hit);
979 }
catch (
const std::exception &) {
// Deleted entry: recorded in deletionTrie_ with value 0.
983 case UserDictState::Delete: {
984 if (
auto data = d->parseDataLine(line,
true)) {
985 auto &[key, value, flag] = *data;
986 auto entry = generateTableEntry(key, value);
987 d->deletionTrie_.set(entry, 0);
995 throw std::invalid_argument(
"unknown format type");
999 void TableBasedDictionary::saveUser(
const char *filename, TableFormat format) {
1000 std::ofstream fout(filename, std::ios::out | std::ios::binary);
1001 throw_if_io_fail(fout);
1002 saveUser(fout, format);
// Saves the user dictionary to out.
//
// Binary: writes the user-table magic number and version, then the user
// trie, auto-phrase dictionary and deletion trie through
// writeZSTDCompressed(), checking the stream after each part.
//
// Text: writes the user trie entries, then - when non-empty - an "[Auto]"
// section with "code word hit" lines and a "[Delete]" section with the
// deletion trie entries.
//
// Any other format value throws std::invalid_argument.
1005 void TableBasedDictionary::saveUser(std::ostream &out, TableFormat format) {
1008 case TableFormat::Binary: {
1009 throw_if_io_fail(marshall(out, userTableBinaryFormatMagic));
1010 throw_if_io_fail(marshall(out, userTableBinaryFormatVersion));
1012 writeZSTDCompressed(out, [d](std::ostream &compressOut) {
1013 d->userTrie_.save(compressOut);
1014 throw_if_io_fail(compressOut);
1015 d->autoPhraseDict_.save(compressOut);
1016 throw_if_io_fail(compressOut);
1017 d->deletionTrie_.save(compressOut);
1018 throw_if_io_fail(compressOut);
1022 case TableFormat::Text: {
1023 saveTrieToText(d->userTrie_, out);
1025 if (!d->autoPhraseDict_.empty()) {
1026 out << UserDictAutoMark <<
'\n';
// Collect all auto-phrase entries (searching with an empty code), then write
// them in the reverse of the order in which search() reported them.
1027 std::vector<std::tuple<std::string, std::string, int32_t>>
1029 d->autoPhraseDict_.search(
1030 "", [&autoEntries](std::string_view entry,
int hit) {
1031 auto sep = entry.find(keyValueSeparator);
1032 autoEntries.emplace_back(entry.substr(0, sep),
1033 entry.substr(sep + 1), hit);
1036 for (
auto &t : autoEntries | std::views::reverse) {
1037 out << std::get<0>(t) <<
" " << maybeEscapeValue(std::get<1>(t))
1038 <<
" " << std::get<2>(t) <<
'\n';
1041 if (!d->deletionTrie_.empty()) {
1042 out << UserDictDeleteMark <<
'\n';
1043 saveTrieToText(d->deletionTrie_, out);
1048 throw std::invalid_argument(
"unknown format type");
1052 size_t TableBasedDictionary::loadExtra(
const char *filename,
1053 TableFormat format) {
1054 std::ifstream in(filename, std::ios::in | std::ios::binary);
1055 throw_if_io_fail(in);
1056 return loadExtra(in, format);
// Loads an extra dictionary from in into a new trie, appends it to
// extraTries_ together with its index count, and returns its position in
// extraTries_.
//
// Binary: checks the extra-table magic number and version, loads the trie
// through readZSTDCompressed() and derives the index count with maxValue().
// Text: lines before the phrase marker are "key value" data lines (only
// entries with PhraseFlag::None are used); lines after it are phrases whose
// key is produced by generate(). Entries with an empty key or value are not
// inserted.
//
// Throws std::invalid_argument on a wrong magic number, unsupported version
// or unknown format.
// NOTE(review): the magic/version error messages say "user table" although
// this function loads an extra table - looks like copied wording; confirm
// before changing, since callers or tests may match on the text.
1059 size_t TableBasedDictionary::loadExtra(std::istream &in, TableFormat format) {
1061 DATrie<uint32_t> trie;
1064 uint32_t version = 0;
1066 case TableFormat::Binary:
1067 throw_if_io_fail(unmarshall(in, magic));
1068 if (magic != extraTableBinaryFormatMagic) {
1069 throw std::invalid_argument(
"Invalid user table magic.");
1071 throw_if_io_fail(unmarshall(in, version));
1073 case extraTableBinaryFormatVersion:
1074 readZSTDCompressed(in, [&trie](std::istream &compressIn) {
1075 trie.load(compressIn);
1077 index = maxValue(trie);
1080 throw std::invalid_argument(
"Invalid user table version.");
1083 case TableFormat::Text: {
1084 std::string lineBuf;
1085 enum class ExtraDictState { Data, Phrase };
1087 ExtraDictState state = ExtraDictState::Data;
1089 if (!std::getline(in, lineBuf)) {
1094 if (!fcitx::utf8::validate(lineBuf)) {
1097 const auto line = fcitx::stringutils::trimView(lineBuf);
// The phrase marker (either spelling) switches to phrase lines.
1098 if (line == strConst[0][STR_PHRASE] ||
1099 line == strConst[1][STR_PHRASE]) {
1100 state = ExtraDictState::Phrase;
1108 case ExtraDictState::Data:
1109 if (
auto data = d->parseDataLine(line,
false)) {
1110 std::tie(key, value, flag) = *data;
1111 if (flag != PhraseFlag::None) {
1114 maybeUnescapeValue(value);
1117 case ExtraDictState::Phrase:
1119 maybeUnescapeValue(value);
1120 if (!generate(value, key)) {
1125 if (value.empty() || key.empty()) {
// Existing entries are kept (updateExisting is false).
1128 auto entry = generateTableEntry(key, value);
1129 insertOrUpdateTrie(trie, index, entry,
false);
1134 throw std::invalid_argument(
"unknown format type");
1137 d->extraTries_.push_back(std::make_pair(std::move(trie), index));
1138 return d->extraTries_.size() - 1;
1141 void TableBasedDictionary::saveExtra(
size_t index,
const char *filename,
1142 TableFormat format) {
1143 std::ofstream fout(filename, std::ios::out | std::ios::binary);
1144 throw_if_io_fail(fout);
1145 saveExtra(index, fout, format);
// Saves the extra dictionary at position index to out.
// Binary: writes the extra-table magic number and version, then the trie
// through writeZSTDCompressed(). Text: writes the trie with saveTrieToText().
// Throws std::invalid_argument when index is out of range or the format is
// unknown.
1148 void TableBasedDictionary::saveExtra(
size_t index, std::ostream &out,
1149 TableFormat format) {
1151 if (index >= d->extraTries_.size()) {
1152 throw std::invalid_argument(
"Invalid extra dict index");
1155 case TableFormat::Binary:
1157 throw_if_io_fail(marshall(out, extraTableBinaryFormatMagic));
1158 throw_if_io_fail(marshall(out, extraTableBinaryFormatVersion));
1160 writeZSTDCompressed(out, [d, index](std::ostream &compressOut) {
1161 d->extraTries_[index].first.save(compressOut);
1164 case TableFormat::Text:
1165 saveTrieToText(d->extraTries_[index].first, out);
1168 throw std::invalid_argument(
"unknown format type");
// Discards every loaded extra dictionary.
1172 void TableBasedDictionary::removeAllExtra() {
1174 d->extraTries_.clear();
// Returns true when at least one table rule is loaded.
1177 bool TableBasedDictionary::hasRule() const noexcept {
1179 return !d->rules_.empty();
// Returns true when the prompt trie contains at least one entry.
1182 bool TableBasedDictionary::hasCustomPrompt() const noexcept {
1184 return !d->promptTrie_.empty();
// Searches the loaded rules for one whose name() equals name.
1187 const TableRule *TableBasedDictionary::findRule(std::string_view name)
const {
1189 for (
const auto &rule : d->rules_) {
1190 if (rule.name() == name) {
// Inserts value with a key derived from the table rules: when generate()
// produces a key, the pair is inserted via insert(key, value, flag). The
// guard at the top singles out flags other than None and User.
1197 bool TableBasedDictionary::insert(std::string_view value,
1198 libime::PhraseFlag flag) {
1200 if (flag != PhraseFlag::None && flag != PhraseFlag::User) {
1204 if (generate(value, key)) {
1205 return insert(key, value, flag);
// Inserts (key, value) according to flag, after validateKeyValue().
//
// - None / User / Pinyin: for non-Pinyin entries with verifyWithRule set and
//   rules present, the key generated from the rules is compared with key.
//   The entry is then stored through the private insert(). For None entries
//   whose value is a single character not listed in ignoreChars_, the
//   reverse-lookup tries are updated as well.
// - Prompt: stored in promptTrie_.
// - ConstructPhrase: with rules present and a single-character value, the
//   construct-phrase reverse lookup is updated.
// - Auto: looks up the entry's hit count in the auto-phrase dictionary; the
//   entry is either moved to the user dictionary (erased from the auto-phrase
//   dictionary and inserted as User without rule verification) or
//   inserted/refreshed in the auto-phrase dictionary.
//   NOTE(review): the move presumably happens once the hit count reaches
//   saveAutoPhraseAfter() - confirm against the full comparison.
1210 bool TableBasedDictionary::insert(std::string_view key, std::string_view value,
1211 PhraseFlag flag,
bool verifyWithRule) {
1214 if (!d->validateKeyValue(key, value, flag)) {
1219 case PhraseFlag::Pinyin:
1220 case PhraseFlag::User:
1221 case PhraseFlag::None: {
1222 if (flag != PhraseFlag::Pinyin && verifyWithRule && hasRule()) {
1223 std::string checkKey;
1224 if (!generate(value, checkKey)) {
1227 if (checkKey != key) {
1232 if (!d->insert(key, value, flag)) {
1236 if (flag == PhraseFlag::None && fcitx::utf8::length(value) == 1 &&
1237 !d->ignoreChars_.contains(fcitx::utf8::getChar(value))) {
1238 updateReverseLookupEntry(d->singleCharTrie_, key, value,
nullptr);
// Without a dedicated phrase key, plain single characters also feed the
// construct-phrase lookup.
1240 if (hasRule() && !d->phraseKey_) {
1241 updateReverseLookupEntry(d->singleCharConstTrie_, key, value,
1242 &d->singleCharLookupTrie_);
1247 case PhraseFlag::Prompt:
1249 d->promptTrie_.set(generateTableEntry(key, value), 0);
1254 case PhraseFlag::ConstructPhrase:
1255 if (hasRule() && fcitx::utf8::length(value) == 1) {
1256 updateReverseLookupEntry(d->singleCharConstTrie_, key, value,
1257 &d->singleCharLookupTrie_);
1260 case PhraseFlag::Auto: {
1261 const auto entry = generateTableEntry(key, value);
1262 auto hit = d->autoPhraseDict_.exactSearch(entry);
1263 if (tableOptions().saveAutoPhraseAfter() >= 1 &&
1264 static_cast<uint32_t>(tableOptions().saveAutoPhraseAfter()) <=
1266 d->autoPhraseDict_.erase(entry);
1267 insert(key, value, PhraseFlag::User,
false);
1269 d->autoPhraseDict_.insert(entry);
1272 case PhraseFlag::Invalid:
1278 bool TableBasedDictionary::generate(std::string_view value,
1279 std::string &key)
const {
1280 return generateWithHint(value, {}, key);
// Generates the code for value by applying the first applicable table rule.
//
// value:     the phrase to encode; must be valid UTF-8.
// codeHints: optional per-character codes; a non-empty hint is used instead
//            of the reverse lookup for that character. Must be valid UTF-8.
// key:       receives the generated code.
//
// A rule applies when its flag is LengthEqual and the phrase has exactly
// rule.phraseLength() characters, or LengthLongerThan and the phrase has at
// least that many. For each non-placeholder rule entry, the referenced
// character of value is located (counted from the front or the back), its
// code is taken from the hint or from reverseLookup(), and the code character
// selected by the rule entry's index is appended to the new key.
// NOTE(review): `hints` is declared both before the rule loop and again
// inside it; the inner declaration shadows the outer one, which then looks
// unused - confirm and drop the outer copy if so.
1283 bool TableBasedDictionary::generateWithHint(
1284 std::string_view value,
const std::vector<std::string> &codeHints,
1285 std::string &key)
const {
1287 if (!hasRule() || value.empty()) {
1292 auto valueLen = fcitx::utf8::lengthValidated(value);
1293 if (valueLen == fcitx::utf8::INVALID_LENGTH) {
1297 for (
const auto &code : codeHints) {
1298 if (!fcitx::utf8::validate(code)) {
1303 auto hints = codeHints;
1304 hints.resize(valueLen);
1307 for (
const auto &rule : d->rules_) {
// Decide from the phrase length whether this rule is applicable.
1309 const bool canApplyRule =
1310 ((rule.flag() == TableRuleFlag::LengthEqual &&
1311 valueLen == rule.phraseLength()) ||
1312 (rule.flag() == TableRuleFlag::LengthLongerThan &&
1313 valueLen >= rule.phraseLength()));
1314 if (!canApplyRule) {
// Work on a per-rule copy: validateHints() may blank individual hints.
1318 auto hints = codeHints;
1319 hints.resize(valueLen);
1321 if (!d->validateHints(hints, rule)) {
1325 bool success =
true;
// (character index, code index) pairs already consumed by this rule.
1326 std::set<std::pair<size_t, int>> usedChar;
1327 for (
const auto &ruleEntry : rule.entries()) {
1328 std::string_view::const_iterator iter;
1330 if (ruleEntry.isPlaceHolder()) {
1334 if (ruleEntry.character() > valueLen) {
// character() is 1-based; translate it to a 0-based index from either end.
1340 if (ruleEntry.flag() == TableRuleEntryFlag::FromFront) {
1341 index = ruleEntry.character() - 1;
1343 index = valueLen - ruleEntry.character();
// Extract the UTF-8 bytes of the selected character.
1345 iter = fcitx::utf8::nextNChar(value.begin(), index);
1346 std::string_view::iterator prev = iter;
1347 iter = fcitx::utf8::nextChar(iter);
1348 std::string_view chr(&*prev, std::distance(prev, iter));
// Prefer the caller's hint; otherwise look the code up.
1351 if (!hints[index].empty()) {
1352 entry = hints[index];
1354 entry = reverseLookup(chr, PhraseFlag::ConstructPhrase);
1356 if (entry.empty()) {
1361 auto length = fcitx::utf8::lengthValidated(entry);
1362 auto codeIndex = ruleEntry.index();
1363 if (length == fcitx::utf8::INVALID_LENGTH ||
1364 length < static_cast<size_t>(std::abs(codeIndex))) {
// Positive indices count from the front of the code; other values are
// rebased relative to the code's length.
1368 if (codeIndex > 0) {
1372 codeIndex =
static_cast<int>(length) + codeIndex;
1375 auto charIndex = std::make_pair(index, codeIndex);
1379 if (usedChar.contains(charIndex)) {
1382 usedChar.insert(charIndex);
1383 auto entryStart = fcitx::utf8::nextNChar(entry.begin(), codeIndex);
1384 auto entryEnd = fcitx::utf8::nextChar(entryStart);
1386 newKey.append(entryStart, entryEnd);
1389 if (success && !newKey.empty()) {
// Returns true when the code point c is one of the table's input codes.
1398 bool TableBasedDictionary::isInputCode(uint32_t c)
const {
1400 return !!(d->inputCode_.count(c));
// Decodes code one UTF-8 character at a time and tests each character for
// being a valid code point that is also an input code.
1403 bool TableBasedDictionary::isAllInputCode(std::string_view code)
const {
1404 std::string_view::iterator iter = code.begin();
1405 std::string_view::iterator end = code.end();
1406 while (iter != end) {
1408 iter = fcitx::utf8::getNextChar(iter, end, &chr);
1409 if (!fcitx::utf8::isValidChar(chr) || !isInputCode(chr)) {
// Reports whether the code point `c` is contained in the end-key collection of
// the current table options (d->options_.endKey()).
// NOTE(review): the statement that brings `d` into scope and the closing brace
// are not present in this excerpt.
1416 bool TableBasedDictionary::isEndKey(uint32_t c)
const {
1418 return d->options_.endKey().contains(c);
// Debug helper: writes the mem_size() of the internal tries to std::cout, one
// labelled line each for the phrase trie, the single-char trie, the
// single-char const trie together with the single-char lookup trie (printed
// as "a + b"), and the prompt trie.
// The object itself is not modified (const member).
1421 void TableBasedDictionary::statistic()
const {
1423 std::cout <<
"Phrase Trie: " << d->phraseTrie_.mem_size() <<
'\n' 1424 <<
"Single Char Trie: " << d->singleCharTrie_.mem_size() <<
'\n' 1425 <<
"Single char const trie: " 1426 << d->singleCharConstTrie_.mem_size() <<
" + " 1427 << d->singleCharLookupTrie_.mem_size() <<
'\n' 1428 <<
"Prompt Trie: " << d->promptTrie_.mem_size() <<
'\n';
// Installs a new option set and normalises it.
//
// `option` is taken by value and moved into d->options_. Afterwards:
//  - a negative autoSelectLength, noMatchAutoSelectLength or autoPhraseLength
//    is replaced by maxLength();
//  - both cached regex objects are cleared and then re-created from the
//    corresponding pattern string when that string is non-empty.
// NOTE(review): several lines around the two regex blocks are not visible in
// this excerpt (possibly error handling for invalid patterns) -- confirm
// against the full file.
1431 void TableBasedDictionary::setTableOptions(TableOptions option) {
1433 d->options_ = std::move(option);
// Negative lengths are replaced by the table's maximum code length.
1434 if (d->options_.autoSelectLength() < 0) {
1435 d->options_.setAutoSelectLength(maxLength());
1437 if (d->options_.noMatchAutoSelectLength() < 0) {
1438 d->options_.setNoMatchAutoSelectLength(maxLength());
1440 if (d->options_.autoPhraseLength() < 0) {
1441 d->options_.setAutoPhraseLength(maxLength());
// Drop regexes built from the previous options before rebuilding them.
1443 d->autoSelectRegex_.reset();
1444 d->noMatchAutoSelectRegex_.reset();
1446 if (!d->options_.autoSelectRegex().empty()) {
1447 d->autoSelectRegex_.emplace(d->options_.autoSelectRegex());
1452 if (!d->options_.noMatchAutoSelectRegex().empty()) {
1453 d->noMatchAutoSelectRegex_.emplace(
1454 d->options_.noMatchAutoSelectRegex());
// Read-only accessor: per its signature this returns a const reference to a
// TableOptions object.
// NOTE(review): the function body is not visible in this excerpt; presumably
// it returns d->options_ -- confirm against the full file.
1460 const TableOptions &TableBasedDictionary::tableOptions()
const {
// Returns d->pinyinKey_ converted to bool.
// NOTE(review): presumably a zero pinyin key means the table defines none --
// confirm where pinyinKey_ is assigned.
1465 bool TableBasedDictionary::hasPinyin()
const {
1467 return d->pinyinKey_;
// Returns the table's code length (d->codeLength_). setTableOptions() uses
// this value as the replacement for negative length options.
1470 uint32_t TableBasedDictionary::maxLength()
const {
1472 return d->codeLength_;
// Reports whether `length` does not exceed the table's code length
// (d->codeLength_); a length equal to the code length is accepted.
1475 bool TableBasedDictionary::isValidLength(
size_t length)
const {
1477 return length <= d->codeLength_;
// Public matching entry point: forwards `code`, `mode` and `callback` to
// d->matchWordsInternal() and returns its result unchanged.
// NOTE(review): the third argument is hard-coded to `false` here, while
// hasMatchingWords(code) passes `true`; its meaning is defined by
// matchWordsInternal, which is outside this excerpt.
1479 bool TableBasedDictionary::matchWords(
1480 std::string_view code, TableMatchMode mode,
1481 const TableMatchCallback &callback)
const {
1483 return d->matchWordsInternal(code, mode,
false, callback);
// Convenience overload: concatenates `code` and `next` into a temporary
// std::string and delegates to the single-argument hasMatchingWords().
1486 bool TableBasedDictionary::hasMatchingWords(std::string_view code,
1487 std::string_view next)
const {
1488 std::string str{code};
1489 str.append(next.data(), next.size());
1490 return hasMatchingWords(str);
// Runs a prefix-mode lookup for `code` through d->matchWordsInternal(), with
// `true` as the third argument and a callback that captures the local flag
// `hasMatch` by reference.
// NOTE(review): the callback body and the function's return statement are not
// visible in this excerpt; presumably the callback sets `hasMatch` and stops
// the enumeration, and `hasMatch` is returned -- confirm against the full
// file.
1493 bool TableBasedDictionary::hasMatchingWords(std::string_view code)
const {
1495 bool hasMatch =
false;
1496 d->matchWordsInternal(
1497 code, TableMatchMode::Prefix,
true,
1498 [&hasMatch](std::string_view, std::string_view, uint32_t, PhraseFlag) {
// Prefix-matches `code` through matchWords() and reports, via
// previousMatch.has_value(), whether a match is still recorded once the
// enumeration has finished.
//
// The callback keeps a (code, word) pair in `previousMatch`. When a pair is
// already stored, the new hit is compared against it; the visible statements
// then either clear the stored pair (reset) or record the new hit (emplace).
// NOTE(review): the return statements and closing braces inside the callback
// are not visible in this excerpt, so the exact stop/continue behaviour
// should be confirmed against the full file.
1505 bool TableBasedDictionary::hasOneMatchingWord(std::string_view code)
const {
1507 std::optional<std::tuple<std::string, std::string>> previousMatch;
// Note: the callback parameter `code` shadows the function parameter.
1508 matchWords(code, TableMatchMode::Prefix,
1509 [&previousMatch](std::string_view code, std::string_view word,
1510 uint32_t, PhraseFlag) {
1511 if (previousMatch) {
1512 if (std::get<0>(*previousMatch) == code &&
1513 std::get<1>(*previousMatch) == word) {
1516 previousMatch.reset();
1519 previousMatch.emplace(code, word);
1522 return previousMatch.has_value();
// Classifies where the exact (code, word) pair is stored.
//
// The pair is first turned into a single lookup key with
// generateTableEntry(code, word). The stores are then consulted in this
// order, and the first hit decides the result:
//   1. user trie                                   -> PhraseFlag::User
//   2. phrase trie, and not in the deletion trie   -> PhraseFlag::None
//   3. auto-phrase dictionary                      -> PhraseFlag::Auto
//   otherwise                                      -> PhraseFlag::Invalid
1525 PhraseFlag TableBasedDictionary::wordExists(std::string_view code,
1526 std::string_view word)
const {
1528 auto entry = generateTableEntry(code, word);
1530 if (d->userTrie_.hasExactMatch(entry)) {
1531 return PhraseFlag::User;
// An entry from the phrase trie only counts while it is not in deletionTrie_.
1533 if (d->hasExactMatchInPhraseTrie(entry) &&
1534 !d->deletionTrie_.hasExactMatch(entry)) {
1535 return PhraseFlag::None;
1538 if (d->autoPhraseDict_.exactSearch(entry)) {
1539 return PhraseFlag::Auto;
1541 return PhraseFlag::Invalid;
// Removes the (code, word) pair from the dictionary's stores.
//
// The lookup key is built with generateTableEntry(code, word). The key is
// erased from the auto-phrase dictionary and from the user trie. If the key
// is found in the phrase trie and is not yet in the deletion trie, it is
// added to the deletion trie (with value 0) rather than erased from the
// phrase trie.
1544 void TableBasedDictionary::removeWord(std::string_view code,
1545 std::string_view word) {
1547 auto entry = generateTableEntry(code, word);
1548 d->autoPhraseDict_.erase(entry);
1549 d->userTrie_.erase(entry);
// Skip the insert when the entry is already in deletionTrie_.
1550 if (d->hasExactMatchInPhraseTrie(entry) &&
1551 !d->deletionTrie_.hasExactMatch(entry)) {
1552 d->deletionTrie_.set(entry, 0);
// Looks up `word` in one of the single-character tries.
//
// Only PhraseFlag::ConstructPhrase and PhraseFlag::None are accepted; any
// other flag throws std::runtime_error("Invalid flag."). The search prefix is
// `word` followed by keyValueSeparator. ConstructPhrase selects
// d->singleCharConstTrie_, otherwise d->singleCharTrie_ is used. The visible
// callback copies the matched suffix into `key` via trie.suffix().
// NOTE(review): the declarations of `trie` and `key`, the enumeration call
// that receives the callback, and the return statement are not visible in
// this excerpt; presumably `key` is what gets returned -- confirm against the
// full file.
1556 std::string TableBasedDictionary::reverseLookup(std::string_view word,
1557 PhraseFlag flag)
const {
1559 if (flag != PhraseFlag::ConstructPhrase && flag != PhraseFlag::None) {
1560 throw std::runtime_error(
"Invalid flag.");
// Search prefix: the word followed by the key/value separator byte.
1562 std::string reverseEntry{word};
1563 reverseEntry.push_back(keyValueSeparator);
1566 (flag == PhraseFlag::ConstructPhrase ? d->singleCharConstTrie_
1567 : d->singleCharTrie_);
1570 [&trie, &key](int32_t,
size_t len, DATrie<int32_t>::position_type pos) {
1571 trie.suffix(key, len, pos);
// Builds a hint string for `key`.
//
// If d->promptKey_ is unset (falsy), `key` is returned unchanged. Otherwise
// `key` is walked one UTF-8 character at a time. For each character the
// prompt trie is enumerated with the prefix generateTableEntry(search, ""),
// and the callback stores the matched suffix in `entry`. A non-empty `entry`
// is appended to `result`; the append of the character's original bytes
// (charRange) is also visible.
// NOTE(review): the declarations of `result`, `search` and `entry`, the
// `else` that presumably guards the raw-character append, and the final
// return are not visible in this excerpt -- confirm against the full file.
1577 std::string TableBasedDictionary::hint(std::string_view key)
const {
1579 if (!d->promptKey_) {
1580 return std::string{key};
1584 auto range = fcitx::utf8::MakeUTF8CharRange(key);
1585 for (
auto iter = std::begin(range); iter != std::end(range); iter++) {
// charRange is the [first, second) byte range of the current character.
1586 auto charRange = iter.charRange();
1589 std::distance(charRange.first, charRange.second));
1591 d->promptTrie_.foreach(
1592 generateTableEntry(
search,
""),
1593 [&entry, d](uint32_t,
size_t len,
1594 DATrie<uint32_t>::position_type pos) {
1595 d->promptTrie_.suffix(entry, len, pos);
1598 if (!entry.empty()) {
1599 result.append(entry);
1601 result.append(charRange.first, charRange.second);
1607 void TableBasedDictionary::matchPrefixImpl(
1608 const SegmentGraph &graph,
const GraphMatchCallback &callback,
1609 const std::unordered_set<const SegmentGraphNode *> &ignore,
1612 auto range = fcitx::utf8::MakeUTF8CharRange(graph.data());
1614 d->options_.matchingKey() &&
1615 std::any_of(std::begin(range), std::end(range),
1616 [d](uint32_t c) {
return d->options_.matchingKey() == c; });
1618 const TableMatchMode mode = tableOptions().exactMatch() || hasWildcard
1619 ? TableMatchMode::Exact
1620 : TableMatchMode::Prefix;
1621 SegmentGraphPath path;
1623 graph.bfs(&graph.start(), [
this, &ignore, &path, &callback, hasWildcard,
1624 mode](
const SegmentGraphBase &graph,
1625 const SegmentGraphNode *node) {
1626 if (!node->prevSize() || ignore.contains(node)) {
1629 for (
const auto &prev : node->prevs()) {
1631 path.push_back(&prev);
1632 path.push_back(node);
1634 auto code = graph.segment(*path[0], *path[1]);
1635 if (code.size() == graph.size()) {
1638 [&](std::string_view code, std::string_view word,
1639 uint32_t index, PhraseFlag flag) {
1643 if (flag == PhraseFlag::User &&
1644 code.size() <= tableOptions().noSortInputLength()) {
1648 WordNode wordNode(word, InvalidWordIndex);
1652 if (flag == PhraseFlag::Pinyin && graph.size() == 1 &&
1656 callback(path, wordNode, 0,
1657 std::make_unique<TableLatticeNodePrivate>(
1658 code, index, flag));
1661 }
else if (!hasWildcard) {
1665 d->singleCharLookupTrie_.foreach(
1666 code, [&](uint32_t,
size_t len,
1667 DATrie<uint32_t>::position_type pos) {
1668 d->singleCharLookupTrie_.suffix(entry,
1669 code.size() + len, pos);
1671 auto sep = entry.find(keyValueSeparator);
1672 if (sep == std::string::npos) {
1676 std::string_view ref(entry);
1677 auto code = ref.substr(0, sep);
1678 auto word = ref.substr(sep + 1);
1680 WordNode wordNode(word, InvalidWordIndex);
1681 callback(path, wordNode, 0,
1682 std::make_unique<TableLatticeNodePrivate>(
1683 code, 0, PhraseFlag::ConstructPhrase));
void insert(const std::string &entry, uint32_t value=0)
Insert a word into dictionary and refresh the MRU.
Provide a DATrie implementation.
bool search(std::string_view s, const std::function< bool(std::string_view, uint32_t)> &callback) const
Check if any word starting with s exists in the dictionary.