7 #include "pinyindictionary.h" 25 #include <string_view> 27 #include <unordered_set> 30 #include <boost/container_hash/hash.hpp> 31 #include <fcitx-utils/macros.h> 32 #include <fcitx-utils/signals.h> 33 #include <fcitx-utils/stringutils.h> 35 #include "libime/core/dictionary.h" 36 #include "libime/core/languagemodel.h" 37 #include "libime/core/lattice.h" 38 #include "libime/core/lrucache.h" 39 #include "libime/core/segmentgraph.h" 40 #include "libime/core/triedictionary.h" 41 #include "libime/core/utils.h" 42 #include "libime/core/utils_p.h" 43 #include "libime/core/zstdfilter.h" 44 #include "libime/pinyin/pinyinmatchstate.h" 45 #include "constants.h" 46 #include "pinyindecoder_p.h" 47 #include "pinyinencoder.h" 48 #include "pinyinmatchstate_p.h" 53 const float fuzzyCost = std::log10(0.5F);
// Lower bound combined (via std::max) with the configured partial long-word
// limit in matchWordsForOnePath.
54 const size_t minimumLongWordLength = 3;
// Cost handed to the match callback for the fallback word emitted by
// findMatchesBetween when no dictionary word matched.
55 const float invalidPinyinCost = -100.0F;
// Byte stored between the encoded pinyin and the hanzi in every trie key.
56 const char pinyinHanziSep =
'!';
// Header fields of the binary dictionary format: written by save() and
// validated by loadBinaryImpl().
58 constexpr uint32_t pinyinBinaryFormatMagic = 0x000fc613;
59 constexpr uint32_t pinyinBinaryFormatVersion = 0x2;
// Functor that treats a SegmentGraphPath as the text it spans in the graph.
// It offers three operations over the same textual form: building the key
// string, hashing a path, and comparing a path against a key string. It is
// passed as both hasher and comparator to the caches' find() calls.
61 struct PinyinSegmentGraphPathHasher {
62 PinyinSegmentGraphPathHasher(
const SegmentGraph &graph) : graph_(graph) {}
// Builds the key string for a path: the characters of each segment are
// copied from graph_.data() and '|' is pushed after a segment's characters.
// NOTE(review): the branch taken when a segment starts with '\'' is not
// fully visible here — presumably the apostrophe is skipped; confirm.
66 std::string pathToPinyins(
const SegmentGraphPath &path)
const {
// Reserve room for the spanned characters plus per-node separators.
68 result.reserve(path.size() + path.back()->index() -
69 path.front()->index() + 1);
70 const auto &data = graph_.data();
71 auto iter = path.begin();
// Walk consecutive node pairs; each pair delimits one segment.
72 while (iter + 1 < path.end()) {
73 auto begin = (*iter)->index();
74 auto end = (*std::next(iter))->index();
76 if (data[begin] ==
'\'') {
80 result.push_back(data[begin]);
83 result.push_back(
'|');
// Hashes a path by combining the hash of every spanned character and of a
// '|' per segment, mirroring the layout produced by pathToPinyins().
89 size_t operator()(
const SegmentGraphPath &path)
const {
// A path with at most one node spans no segment and exits early (the
// returned value is not visible in this view).
90 if (path.size() <= 1) {
93 boost::hash<char> hasher;
96 const auto &data = graph_.data();
97 auto iter = path.begin();
98 while (iter + 1 < path.end()) {
99 auto begin = (*iter)->index();
100 auto end = (*std::next(iter))->index();
102 if (data[begin] ==
'\'') {
105 while (begin < end) {
106 boost::hash_combine(seed, hasher(data[begin]));
109 boost::hash_combine(seed, hasher(
'|'));
// Equality between a path and a key string: walks the path's characters
// and the string in lock step without materialising the path's key.
117 bool operator()(
const SegmentGraphPath &path,
const std::string &s)
const {
118 if (path.size() <= 1) {
122 const auto &data = graph_.data();
123 auto iter = path.begin();
124 while (iter + 1 < path.end() && is != s.end()) {
125 auto begin = (*iter)->index();
126 auto end = (*std::next(iter))->index();
128 if (data[begin] ==
'\'') {
131 while (begin < end && is != s.end()) {
132 if (*is != data[begin]) {
// After a segment's characters the key must contain the '|' terminator.
142 if (is == s.end() || *is !=
'|') {
// Equal only when both the path and the string are fully consumed.
147 return iter + 1 == path.end() && is == s.end();
// Borrowed reference: the graph must outlive this functor.
151 const SegmentGraph &graph_;
// Comparator over node pointers: true when lhs has the larger index().
// NOTE(review): presumably the ordering used by the node queue in
// matchPrefixImpl so that lower-index nodes come out first — confirm.
154 struct SegmentGraphNodeGreater {
155 bool operator()(
const SegmentGraphNode *lhs,
156 const SegmentGraphNode *rhs)
const {
157 return lhs->index() > rhs->index();
// Checks whether `node` has exactly one predecessor and the segment between
// that predecessor and `node` starts with an apostrophe.
// Callers treat a non-null result as the node to prepend to a path; the
// return statements themselves are not visible in this view.
163 const SegmentGraphNode *prevIsSeparator(
const SegmentGraph &graph,
164 const SegmentGraphNode &node) {
165 if (node.prevSize() == 1) {
166 const auto range = node.prevs();
167 const auto &prev = range.front();
168 auto pinyin = graph.segment(prev, node);
169 if (pinyin.starts_with(
"\'")) {
// Advances every (trie, position) cursor in `nodes` by one character.
// Cursors are traversed with traverseRaw(); the handling of a cursor with no
// path is not visible here (presumably it is erased from the list — confirm).
// NOTE(review): "¤t" below looks like a mis-decoded "&current" — confirm
// against the upstream file.
176 inline void searchOneStep(
177 std::list<std::pair<const PinyinTrie *, PinyinTrie::position_type>> &nodes,
179 std::list<std::pair<const PinyinTrie *, PinyinTrie::position_type>>
181 auto iter = nodes.begin();
182 while (iter != nodes.end()) {
184 const auto resultRaw =
185 iter->first->traverseRaw(¤t, 1, iter->second);
187 if (PinyinTrie::isNoPathRaw(resultRaw)) {
193 bool changed =
false;
// Expansion branch (its guarding condition is not visible): try every
// final from firstFinal to lastFinal on a copy of the cursor and keep the
// copies that still have a path.
194 for (
char test = PinyinEncoder::firstFinal;
195 test <= PinyinEncoder::lastFinal; test++) {
196 decltype(extraNodes)::value_type p = *iter;
197 const auto resultRaw = p.first->traverseRaw(&test, 1, p.second);
198 if (!PinyinTrie::isNoPathRaw(resultRaw)) {
199 extraNodes.push_back(p);
// Reuse the current list slot for one of the expanded cursors.
204 *iter = extraNodes.back();
205 extraNodes.pop_back();
// Remaining expanded cursors are appended once iteration is finished.
212 nodes.splice(nodes.end(), std::move(extraNodes));
// Converts a set of fuzzy flags into a numeric factor.
// Correction and AdvancedTypo each add their dedicated constant and are then
// cleared from the local copy of `flags`; how the remaining flags contribute
// is not visible in this view.
// NOTE(review): PINYIN_ADVACNED_TYPO_FUZZY_FACTOR is spelled "ADVACNED"; the
// name is defined outside this file's visible part, so it is left as is.
215 size_t fuzzyFactor(PinyinFuzzyFlags flags) {
217 if (flags.test(PinyinFuzzyFlag::Correction)) {
218 flags = flags.unset(PinyinFuzzyFlag::Correction);
219 factor += PINYIN_CORRECTION_FUZZY_FACTOR;
221 if (flags.test(PinyinFuzzyFlag::AdvancedTypo)) {
222 flags = flags.unset(PinyinFuzzyFlag::AdvancedTypo);
223 factor += PINYIN_ADVACNED_TYPO_FUZZY_FACTOR;
// Parses the text dictionary format from `in` into a fresh trie.
// Each line is split on whitespace (values may be escaped) into
// "hanzi pinyin [prob]"; lines with two or three tokens are stored under the
// key <encoded pinyin> + pinyinHanziSep + <hanzi> with `prob` as the value.
231 PinyinDictionary::TrieType loadTextImpl(std::istream &in) {
232 PinyinDictionary::TrieType trie;
237 if (!std::getline(in, lineBuf)) {
242 std::string_view line = lineBuf;
243 std::vector<std::string> tokens;
244 while (!line.empty()) {
246 auto consumed = fcitx::stringutils::consumeMaybeEscapedValue(
247 line, FCITX_WHITESPACE, &token);
248 if (!consumed.empty()) {
249 tokens.push_back(std::string(token));
252 if (tokens.size() == 3 || tokens.size() == 2) {
253 const std::string &hanzi = tokens[0];
254 std::string_view pinyin = tokens[1];
// The third token is optional; the value used when it is absent is
// initialised on a line not visible here.
258 if (tokens.size() == 3) {
259 prob = std::stof(tokens[2]);
262 pinyin, PinyinFuzzyFlag::VE_UE);
263 result.push_back(pinyinHanziSep);
264 result.insert(result.end(), hanzi.begin(), hanzi.end());
265 trie.set(result.data(), result.size(), prob);
// A line that raises std::invalid_argument is logged and skipped.
// NOTE(review): std::stof can also throw std::out_of_range, which this
// visible handler does not catch — confirm whether that is intended.
266 }
catch (
const std::invalid_argument &e) {
268 <<
"Skipped line " << lineNo <<
", exception: " << e.what();
// Reads the binary dictionary format from `in` into a fresh trie.
// Layout checked here: a magic number, then a version, then the trie payload.
// Throws std::invalid_argument on a wrong magic or an unsupported version;
// I/O failures go through throw_if_io_fail.
275 PinyinDictionary::TrieType loadBinaryImpl(std::istream &in) {
276 PinyinDictionary::TrieType trie;
278 uint32_t version = 0;
279 throw_if_io_fail(unmarshall(in, magic));
280 if (magic != pinyinBinaryFormatMagic) {
281 throw std::invalid_argument(
"Invalid pinyin magic.");
283 throw_if_io_fail(unmarshall(in, version));
// For the current version the trie is loaded through a wrapped stream
// (the wrapper's name is on a line not visible here; save() writes this
// payload with writeZSTDCompressed).
288 case pinyinBinaryFormatVersion:
290 in, [&trie](std::istream &compressIn) { trie.load(compressIn); });
293 throw std::invalid_argument(
"Invalid pinyin version.");
// Per-call matching context (the class header is not visible in this view).
// First constructor: binds to a match state and takes the matched-path map,
// both caches, the fuzzy flags, the profiles and the long-word limit from it.
304 const SegmentGraph &graph,
const GraphMatchCallback &callback,
305 const std::unordered_set<const SegmentGraphNode *> &ignore,
307 : graph_(graph), hasher_(graph), callback_(callback), ignore_(ignore),
308 matchedPathsMap_(&matchState->d_func()->matchedPaths_),
309 nodeCacheMap_(&matchState->d_func()->nodeCacheMap_),
310 matchCacheMap_(&matchState->d_func()->matchCacheMap_),
311 flags_(matchState->fuzzyFlags()),
312 spProfile_(matchState->shuangpinProfile()),
313 correctionProfile_(matchState->correctionProfile()),
314 partialLongWordLimit_(matchState->partialLongWordLimit()) {}
// Second constructor: no match state. Only the caller-provided matched-path
// map is used; the caches stay null and the other members keep their
// in-class defaults.
317 const SegmentGraph &graph,
const GraphMatchCallback &callback,
318 const std::unordered_set<const SegmentGraphNode *> &ignore,
319 NodeToMatchedPinyinPathsMap &matchedPaths)
320 : graph_(graph), hasher_(graph), callback_(callback), ignore_(ignore),
321 matchedPathsMap_(&matchedPaths) {}
326 PinyinSegmentGraphPathHasher hasher_;
// callback_ and ignore_ are stored as references: the referenced objects
// must outlive this context.
328 const GraphMatchCallback &callback_;
329 const std::unordered_set<const SegmentGraphNode *> &ignore_;
330 NodeToMatchedPinyinPathsMap *matchedPathsMap_;
// Null caches mean "no caching"; callers test these pointers before use.
331 PinyinTrieNodeCache *nodeCacheMap_ =
nullptr;
332 PinyinMatchResultCache *matchCacheMap_ =
nullptr;
333 PinyinFuzzyFlags flags_{PinyinFuzzyFlag::None};
334 std::shared_ptr<const ShuangpinProfile> spProfile_;
335 std::shared_ptr<const PinyinCorrectionProfile> correctionProfile_;
// Zero disables partial long-word matching (see matchWordsForOnePath).
336 size_t partialLongWordLimit_ = 0;
// Private implementation of PinyinDictionary (class header and most
// declaration lines are not visible in this view; only the trailing parts of
// the member-function declarations appear below).
// NOTE(review): "¤tMatches" looks like a mis-decoded "&currentMatches" —
// confirm against the upstream file.
342 : fcitx::QPtrHolder<PinyinDictionary>(q) {}
346 MatchedPinyinPaths ¤tMatches)
const;
351 MatchedPinyinPaths ¤tMatches)
const;
354 const MatchedPinyinPaths &newPaths)
const;
356 const MatchedPinyinPath &path)
const;
// Connection to the dictSizeChanged signal set up in the PinyinDictionary
// constructor.
361 fcitx::ScopedConnection conn_;
// One flag set per dictionary, indexed like the tries.
362 std::vector<PinyinDictFlags> flags_;
// Seeds `currentMatches` with an empty match starting at currentNode for each
// usable dictionary, so that words may begin at this node.
// Nothing is added when currentNode is the graph end or when the character at
// currentNode is an apostrophe.
// NOTE(review): "¤t..." below looks like a mis-decoded "&current..." —
// confirm against the upstream file.
365 void PinyinDictionaryPrivate::addEmptyMatch(
367 MatchedPinyinPaths ¤tMatches)
const {
371 if (¤tNode != &graph.end() &&
372 !graph.segment(currentNode.index(), currentNode.index() + 1)
373 .starts_with(
"\'")) {
374 SegmentGraphPath vec;
// When the node is reached through a separator, the handling of `prev`
// is on lines not visible here (presumably it is put in front of the
// path — confirm).
375 if (
const auto *prev = prevIsSeparator(graph, currentNode)) {
379 vec.push_back(¤tNode);
380 for (
size_t i = 0; i < q->dictSize(); i++) {
// Two filters; their bodies are not visible (presumably `continue`):
// FullMatch dictionaries unless currentNode is the graph start, and
// Disabled dictionaries.
381 if (flags_[i].test(PinyinDictFlag::FullMatch) &&
382 ¤tNode != &graph.start()) {
385 if (flags_[i].test(PinyinDictFlag::Disabled)) {
388 const auto &trie = *q->trie(i);
// Empty match: size 0, with one trie position entry (0, 0).
389 currentMatches.emplace_back(&trie, 0, vec, flags_[i]);
390 currentMatches.back().triePositions().emplace_back(0, 0);
// Extends every trie position of `path` by one syllable.
// For each (position, fuzzies) pair and each candidate syllable, the initial
// is traversed first and then each candidate final; every surviving position
// is collected together with its accumulated fuzzy count.
395 PinyinTriePositions traverseAlongPathOneStepBySyllables(
396 const MatchedPinyinPath &path,
397 const MatchedPinyinSyllablesWithFuzzyFlags &syls) {
398 PinyinTriePositions positions;
399 for (
const auto &pr : path.triePositions()) {
402 std::tie(_pos, fuzzies) = pr;
403 for (
const auto &syl : syls) {
406 auto initial =
static_cast<char>(syl.first);
407 const auto resultRaw = path.trie()->traverseRaw(&initial, 1, pos);
408 if (PinyinTrie::isNoPathRaw(resultRaw)) {
411 const auto &finals = syl.second;
// Helper: traverse one final and record the new position with the
// fuzzy count increased by the given factor.
413 auto updateNext = [fuzzies, &path, &positions](PinyinFinal pyFinal,
416 auto final =
static_cast<char>(pyFinal);
417 const auto resultRaw = path.trie()->traverseRaw(&
final, 1, pos);
419 if (!PinyinTrie::isNoPathRaw(resultRaw)) {
420 size_t newFuzzies = fuzzies + fuzzyFactor;
421 positions.emplace_back(pos, newFuzzies);
// Explicit finals are present unless the list is the single entry
// PinyinFinal::Invalid.
424 if (finals.size() > 1 || finals[0].first != PinyinFinal::Invalid) {
425 for (
auto final : finals) {
426 updateNext(
final.first, fuzzyFactor(
final.second), pos);
// No final given: for dictionaries that are not FullMatch, try every
// final with a fuzzy factor of 1.
428 }
else if (!path.flags_.test(PinyinDictFlag::FullMatch)) {
429 for (
char test = PinyinEncoder::firstFinal;
430 test <= PinyinEncoder::lastFinal; test++) {
431 updateNext(static_cast<PinyinFinal>(test), 1, pos);
// Enumerates the words reachable from each trie position of `path` and
// reports them through callback(encodedPinyin, hanzi, cost, ...).
// Trie keys are <encoded pinyin, 2 bytes per syllable> + pinyinHanziSep +
// <hanzi>, hence the repeated path.size() * 2 offsets below.
439 template <
typename T>
440 void matchWordsOnTrie(
const PinyinTrie *userDict,
const MatchedPinyinPath &path,
441 bool matchLongWord,
const T &callback) {
442 for (
const auto &pr : path.triePositions()) {
445 std::tie(pos, fuzzies) = pr;
// Every fuzzy step adds fuzzyCost to the word's cost.
446 const float extraCost = fuzzies * fuzzyCost;
450 const bool isCorrection = fuzzies >= PINYIN_CORRECTION_FUZZY_FACTOR;
// First enumeration: keys that may carry more syllables than the path
// (the condition guarding it is not visible; presumably matchLongWord).
452 path.trie()->foreach(
453 [userDict, &path, &callback, extraCost, isCorrection](
454 PinyinTrie::value_type value,
size_t len, uint64_t pos) {
456 s.reserve(len + (path.size() * 2));
457 path.trie()->suffix(s, len + (path.size() * 2), pos);
// Search for the separator only after the already matched syllables.
458 if (
size_t separator =
459 s.find(pinyinHanziSep, path.size() * 2);
460 separator != std::string::npos) {
461 std::string_view view(s);
462 auto encodedPinyin = view.substr(0, separator);
463 auto hanzi = view.substr(separator + 1);
// Number of syllables in the key beyond those typed so far.
464 const size_t lengthDiff =
465 ((encodedPinyin.size() / 2) - path.size());
467 if (path.trie() == userDict && value < 0 &&
// Each extra syllable is charged one fuzzyCost.
471 float overLengthCost = fuzzyCost * lengthDiff;
473 callback(encodedPinyin, hanzi,
474 value + extraCost + overLengthCost,
// Second enumeration: words whose pinyin has exactly the path's length.
// Step over the separator first; the handling of a missing path is not
// visible here.
481 const char sep = pinyinHanziSep;
482 const auto resultRaw = path.trie()->traverseRaw(&sep, 1, pos);
483 if (PinyinTrie::isNoPathRaw(resultRaw)) {
487 path.trie()->foreach(
488 [&path, &callback, extraCost, isCorrection](
489 PinyinTrie::value_type value,
size_t len, uint64_t pos) {
491 s.reserve(len + (path.size() * 2) + 1);
492 path.trie()->suffix(s, len + (path.size() * 2) + 1, pos);
493 std::string_view view(s);
494 auto encodedPinyin = view.substr(0, path.size() * 2);
495 auto hanzi = view.substr((path.size() * 2) + 1);
496 callback(encodedPinyin, hanzi, value + extraCost,
// Reports every word matching one pinyin path through context.callback_ and
// tracks in `matched` whether a qualifying word was found (the final return
// is not visible in this view).
505 bool PinyinDictionaryPrivate::matchWordsForOnePath(
508 bool matched =
false;
509 assert(path.path_.size() >= 2);
// FullMatch dictionaries only apply to a path covering the whole graph;
// the body of this check is not visible (presumably an early return).
512 if (path.flags_.test(PinyinDictFlag::FullMatch) &&
513 (path.path_.front() != &context.graph_.start() ||
514 path.path_.back() != &context.graph_.end())) {
// Long-word matching requires a non-zero limit, a long enough quantity
// (its operand is on a line not visible here) and a non-FullMatch dict.
519 const bool matchLongWordEnabled =
520 context.partialLongWordLimit_ &&
521 std::max(minimumLongWordLength, context.partialLongWordLimit_) + 1 <=
523 !path.flags_.test(PinyinDictFlag::FullMatch);
// Long words are only reported for paths that end at the graph end.
525 const bool matchLongWord =
526 (path.path_.back() == &context.graph_.end() && matchLongWordEnabled);
// Forwards one word to the user callback with a freshly allocated
// PinyinLatticeNodePrivate carrying the encoded pinyin.
528 auto foundOneWord = [&path, &prevNode, &matched, &context](
529 std::string_view encodedPinyin,
WordNode &word,
530 float cost,
bool isCorrection) {
531 context.callback_(path.path_, word, cost,
532 std::make_unique<PinyinLatticeNodePrivate>(
533 encodedPinyin, isCorrection));
// Single-syllable path whose second-to-last node is prevNode; the body
// is not visible (presumably it sets `matched`).
534 if (path.size() == 1 &&
535 path.path_[path.path_.size() - 2] == &prevNode) {
// Cached branch: results are looked up per trie, keyed by the path text.
540 if (context.matchCacheMap_) {
541 auto &matchCache = (*context.matchCacheMap_)[path.trie()];
543 matchCache.find(path.path_, context.hasher_, context.hasher_);
546 matchCache.insert(context.hasher_.pathToPinyins(path.path_));
549 auto &items = *result;
// The cache is filled using matchLongWordEnabled (not matchLongWord), so
// cached entries can include long words; they are filtered on replay.
551 q->trie(PinyinDictionary::UserDict), path, matchLongWordEnabled,
552 [&items](std::string_view encodedPinyin, std::string_view hanzi,
553 float cost,
bool isCorrection) {
554 items.emplace_back(hanzi, cost, encodedPinyin,
558 for (
auto &item : *result) {
// Filter cached long words when long-word matching does not apply to
// this path (the body is not visible; presumably `continue`).
559 if (!matchLongWord &&
560 item.encodedPinyin_.size() / 2 > path.size()) {
563 foundOneWord(item.encodedPinyin_, item.word_, item.value_,
// Uncached branch: enumerate the trie directly and report each word.
568 q->trie(PinyinDictionary::UserDict), path, matchLongWord,
569 [&foundOneWord](std::string_view encodedPinyin,
570 std::string_view hanzi,
float cost,
572 WordNode word(hanzi, InvalidWordIndex);
573 foundOneWord(encodedPinyin, word, cost, isCorrection);
// Runs matchWordsForOnePath on every path in `newPaths` and ORs the results
// into `matched`. The non-short-circuiting |= means every path is processed
// even after the first match.
580 bool PinyinDictionaryPrivate::matchWords(
582 const MatchedPinyinPaths &newPaths)
const {
583 bool matched =
false;
584 for (
const auto &path : newPaths) {
585 matched |= matchWordsForOnePath(context, path);
// Extends all matches that end at prevNode across the segment
// prevNode -> currentNode, reports matching words, and appends the extended
// paths to `currentMatches`.
// NOTE(review): "¤t..." below looks like a mis-decoded "&current..." —
// confirm against the upstream file.
591 void PinyinDictionaryPrivate::findMatchesBetween(
594 MatchedPinyinPaths ¤tMatches)
const {
596 auto &matchedPathsMap = *context.matchedPathsMap_;
597 auto pinyin = graph.segment(prevNode, currentNode);
// Separator segment: each previous match is carried over with currentNode
// appended and its existing result_ reused.
600 if (pinyin.starts_with(
"\'")) {
601 const auto &prevMatches = matchedPathsMap[&prevNode];
602 for (
const auto &match : prevMatches) {
604 auto path = match.path_;
605 path.push_back(¤tNode);
606 currentMatches.emplace_back(match.result_, std::move(path),
// A separator that ends the graph is reported with cost 0 and no
// lattice data.
610 if (¤tNode == &graph.end()) {
612 context.callback_({&prevNode, ¤tNode}, word, 0,
nullptr);
// Syllable candidates come from the shuangpin parser or the full-pinyin
// parser (the selecting condition is not visible; presumably whether
// spProfile_ is set).
619 ? PinyinEncoder::shuangpinToSyllablesWithFuzzyFlags(
620 pinyin, *context.spProfile_, context.flags_)
621 : PinyinEncoder::stringToSyllablesWithFuzzyFlags(
622 pinyin, context.correctionProfile_.get(), context.flags_);
623 const MatchedPinyinPaths &prevMatchedPaths = matchedPathsMap[&prevNode];
624 MatchedPinyinPaths newPaths;
625 for (
const auto &path : prevMatchedPaths) {
627 auto segmentPath = path.path_;
628 segmentPath.push_back(¤tNode);
// Cached branch: trie positions for the extended path are looked up per
// trie, keyed by the path text, and computed on a miss.
631 if (context.nodeCacheMap_) {
632 auto &nodeCache = (*context.nodeCacheMap_)[path.trie()];
634 nodeCache.find(segmentPath, context.hasher_, context.hasher_);
635 std::shared_ptr<MatchedPinyinTrieNodes> result;
637 result = std::make_shared<MatchedPinyinTrieNodes>(
638 path.trie(), path.size() + 1);
639 nodeCache.insert(context.hasher_.pathToPinyins(segmentPath),
641 result->triePositions_ =
642 traverseAlongPathOneStepBySyllables(path, syls);
// A cache entry must describe a path exactly one syllable longer.
645 assert(result->size_ == path.size() + 1);
// Only paths that still have a trie position are kept.
648 if (!result->triePositions_.empty()) {
649 newPaths.emplace_back(result, segmentPath, path.flags_);
// Uncached branch: build the extended path and compute its positions.
653 newPaths.emplace_back(path.trie(), path.size() + 1, segmentPath,
656 newPaths.back().result_->triePositions_ =
657 traverseAlongPathOneStepBySyllables(path, syls);
// Handling of an empty result is not visible (presumably the path just
// added is removed again).
659 if (newPaths.back().triePositions().empty()) {
// Words are only reported for nodes that are not in the ignore set.
665 if (!context.ignore_.contains(¤tNode)) {
667 if (!matchWords(context, newPaths)) {
// Fallback when nothing matched: report the raw segment text as a word
// with invalidPinyinCost (any further condition on the lines between is
// not visible). If prevNode follows a separator, that node is put first.
670 SegmentGraphPath vec;
672 if (
const auto *prevPrev =
673 prevIsSeparator(context.graph_, prevNode)) {
674 vec.push_back(prevPrev);
676 vec.push_back(&prevNode);
677 vec.push_back(¤tNode);
678 WordNode word(pinyin, InvalidWordIndex);
679 context.callback_(vec, word, invalidPinyinCost,
nullptr);
// Hand the extended paths to the caller's list for currentNode.
683 std::move(newPaths.begin(), newPaths.end(),
684 std::back_inserter(currentMatches));
// Computes the matched paths that end at currentNode.
// A node already present in the matched-path map is detected first (the body
// of that check is not visible; presumably an early return). Otherwise the
// entry is created, seeded with empty matches, and extended from every
// predecessor node.
// NOTE(review): "¤t..." below looks like a mis-decoded "&current..." —
// confirm against the upstream file.
687 void PinyinDictionaryPrivate::matchNode(
690 auto &matchedPathsMap = *context.matchedPathsMap_;
692 if (matchedPathsMap.contains(¤tNode)) {
695 auto ¤tMatches = matchedPathsMap[¤tNode];
697 addEmptyMatch(context, currentNode, currentMatches);
700 for (
const auto &prevNode : currentNode.prevs()) {
701 findMatchesBetween(context, prevNode, currentNode, currentMatches);
// Entry point for matching a segment graph against the dictionaries.
// Visits graph nodes taken from a queue and calls matchNode on each; the
// successors of the visited node are iterated (presumably to enqueue them —
// the loop body is not visible).
// localMatchedPaths is a local matched-path map; the lines that choose
// between it and the state passed through `helper` are not visible.
705 void PinyinDictionary::matchPrefixImpl(
706 const SegmentGraph &graph,
const GraphMatchCallback &callback,
707 const std::unordered_set<const SegmentGraphNode *> &ignore,
708 void *helper)
const {
711 NodeToMatchedPinyinPathsMap localMatchedPaths;
// Queue of node pointers (container adaptor and comparator are on lines not
// visible here).
720 using SegmentGraphNodeQueue =
722 std::vector<const SegmentGraphNode *>,
724 SegmentGraphNodeQueue q;
726 const auto &start = graph.start();
737 const auto *currentNode = q.top();
741 for (
const auto &node : currentNode->nexts()) {
745 d->matchNode(context, *currentNode);
// Looks up words for an encoded pinyin buffer in every enabled dictionary.
// `data`/`size` must pass PinyinEncoder::isValidUserPinyin, otherwise the
// function bails out (body not visible). The callback receives the pinyin
// part and the hanzi part of each matching key.
749 void PinyinDictionary::matchWords(
const char *data,
size_t size,
750 PinyinMatchCallback callback)
const {
751 if (!PinyinEncoder::isValidUserPinyin(data, size)) {
// One cursor per enabled dictionary, starting at trie position 0.
756 std::list<std::pair<const PinyinTrie *, PinyinTrie::position_type>> nodes;
757 for (
size_t i = 0; i < dictSize(); i++) {
758 if (d->flags_[i].test(PinyinDictFlag::Disabled)) {
761 const auto &trie = *this->trie(i);
762 nodes.emplace_back(&trie, 0);
// Note the <= : the loop runs one step past the input so that the separator
// is consumed too (the selection of `current` for the earlier steps is not
// visible here).
764 for (
size_t i = 0; i <= size && !nodes.empty(); i++) {
769 current = pinyinHanziSep;
771 searchOneStep(nodes, current);
774 for (
auto &node : nodes) {
776 [&node, &callback, size](PinyinTrie::value_type value,
size_t len,
// Rebuild the full key: `size` pinyin bytes, one separator, then hanzi.
779 node.first->suffix(s, len + size + 1, pos);
781 auto view = std::string_view(s);
782 return callback(view.substr(0, size), view.substr(size + 1),
// Like matchWords, but the input only has to be a prefix of a key's pinyin:
// the separator is not consumed while stepping, and is instead searched for
// in each enumerated key at or after offset `size`.
789 void PinyinDictionary::matchWordsPrefix(
const char *data,
size_t size,
790 PinyinMatchCallback callback)
const {
791 if (!PinyinEncoder::isValidUserPinyin(data, size)) {
// One cursor per enabled dictionary, starting at trie position 0.
796 std::list<std::pair<const PinyinTrie *, PinyinTrie::position_type>> nodes;
797 for (
size_t i = 0; i < dictSize(); i++) {
798 if (d->flags_[i].test(PinyinDictFlag::Disabled)) {
801 const auto &trie = *this->trie(i);
802 nodes.emplace_back(&trie, 0);
804 for (
size_t i = 0; i < size && !nodes.empty(); i++) {
805 searchOneStep(nodes, data[i]);
808 for (
auto &node : nodes) {
810 [&node, &callback, size](PinyinTrie::value_type value,
size_t len,
813 node.first->suffix(s, len + size, pos);
815 std::string_view view(s);
// Keys without a separator after the prefix are handled on lines not
// visible here.
816 if (
auto sep = view.find(pinyinHanziSep, size);
817 sep != std::string::npos) {
818 return callback(view.substr(0, sep), view.substr(sep + 1),
// Creates the private implementation and keeps the per-dictionary flag
// vector in step with the number of dictionaries: it is resized on every
// dictSizeChanged signal and once up front.
// The lambda captures `this`; the connection is held in d->conn_, a
// fcitx::ScopedConnection member of the private object.
827 PinyinDictionary::PinyinDictionary()
828 : d_ptr(std::make_unique<PinyinDictionaryPrivate>(
this)) {
830 d->conn_ = connect<TrieDictionary::dictSizeChanged>([
this](
size_t size) {
832 d->flags_.resize(size);
834 d->flags_.resize(dictSize());
// Defined out of line with an empty body; d_ptr is created in the
// constructor via std::make_unique<PinyinDictionaryPrivate>.
837 PinyinDictionary::~PinyinDictionary() {}
// Loads dictionary `idx` from a file. The file is opened in binary mode for
// both formats; an open failure is reported through throw_if_io_fail.
839 void PinyinDictionary::load(
size_t idx,
const char *filename,
840 PinyinDictFormat format) {
841 std::ifstream in(filename, std::ios::in | std::ios::binary);
842 throw_if_io_fail(in);
843 load(idx, in, format);
// Loads dictionary `idx` from a stream: the trie is fully parsed by
// load(in, format) first and only then installed with setTrie.
846 void PinyinDictionary::load(
size_t idx, std::istream &in,
847 PinyinDictFormat format) {
848 setTrie(idx, load(in, format));
// Parses a trie from `in` in the requested format (the first line of this
// definition is not visible in this view).
// Throws std::invalid_argument for a format value that is neither Text nor
// Binary.
852 PinyinDictFormat format) {
854 case PinyinDictFormat::Text:
855 return loadTextImpl(in);
856 case PinyinDictFormat::Binary:
857 return loadBinaryImpl(in);
859 throw std::invalid_argument(
"invalid format type");
// Replaces the contents of dictionary `idx` with a trie parsed from text.
// Unlike load(), the result is assigned through mutableTrie(idx) rather than
// installed with setTrie.
863 void PinyinDictionary::loadText(
size_t idx, std::istream &in) {
864 *mutableTrie(idx) = loadTextImpl(in);
// Replaces the contents of dictionary `idx` with a trie parsed from the
// binary format. Unlike load(), the result is assigned through
// mutableTrie(idx) rather than installed with setTrie.
867 void PinyinDictionary::loadBinary(
size_t idx, std::istream &in) {
868 *mutableTrie(idx) = loadBinaryImpl(in);
// Saves dictionary `idx` to a file. The file is opened in binary mode for
// both formats; an open failure is reported through throw_if_io_fail.
871 void PinyinDictionary::save(
size_t idx,
const char *filename,
872 PinyinDictFormat format) {
873 std::ofstream fout(filename, std::ios::out | std::ios::binary);
874 throw_if_io_fail(fout);
875 save(idx, fout, format);
// Writes dictionary `idx` to `out` in the requested format.
// Binary layout: magic, version, then the ZSTD-compressed trie — the same
// order that loadBinaryImpl reads. The Text case body is not visible here.
// Throws std::invalid_argument for a format value that is neither Text nor
// Binary.
878 void PinyinDictionary::save(
size_t idx, std::ostream &out,
879 PinyinDictFormat format) {
881 case PinyinDictFormat::Text:
884 case PinyinDictFormat::Binary: {
885 throw_if_io_fail(marshall(out, pinyinBinaryFormatMagic));
886 throw_if_io_fail(marshall(out, pinyinBinaryFormatVersion));
888 writeZSTDCompressed(out, [
this, idx](std::ostream &compressOut) {
889 mutableTrie(idx)->save(compressOut);
893 throw std::invalid_argument(
"invalid format type");
// Writes dictionary `idx` as text, one "hanzi pinyin value" line per trie
// entry — the token order that loadTextImpl parses.
897 void PinyinDictionary::saveText(
size_t idx, std::ostream &out) {
// NOTE(review): `state` is presumably used to save and restore the stream's
// formatting around the setprecision below — the copyfmt calls are not
// visible here; confirm.
899 std::ios state(
nullptr);
901 const auto &trie = *this->trie(idx);
902 trie.foreach([&trie, &buf, &out](
float value,
size_t _len,
903 PinyinTrie::position_type pos) {
904 trie.suffix(buf, _len, pos);
// Keys without a separator are handled on lines not visible here.
905 auto sep = buf.find(pinyinHanziSep);
906 if (sep == std::string::npos) {
// Bytes before the separator are the encoded pinyin; the rest is the hanzi.
909 auto fullPinyin = PinyinEncoder::decodeFullPinyin(buf.data(), sep);
910 std::string_view ref(buf);
// The hanzi is escaped, matching the escaped-value parsing in loadTextImpl.
911 out << fcitx::stringutils::escapeForValue(ref.substr(sep + 1)) <<
" " 912 << fullPinyin <<
" " << std::setprecision(16) << value <<
'\n';
// Adds a word to dictionary `idx`. The trie key is built the same way as in
// loadTextImpl: pinyin encoded with the VE_UE flag, then pinyinHanziSep, then
// the hanzi bytes.
918 void PinyinDictionary::addWord(
size_t idx, std::string_view fullPinyin,
919 std::string_view hanzi,
float cost) {
921 fullPinyin, PinyinFuzzyFlag::VE_UE);
922 result.push_back(pinyinHanziSep);
923 result.insert(result.end(), hanzi.begin(), hanzi.end());
924 TrieDictionary::addWord(idx, std::string_view(result.data(), result.size()),
// Looks up the exact key for (fullPinyin, hanzi) in dictionary `idx`.
// The key is built as in addWord. The return type and the values returned for
// a hit or a miss are on lines not visible in this view.
929 PinyinDictionary::lookupWord(
size_t idx, std::string_view fullPinyin,
930 std::string_view hanzi)
const {
932 fullPinyin, PinyinFuzzyFlag::VE_UE);
933 result.push_back(pinyinHanziSep);
934 result.insert(result.end(), hanzi.begin(), hanzi.end());
935 auto value = trie(idx)->exactMatchSearchRaw(result.data(), result.size());
936 if (PinyinTrie::isValidRaw(value)) {
// Removes the word (fullPinyin, hanzi) from dictionary `idx`.
// The key is built as in addWord; the result of TrieDictionary::removeWord is
// returned unchanged.
942 bool PinyinDictionary::removeWord(
size_t idx, std::string_view fullPinyin,
943 std::string_view hanzi) {
945 fullPinyin, PinyinFuzzyFlag::VE_UE);
946 result.push_back(pinyinHanziSep);
947 result.insert(result.end(), hanzi.begin(), hanzi.end());
948 return TrieDictionary::removeWord(
949 idx, std::string_view(result.data(), result.size()));
// Sets the flags of dictionary `idx`.
// An out-of-range index is detected first (the body of that check is not
// visible; presumably an early return). The flag vector is resized to the
// current dictionary count before the assignment.
952 void PinyinDictionary::setFlags(
size_t idx, PinyinDictFlags flags) {
954 if (idx >= dictSize()) {
957 d->flags_.resize(dictSize());
958 d->flags_[idx] = flags;
PinyinDictionary is a set of dictionaries for Pinyin.
Provide a DATrie implementation.
static std::vector< char > encodeFullPinyinWithFlags(std::string_view pinyin, PinyinFuzzyFlags flags)
Encode a quote-separated pinyin string.