libime
pinyinencoder.cpp
1 /*
2  * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  */
6 #include "pinyinencoder.h"
7 #include <algorithm>
8 #include <array>
9 #include <cassert>
10 #include <cstddef>
11 #include <cstdint>
12 #include <functional>
13 #include <initializer_list>
14 #include <iterator>
15 #include <queue>
16 #include <ranges>
17 #include <stdexcept>
18 #include <string>
19 #include <string_view>
20 #include <tuple>
21 #include <unordered_map>
22 #include <utility>
23 #include <vector>
24 #include <boost/bimap.hpp>
25 #include <boost/container/static_vector.hpp>
26 #include <fcitx-utils/charutils.h>
27 #include <fcitx-utils/log.h>
28 #include <fcitx-utils/stringutils.h>
29 #include "libime/core/segmentgraph.h"
30 #include "pinyincorrectionprofile.h"
31 #include "pinyindata.h"
32 #include "pinyindata_p.h"
33 #include "shuangpinprofile.h"
34 
35 namespace libime {
36 
// Shared empty string handed out by reference when an initial/final value is
// outside the known range.
static const std::string emptyString{};
38 
39 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
40  PinyinFuzzyFlags fuzzy) {
41  log << fuzzy.toInteger();
42  return log;
43 }
44 
45 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
46  PinyinInitial initial) {
47  log << PinyinEncoder::initialToString(initial);
48  return log;
49 }
50 
51 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
52  PinyinFinal final) {
53  log << PinyinEncoder::finalToString(final);
54  return log;
55 }
56 
57 fcitx::LogMessageBuilder &operator<<(fcitx::LogMessageBuilder &log,
58  PinyinSyllable syl) {
59  log << syl.toString();
60  return log;
61 }
62 
63 template <typename L, typename R>
64 boost::bimap<L, R>
65 makeBimap(std::initializer_list<typename boost::bimap<L, R>::value_type> list) {
66  return boost::bimap<L, R>(list.begin(), list.end());
67 }
68 
69 static const auto initialMap = makeBimap<PinyinInitial, std::string>({
70  {PinyinInitial::B, "b"}, {PinyinInitial::P, "p"},
71  {PinyinInitial::M, "m"}, {PinyinInitial::F, "f"},
72  {PinyinInitial::D, "d"}, {PinyinInitial::T, "t"},
73  {PinyinInitial::N, "n"}, {PinyinInitial::L, "l"},
74  {PinyinInitial::G, "g"}, {PinyinInitial::K, "k"},
75  {PinyinInitial::H, "h"}, {PinyinInitial::J, "j"},
76  {PinyinInitial::Q, "q"}, {PinyinInitial::X, "x"},
77  {PinyinInitial::ZH, "zh"}, {PinyinInitial::CH, "ch"},
78  {PinyinInitial::SH, "sh"}, {PinyinInitial::R, "r"},
79  {PinyinInitial::Z, "z"}, {PinyinInitial::C, "c"},
80  {PinyinInitial::S, "s"}, {PinyinInitial::Y, "y"},
81  {PinyinInitial::W, "w"}, {PinyinInitial::Zero, ""},
82 });
83 
84 static const auto finalMap = makeBimap<PinyinFinal, std::string>({
85  {PinyinFinal::A, "a"}, {PinyinFinal::AI, "ai"},
86  {PinyinFinal::AN, "an"}, {PinyinFinal::ANG, "ang"},
87  {PinyinFinal::AO, "ao"}, {PinyinFinal::E, "e"},
88  {PinyinFinal::EI, "ei"}, {PinyinFinal::EN, "en"},
89  {PinyinFinal::ENG, "eng"}, {PinyinFinal::ER, "er"},
90  {PinyinFinal::O, "o"}, {PinyinFinal::ONG, "ong"},
91  {PinyinFinal::OU, "ou"}, {PinyinFinal::I, "i"},
92  {PinyinFinal::IA, "ia"}, {PinyinFinal::IE, "ie"},
93  {PinyinFinal::IAO, "iao"}, {PinyinFinal::IU, "iu"},
94  {PinyinFinal::IAN, "ian"}, {PinyinFinal::IN, "in"},
95  {PinyinFinal::IANG, "iang"}, {PinyinFinal::ING, "ing"},
96  {PinyinFinal::IONG, "iong"}, {PinyinFinal::U, "u"},
97  {PinyinFinal::UA, "ua"}, {PinyinFinal::UO, "uo"},
98  {PinyinFinal::UAI, "uai"}, {PinyinFinal::UI, "ui"},
99  {PinyinFinal::UAN, "uan"}, {PinyinFinal::UN, "un"},
100  {PinyinFinal::UANG, "uang"}, {PinyinFinal::V, "v"},
101  {PinyinFinal::UE, "ue"}, {PinyinFinal::VE, "ve"},
102  {PinyinFinal::NG, "ng"}, {PinyinFinal::Zero, ""},
103  {PinyinFinal::Letter_A, "A"}, {PinyinFinal::Letter_B, "B"},
104  {PinyinFinal::Letter_C, "C"}, {PinyinFinal::Letter_D, "D"},
105  {PinyinFinal::Letter_E, "E"}, {PinyinFinal::Letter_F, "F"},
106  {PinyinFinal::Letter_G, "G"}, {PinyinFinal::Letter_H, "H"},
107  {PinyinFinal::Letter_I, "I"}, {PinyinFinal::Letter_J, "J"},
108  {PinyinFinal::Letter_K, "K"}, {PinyinFinal::Letter_L, "L"},
109  {PinyinFinal::Letter_M, "M"}, {PinyinFinal::Letter_N, "N"},
110  {PinyinFinal::Letter_O, "O"}, {PinyinFinal::Letter_P, "P"},
111  {PinyinFinal::Letter_Q, "Q"}, {PinyinFinal::Letter_R, "R"},
112  {PinyinFinal::Letter_S, "S"}, {PinyinFinal::Letter_T, "T"},
113  {PinyinFinal::Letter_U, "U"}, {PinyinFinal::Letter_V, "V"},
114  {PinyinFinal::Letter_W, "W"}, {PinyinFinal::Letter_X, "X"},
115  {PinyinFinal::Letter_Y, "Y"}, {PinyinFinal::Letter_Z, "Z"},
116 });
117 
// Upper bound on the number of characters longestMatch() considers for a
// single pinyin.
static constexpr int maxPinyinLength = 6;
119 
// Result of longestMatch().
// The opening `struct LongestMatchResult {` line was missing from this
// listing; it is restored here so the fields and closing brace form a complete
// definition matching the type name used by longestMatch().
struct LongestMatchResult {
    // True when `match` is a known pinyin or a bare initial.
    bool valid;
    // The matched prefix of the input.
    std::string_view match;
    // True when `match` was found in the pinyin map and is not one of the
    // bare "m"/"n"/"r" spellings.
    bool isCompletePinyin;
};
125 
126 bool hasMatchInMap(const PinyinMap &map, std::string_view range,
127  PinyinFuzzyFlags flags) {
128  auto iterPair = map.equal_range(range);
129  if (iterPair.first != iterPair.second) {
130  for (const auto &item :
131  std::ranges::subrange(iterPair.first, iterPair.second)) {
132  if (flags.test(item.flags())) {
133  // do not consider m/n/r as complete pinyin
134  return true;
135  }
136  }
137  }
138  return false;
139 }
140 
141 template <typename Iter>
142 LongestMatchResult longestMatch(Iter iter, Iter end, PinyinFuzzyFlags flags,
143  const PinyinMap &map) {
144  if ((*iter == 'i' || *iter == 'u' || *iter == 'v') &&
145  !flags.testAny(PinyinFuzzyFlags{PinyinFuzzyFlag::Correction,
146  PinyinFuzzyFlag::Letter})) {
147  return {.valid = false,
148  .match = std::string_view(&*iter, std::distance(iter, end)),
149  .isCompletePinyin = false};
150  }
151  if (std::distance(iter, end) > maxPinyinLength) {
152  end = iter + maxPinyinLength;
153  }
154  auto range = std::string_view(&*iter, std::distance(iter, end));
155  for (; !range.empty(); range.remove_suffix(1)) {
156  if (hasMatchInMap(map, range, flags)) {
157  // do not consider m/n/r as complete pinyin
158  return {.valid = true,
159  .match = range,
160  .isCompletePinyin =
161  (range != "m" && range != "n" && range != "r")};
162  }
163  if (range.size() <= 2) {
164  auto iter = initialMap.right.find(std::string{range});
165  if (iter != initialMap.right.end()) {
166  return {
167  .valid = true, .match = range, .isCompletePinyin = false};
168  }
169  }
170  }
171 
172  assert(range.empty());
173  range = std::string_view(&*iter, 1);
174 
175  return {.valid = false, .match = range, .isCompletePinyin = false};
176 }
177 
178 std::string PinyinSyllable::toString() const {
179  return PinyinEncoder::initialToString(initial_) +
180  PinyinEncoder::finalToString(final_);
181 }
182 SegmentGraph PinyinEncoder::parseUserPinyin(std::string userPinyin,
183  PinyinFuzzyFlags flags) {
184  return parseUserPinyin(std::move(userPinyin), nullptr, flags);
185 }
186 
188 PinyinEncoder::parseUserPinyin(std::string userPinyin,
189  const PinyinCorrectionProfile *profile,
190  PinyinFuzzyFlags flags) {
191  SegmentGraph result{std::move(userPinyin)};
192  auto pinyin = result.data();
193  const auto end = pinyin.end();
194 
195  if (!profile) {
196  flags = flags.unset(PinyinFuzzyFlag::Correction);
197  }
198  std::vector<PinyinFuzzyFlags> flagsToTry = {flags};
199  if (flags.test(PinyinFuzzyFlag::Correction)) {
200  flagsToTry.push_back(flags.unset(PinyinFuzzyFlag::Correction));
201  }
202  if (flags.test(PinyinFuzzyFlag::AdvancedTypo)) {
203  flagsToTry.push_back(flags.unset(PinyinFuzzyFlag::AdvancedTypo)
204  .unset(PinyinFuzzyFlag::Correction));
205  }
206 
207  const auto &pinyinMap = profile ? profile->pinyinMap() : getPinyinMapV2();
208 
209  std::priority_queue<size_t, std::vector<size_t>, std::greater<>> q;
210  q.push(0);
211  while (!q.empty()) {
212  size_t top;
213  do {
214  top = q.top();
215  q.pop();
216  } while (!q.empty() && q.top() == top);
217  if (top >= pinyin.size()) {
218  continue;
219  }
220  auto iter = std::next(pinyin.begin(), top);
221  if (*iter == '\'') {
222  while (iter != pinyin.end() && *iter == '\'') {
223  iter++;
224  }
225  auto next = std::distance(pinyin.begin(), iter);
226  result.addNext(top, next);
227  if (static_cast<size_t>(next) < pinyin.size()) {
228  q.push(next);
229  }
230  continue;
231  }
232  if (fcitx::charutils::isupper(*iter)) {
233  result.addNext(top, top + 1);
234  q.push(top + 1);
235  continue;
236  }
237  for (const auto fuzzyFlags : flagsToTry) {
238  auto [valid, str, isCompletePinyin] =
239  longestMatch(iter, end, fuzzyFlags, pinyinMap);
240 
241  // it's not complete a pinyin, no need to try
242  if (!valid || !isCompletePinyin) {
243  result.addNext(top, top + str.size());
244  q.push(top + str.size());
245  } else {
246  // check fuzzy seg
247  // pinyin may end with aegimnoruv(h)
248  // and may start with abcdefghjklmnopqrstwxyz.
249  // the intersection is aegmnor(h), while for m, it only 'm',
250  // so don't consider it also, make sure current pinyin does
251  // not end with a separator, other wise, jin'an may be
252  // parsed into ji'n because, nextMatch is starts with "'".
253  std::array<size_t, 2> nextSize;
254  size_t nNextSize = 0;
255  // Check if we can do fuzzy segement, e.g.
256  // zhuni -> zhu ni
257  if (str.size() > 1 && top + str.size() < pinyin.size() &&
258  pinyin[top + str.size()] != '\'' &&
259  (str.back() == 'a' || str.back() == 'e' ||
260  str.back() == 'g' || str.back() == 'n' ||
261  str.back() == 'o' || str.back() == 'r' ||
262  str.back() == 'h' ||
263  fuzzyFlags.test(PinyinFuzzyFlag::Correction)) &&
264  hasMatchInMap(pinyinMap, str.substr(0, str.size() - 1),
265  fuzzyFlags)) {
266  // str[0:-1] is also a full pinyin, check next pinyin
267  auto nextMatch = longestMatch(iter + str.size(), end,
268  fuzzyFlags, pinyinMap);
269  auto nextMatchAlt = longestMatch(iter + str.size() - 1, end,
270  fuzzyFlags, pinyinMap);
271  auto matchSizeAlt =
272  str.size() - 1 + nextMatchAlt.match.size();
273 
274  // comparator is (validPinyin, whole size>= lhs pinyin,
275  // isCompletePinyin) validPinyin means it's at least
276  // some pinyin, instead of things startsWith i,u,v.
277  // Since longestMatch will now treat string startsWith
278  // iuv a whole segment, we need to compare validity
279  // before the length. If whole size is equal to lhs
280  // pinyin, then it should be handled by inner segement
281  // flag.
282  std::tuple<bool, bool, bool> compare(
283  nextMatch.valid, true, nextMatch.isCompletePinyin);
284  std::tuple<bool, bool, bool> compareAlt(
285  nextMatchAlt.valid, matchSizeAlt > str.size(),
286  nextMatchAlt.isCompletePinyin);
287 
288  if (compare >= compareAlt) {
289  result.addNext(top, top + str.size());
290  q.push(top + str.size());
291  nextSize[nNextSize++] = str.size();
292  }
293  if (compare <= compareAlt) {
294  result.addNext(top, top + str.size() - 1);
295  q.push(top + str.size() - 1);
296  nextSize[nNextSize++] = str.size() - 1;
297  }
298  } else {
299  result.addNext(top, top + str.size());
300  q.push(top + str.size());
301  nextSize[nNextSize++] = str.size();
302  }
303 
304  for (size_t i = 0; i < nNextSize; i++) {
305  auto nextPinyin = str.substr(0, nextSize[i]);
306  if (nextPinyin == "din" || nextPinyin == "bon" ||
307  nextPinyin == "won") {
308  result.addNext(top, top + 2);
309  result.addNext(top + 2, top + 3);
310  } else if (nextPinyin == "bong" || nextPinyin == "wong") {
311  // Do a split like {b,w}o n g
312  result.addNext(top, top + 2);
313  result.addNext(top + 2, top + 4);
314  result.addNext(top + 2, top + 3);
315  // Skip top+3 -> top+4, in case g can have a better
316  q.push(top + 3);
317  } else if ((nextPinyin.size() >= 4 &&
318  fuzzyFlags.test(PinyinFuzzyFlag::Inner)) ||
319  (nextPinyin.size() == 3 &&
320  flags.test(PinyinFuzzyFlag::InnerShort))) {
321  const auto &innerSegments = getInnerSegmentV2();
322  auto iter = innerSegments.find(nextPinyin);
323  if (iter != innerSegments.end()) {
324  for (const auto &innerSeg : iter->second) {
325  result.addNext(top,
326  top + innerSeg.first.size());
327  result.addNext(top + innerSeg.first.size(),
328  top + nextSize[i]);
329  }
330  }
331  } else if (nextPinyin.size() == 2 &&
332  flags.test(PinyinFuzzyFlag::InnerShort) &&
333  nextPinyin == "ng") {
334  // Handle ng -> n'g, the condition is so simple so
335  // we don't make it go through the inner segment
336  // lookup.
337  result.addNext(top, top + 1);
338  result.addNext(top + 1, top + 2);
339  }
340  }
341  }
342  }
343  }
344  return result;
345 }
346 
347 SegmentGraph PinyinEncoder::parseUserShuangpin(std::string userPinyin,
348  const ShuangpinProfile &sp,
349  PinyinFuzzyFlags flags) {
350  flags = flags.unset(PinyinFuzzyFlag::AdvancedTypo);
351  SegmentGraph result{std::move(userPinyin)};
352  auto pinyin = result.data();
353 
354  // assume user always type valid shuangpin first, if not keep one.
355  size_t i = 0;
356 
357  const auto &table = sp.table();
358  while (i < pinyin.size()) {
359  auto start = i;
360  while (i < pinyin.size() && pinyin[i] == '\'') {
361  i++;
362  }
363  if (start != i) {
364  result.addNext(start, i);
365  continue;
366  }
367  auto initial = pinyin[i];
368  if (fcitx::charutils::isupper(initial)) {
369  result.addNext(i, i + 1);
370  i = i + 1;
371  continue;
372  }
373  char final = '\0';
374  if (i + 1 < pinyin.size() && pinyin[i + 1] != '\'') {
375  final = pinyin[i + 1];
376  }
377 
378  std::string match{initial};
379  if (final) {
380  match.push_back(final);
381  }
382 
383  auto longestMatchInTable = [flags](decltype(table) t,
384  const std::string &v) {
385  auto py = v;
386  while (!py.empty()) {
387  auto iter = t.find(py);
388  if (iter != t.end()) {
389  for (const auto &p : iter->second) {
390  if (flags.test(p.second)) {
391  return iter;
392  }
393  }
394  }
395  py.pop_back();
396  }
397  return t.end();
398  };
399 
400  auto iter = longestMatchInTable(table, match);
401  if (iter != table.end()) {
402  result.addNext(i, i + iter->first.size());
403  i = i + iter->first.size();
404  } else {
405  result.addNext(i, i + 1);
406  i = i + 1;
407  }
408  }
409 
410  if (pinyin.size() >= 4 && flags.test(PinyinFuzzyFlag::PartialSp)) {
411  size_t i = 0;
412  while (i < pinyin.size()) {
413  size_t start = i;
414  while (i < pinyin.size() && pinyin[i] == '\'') {
415  i++;
416  }
417  // This is already handled above.
418  if (start != i) {
419  continue;
420  }
421  if (!result.ensureNode(i).isChild(&result.ensureNode(i + 1))) {
422  result.addNext(i, i + 1);
423  }
424  i = i + 1;
425  }
426  }
427 
428  return result;
429 }
430 
431 std::vector<char> PinyinEncoder::encodeFullPinyin(std::string_view pinyin) {
432  return encodeFullPinyinWithFlags(pinyin, PinyinFuzzyFlag::None);
433 }
434 
435 std::vector<char>
437  PinyinFuzzyFlags flags) {
438  std::vector<std::string> pinyins = fcitx::stringutils::split(pinyin, "'");
439  std::vector<char> result;
440  result.resize(pinyins.size() * 2);
441  int idx = 0;
442  for (const auto &singlePinyin : pinyins) {
443  const auto &map = getPinyinMapV2();
444  auto [begin, end] = map.equal_range(singlePinyin);
445  if (begin == end) {
446  throw std::invalid_argument("invalid full pinyin: " +
447  std::string{pinyin});
448  }
449 
450  auto pred = [&flags](const PinyinEntry &entry) {
451  return flags.test(entry.flags());
452  };
453  begin = std::find_if(begin, end, pred);
454  if (begin == end) {
455  throw std::invalid_argument("invalid full pinyin: " +
456  std::string{pinyin});
457  }
458 
459  auto iter = begin;
460  begin = std::next(begin);
461  if (!std::none_of(begin, end, pred)) {
462  throw std::invalid_argument("invalid full pinyin: " +
463  std::string{pinyin});
464  }
465 
466  result[idx++] = static_cast<char>(iter->initial());
467  result[idx++] = static_cast<char>(iter->final());
468  }
469 
470  return result;
471 }
472 
473 std::vector<char> PinyinEncoder::encodeOneUserPinyin(std::string pinyin) {
474  if (pinyin.empty()) {
475  return {};
476  }
477  auto graph = parseUserPinyin(std::move(pinyin), PinyinFuzzyFlag::None);
478  std::vector<char> result;
479  const SegmentGraphNode *node = &graph.start();
480  const SegmentGraphNode *prev = nullptr;
481  while (node->nextSize()) {
482  prev = node;
483  node = &node->nexts().front();
484  auto seg = graph.segment(*prev, *node);
485  if (seg.empty() || seg[0] == '\'') {
486  continue;
487  }
488  auto syls = stringToSyllables(seg, PinyinFuzzyFlag::None);
489  if (syls.empty()) {
490  return {};
491  }
492  result.push_back(static_cast<char>(syls[0].first));
493  result.push_back(static_cast<char>(syls[0].second[0].first));
494  }
495  return result;
496 }
497 
498 bool PinyinEncoder::isValidUserPinyin(const char *data, size_t size) {
499  if (size % 2 != 0) {
500  return false;
501  }
502 
503  for (size_t i = 0; i < size / 2; i++) {
504  if (!PinyinEncoder::isValidInitial(data[i * 2])) {
505  return false;
506  }
507  }
508  return true;
509 }
510 
511 std::string PinyinEncoder::decodeFullPinyin(const char *data, size_t size) {
512  if (size % 2 != 0) {
513  throw std::invalid_argument("invalid pinyin key");
514  }
515  std::string result;
516  for (size_t i = 0, e = size / 2; i < e; i++) {
517  if (i) {
518  result += '\'';
519  }
520  result += initialToString(static_cast<PinyinInitial>(data[i * 2]));
521  result += finalToString(static_cast<PinyinFinal>(data[(i * 2) + 1]));
522  }
523  return result;
524 }
525 
526 const std::string &PinyinEncoder::initialToString(PinyinInitial initial) {
527  const static std::vector<std::string> s = []() {
528  std::vector<std::string> s;
529  s.resize(lastInitial - firstInitial + 1);
530  for (char c = firstInitial; c <= lastInitial; c++) {
531  auto iter = initialMap.left.find(static_cast<PinyinInitial>(c));
532  s[c - firstInitial] = iter->second;
533  }
534  return s;
535  }();
536  auto c = static_cast<char>(initial);
537  if (c >= firstInitial && c <= lastInitial) {
538  return s[c - firstInitial];
539  }
540  return emptyString;
541 }
542 
543 PinyinInitial PinyinEncoder::stringToInitial(const std::string &str) {
544  auto iter = initialMap.right.find(str);
545  if (iter != initialMap.right.end()) {
546  return iter->second;
547  }
548  return PinyinInitial::Invalid;
549 }
550 
551 const std::string &PinyinEncoder::finalToString(PinyinFinal final) {
552  const static std::vector<std::string> s = []() {
553  std::vector<std::string> s;
554  s.resize(lastLetter - firstFinal + 1);
555  for (char c = firstFinal; c <= lastLetter; c++) {
556  auto iter = finalMap.left.find(static_cast<PinyinFinal>(c));
557  s[c - firstFinal] = iter->second;
558  }
559  return s;
560  }();
561  auto c = static_cast<char>(final);
562  if (c >= firstFinal && c <= lastLetter) {
563  return s[c - firstFinal];
564  }
565  return emptyString;
566 }
567 
568 PinyinFinal PinyinEncoder::stringToFinal(const std::string &str) {
569  auto iter = finalMap.right.find(str);
570  if (iter != finalMap.right.end()) {
571  return iter->second;
572  }
573  return PinyinFinal::Invalid;
574 }
575 
576 bool PinyinEncoder::isValidInitialFinal(PinyinInitial initial,
577  PinyinFinal final) {
578  if (initial != PinyinInitial::Invalid && final != PinyinFinal::Invalid) {
579  int16_t encode =
580  ((static_cast<int16_t>(initial) - PinyinEncoder::firstInitial) *
581  (PinyinEncoder::lastLetter - PinyinEncoder::firstFinal + 1)) +
582  (static_cast<int16_t>(final) - PinyinEncoder::firstFinal);
583  const auto &a = getEncodedInitialFinal();
584  return encode < static_cast<int>(a.size()) && a[encode];
585  }
586  return false;
587 }
588 
589 std::string PinyinEncoder::initialFinalToPinyinString(PinyinInitial initial,
590  PinyinFinal final) {
591  std::string result = initialToString(initial);
592  std::string finalString;
593  switch (final) {
594  case PinyinFinal::VE:
595  case PinyinFinal::V:
596  if (initial == PinyinInitial::N || initial == PinyinInitial::L) {
597  if (final == PinyinFinal::VE) {
598  finalString = "üe";
599  } else {
600  finalString = "ü";
601  }
602  break;
603  }
604  // FALLTHROUGH
605  default:
606  finalString = finalToString(final);
607  break;
608  }
609  result.append(finalString);
610  return result;
611 }
612 
613 namespace {
614 
615 template <typename FuzzyValue, typename Adjuster>
616 void getFuzzy(FuzzyPinyinSyllables<FuzzyValue> &syls, PinyinSyllable syl,
617  PinyinFuzzyFlags flags, bool isSp, const Adjuster &adjuster) {
618  // ng/gn is already handled by table
619  boost::container::static_vector<std::tuple<PinyinInitial, PinyinFuzzyFlags>,
620  2>
621  initials{{syl.initial(), PinyinFuzzyFlag::None}};
622  boost::container::static_vector<std::tuple<PinyinFinal, PinyinFuzzyFlags>,
623  10>
624  finals{{syl.final(), PinyinFuzzyFlag::None}};
625 
626  // for full pinyin {s,z,c} we also want them to match {sh,zh,ch}
627  if (syl.final() == PinyinFinal::Invalid && !isSp) {
628  if (syl.initial() == PinyinInitial::C) {
629  flags |= PinyinFuzzyFlag::C_CH;
630  }
631  if (syl.initial() == PinyinInitial::Z) {
632  flags |= PinyinFuzzyFlag::Z_ZH;
633  }
634  if (syl.initial() == PinyinInitial::S) {
635  flags |= PinyinFuzzyFlag::S_SH;
636  }
637  }
638 
639  const static std::vector<
640  std::tuple<PinyinInitial, PinyinInitial, PinyinFuzzyFlag>>
641  initialFuzzies = {
642  {PinyinInitial::C, PinyinInitial::CH, PinyinFuzzyFlag::C_CH},
643  {PinyinInitial::S, PinyinInitial::SH, PinyinFuzzyFlag::S_SH},
644  {PinyinInitial::Z, PinyinInitial::ZH, PinyinFuzzyFlag::Z_ZH},
645  {PinyinInitial::F, PinyinInitial::H, PinyinFuzzyFlag::F_H},
646  {PinyinInitial::L, PinyinInitial::N, PinyinFuzzyFlag::L_N},
647  {PinyinInitial::L, PinyinInitial::R, PinyinFuzzyFlag::L_R},
648  };
649 
650  for (const auto &initialFuzzy : initialFuzzies) {
651  if ((syl.initial() == std::get<0>(initialFuzzy) ||
652  syl.initial() == std::get<1>(initialFuzzy)) &&
653  flags.test(std::get<2>(initialFuzzy))) {
654  initials.emplace_back((syl.initial() == std::get<0>(initialFuzzy)
655  ? std::get<1>(initialFuzzy)
656  : std::get<0>(initialFuzzy)),
657  std::get<2>(initialFuzzy));
658  break;
659  }
660  }
661 
662  const static std::vector<
663  std::tuple<PinyinFinal, PinyinFinal, PinyinFuzzyFlag>>
664  finalFuzzies = {
665  {PinyinFinal::V, PinyinFinal::U, PinyinFuzzyFlag::V_U},
666  {PinyinFinal::AN, PinyinFinal::ANG, PinyinFuzzyFlag::AN_ANG},
667  {PinyinFinal::EN, PinyinFinal::ENG, PinyinFuzzyFlag::EN_ENG},
668  {PinyinFinal::IAN, PinyinFinal::IANG, PinyinFuzzyFlag::IAN_IANG},
669  {PinyinFinal::IN, PinyinFinal::ING, PinyinFuzzyFlag::IN_ING},
670  {PinyinFinal::U, PinyinFinal::OU, PinyinFuzzyFlag::U_OU},
671  {PinyinFinal::UAN, PinyinFinal::UANG, PinyinFuzzyFlag::UAN_UANG},
672  {PinyinFinal::VE, PinyinFinal::UE, PinyinFuzzyFlag::VE_UE},
673  };
674 
675  for (const auto &finalFuzzy : finalFuzzies) {
676  if ((syl.final() == std::get<0>(finalFuzzy) ||
677  syl.final() == std::get<1>(finalFuzzy)) &&
678  flags.test(std::get<2>(finalFuzzy))) {
679  finals.emplace_back((syl.final() == std::get<0>(finalFuzzy)
680  ? std::get<1>(finalFuzzy)
681  : std::get<0>(finalFuzzy)),
682  std::get<2>(finalFuzzy));
683  break;
684  }
685  }
686 
687  // "aeo"
688 
689  const static std::vector<std::tuple<PinyinFinal, PinyinFinal>>
690  partialFinals = {
691  {PinyinFinal::A, PinyinFinal::AN},
692  {PinyinFinal::A, PinyinFinal::ANG},
693  {PinyinFinal::A, PinyinFinal::AI},
694  {PinyinFinal::A, PinyinFinal::AO},
695  {PinyinFinal::E, PinyinFinal::EI},
696  {PinyinFinal::E, PinyinFinal::EN},
697  {PinyinFinal::E, PinyinFinal::ENG},
698  {PinyinFinal::E, PinyinFinal::ER},
699  {PinyinFinal::O, PinyinFinal::OU},
700  {PinyinFinal::O, PinyinFinal::ONG},
701  };
702  if (initials.size() == 1 &&
703  std::get<0>(initials[0]) == PinyinInitial::Zero &&
704  flags.test(PinyinFuzzyFlag::PartialFinal)) {
705  for (const auto &partialFinal : partialFinals) {
706  if (syl.final() == std::get<0>(partialFinal)) {
707  finals.emplace_back(std::get<1>(partialFinal),
708  PinyinFuzzyFlag::PartialFinal);
709  }
710  }
711  }
712 
713  for (size_t i = 0; i < initials.size(); i++) {
714  for (size_t j = 0; j < finals.size(); j++) {
715  auto initial = std::get<0>(initials[i]);
716  auto final = std::get<0>(finals[j]);
717  auto flags = std::get<1>(initials[i]) | std::get<1>(finals[j]);
718  if ((i == 0 && j == 0) || final == PinyinFinal::Invalid ||
719  PinyinEncoder::isValidInitialFinal(initial, final)) {
720  auto iter = std::find_if(
721  syls.begin(), syls.end(),
722  [initial](const auto &p) { return p.first == initial; });
723  if (iter == syls.end()) {
724  syls.emplace_back(std::piecewise_construct,
725  std::forward_as_tuple(initial),
726  std::forward_as_tuple());
727  iter = std::prev(syls.end());
728  }
729  auto &finals = iter->second;
730  if (std::find_if(finals.begin(), finals.end(),
731  [final](auto &p) {
732  return p.first == final;
733  }) == finals.end()) {
734  finals.emplace_back(final, adjuster(flags));
735  }
736  }
737  }
738  }
739 }
740 
741 template <typename FuzzyValue, typename Adjuster>
742 FuzzyPinyinSyllables<FuzzyValue>
743 stringToSyllablesImpl(std::string_view pinyinView, const PinyinMap &map,
744  PinyinFuzzyFlags flags, const Adjuster &adjuster) {
745  FuzzyPinyinSyllables<FuzzyValue> result;
746  std::string pinyin(pinyinView);
747  // we only want {M,N,R}/Invalid instead of {M,N,R}/Zero, so we could get
748  // match for everything.
749  if (pinyin != "m" && pinyin != "n" && pinyin != "r") {
750  auto iterPair = map.equal_range(pinyin);
751  for (const auto &item :
752  std::ranges::subrange(iterPair.first, iterPair.second)) {
753  if (flags.test(item.flags())) {
754  getFuzzy(result, {item.initial(), item.final()}, flags,
755  /*isSp=*/false,
756  [&adjuster, &item](PinyinFuzzyFlags flags) {
757  return adjuster(item.flags() | flags);
758  });
759  }
760  }
761  }
762 
763  auto iter = initialMap.right.find(pinyin);
764  if (initialMap.right.end() != iter) {
765  getFuzzy(result, {iter->second, PinyinFinal::Invalid}, flags,
766  /*isSp=*/false, adjuster);
767  }
768 
769  if (pinyin.size() == 1 && fcitx::charutils::islower(pinyin[0]) &&
770  flags.test(PinyinFuzzyFlag::Letter)) {
771  getFuzzy(result,
772  {PinyinInitial::Zero, PinyinEncoder::letterToFinal(pinyin[0])},
773  flags,
774  /*isSp=*/false, [&adjuster](PinyinFuzzyFlags flags) {
775  return adjuster(flags | PinyinFuzzyFlag::Letter);
776  });
777  }
778 
779  if (result.empty()) {
780  result.emplace_back(
781  std::piecewise_construct,
782  std::forward_as_tuple(PinyinInitial::Invalid),
783  std::forward_as_tuple(
784  1, std::make_pair(PinyinFinal::Invalid,
785  adjuster(PinyinFuzzyFlag::None))));
786  }
787 
788 #if 0
789  else {
790  // replace invalid
791  for (auto &p : result) {
792  if (p.second.size() == 1 && p.second[0] == PinyinFinal::Invalid) {
793  p.second.clear();
794  for (char test = PinyinEncoder::firstFinal;
795  test <= PinyinEncoder::lastFinal; test++) {
796  auto final = static_cast<PinyinFinal>(test);
797  if (PinyinEncoder::isValidInitialFinal(p.first, final)) {
798  p.second.push_back(final);
799  }
800  }
801  }
802  }
803  }
804 #endif
805 
806  return result;
807 }
808 
809 } // namespace
810 
811 MatchedPinyinSyllables
812 PinyinEncoder::stringToSyllables(std::string_view pinyinView,
813  PinyinFuzzyFlags flags) {
814  auto adjuster = [](const PinyinFuzzyFlags &flags) {
815  return flags != PinyinFuzzyFlag::None;
816  };
817  return stringToSyllablesImpl<bool>(pinyinView, getPinyinMapV2(), flags,
818  adjuster);
819 }
820 
821 MatchedPinyinSyllablesWithFuzzyFlags
822 PinyinEncoder::stringToSyllablesWithFuzzyFlags(
823  std::string_view pinyinView, const PinyinCorrectionProfile *profile,
824  PinyinFuzzyFlags flags) {
825  auto identity = [](const PinyinFuzzyFlags &flags) { return flags; };
826  return stringToSyllablesImpl<PinyinFuzzyFlags>(
827  pinyinView, profile ? profile->pinyinMap() : getPinyinMapV2(), flags,
828  identity);
829 }
830 
831 namespace {
832 
833 template <typename FuzzyValue, typename Adjuster>
834 FuzzyPinyinSyllables<FuzzyValue>
835 shuangpinToSyllablesImpl(std::string_view pinyinView,
836  const ShuangpinProfile &sp, PinyinFuzzyFlags flags,
837  const Adjuster &adjuster) {
838  assert(pinyinView.size() <= 2);
839  std::string pinyin(pinyinView);
840 
841  const auto &table = sp.table();
842  auto iter = table.find(pinyin);
843 
844  // Don't match partial final if our shuangpin is full size.
845  if (pinyinView.size() > 1) {
846  // This option is somewhat meaningless in full Shuangpin.
847  flags = flags.unset(PinyinFuzzyFlag::PartialFinal);
848  }
849 
850  FuzzyPinyinSyllables<FuzzyValue> result;
851  if (iter != table.end()) {
852  for (const auto &p : iter->second) {
853  if (flags.test(p.second)) {
854  getFuzzy(result, {p.first.initial(), p.first.final()}, flags,
855  /*isSp=*/true,
856  [base = p.second, &adjuster](PinyinFuzzyFlags flags) {
857  return adjuster(flags | base);
858  });
859  }
860  }
861  }
862 
863  if (pinyin.length() == 1 && ((fcitx::charutils::islower(pinyin[0]) &&
864  flags.test(PinyinFuzzyFlag::Letter)) ||
865  fcitx::charutils::isupper(pinyin[0]))) {
866  bool isLower = fcitx::charutils::islower(pinyin[0]);
867  getFuzzy(result,
868  {PinyinInitial::Zero, PinyinEncoder::letterToFinal(pinyin[0])},
869  flags,
870  /*isSp=*/true, [&adjuster, isLower](PinyinFuzzyFlags flags) {
871  if (isLower) {
872  flags |= PinyinFuzzyFlag::Letter;
873  }
874  return adjuster(flags);
875  });
876  }
877 
878  if (result.empty()) {
879  result.emplace_back(
880  std::piecewise_construct,
881  std::forward_as_tuple(PinyinInitial::Invalid),
882  std::forward_as_tuple(
883  1, std::make_pair(PinyinFinal::Invalid,
884  adjuster(PinyinFuzzyFlag::None))));
885  }
886 
887  return result;
888 }
889 
890 } // namespace
891 
892 MatchedPinyinSyllables
893 PinyinEncoder::shuangpinToSyllables(std::string_view pinyinView,
894  const ShuangpinProfile &sp,
895  PinyinFuzzyFlags flags) {
896  auto adjuster = [](const PinyinFuzzyFlags &flags) {
897  return flags != PinyinFuzzyFlag::None;
898  };
899  return shuangpinToSyllablesImpl<bool>(pinyinView, sp, flags, adjuster);
900 }
901 
902 MatchedPinyinSyllablesWithFuzzyFlags
903 PinyinEncoder::shuangpinToSyllablesWithFuzzyFlags(std::string_view pinyinView,
904  const ShuangpinProfile &sp,
905  PinyinFuzzyFlags flags) {
906  auto identity = [](const PinyinFuzzyFlags &flags) { return flags; };
907  return shuangpinToSyllablesImpl<PinyinFuzzyFlags>(pinyinView, sp, flags,
908  identity);
909 }
910 
911 std::string
912 PinyinEncoder::shuangpinToPinyin(std::string_view pinyinView,
913  const libime::ShuangpinProfile &sp) {
914  assert(pinyinView.size() <= 2);
915  auto syls = shuangpinToSyllables(pinyinView, sp, PinyinFuzzyFlag::None);
916  if (!syls.empty() && !syls[0].second.empty() && !syls[0].second[0].second) {
917  auto initial = syls[0].first;
918  auto final = syls[0].second[0].first;
919  return initialToString(initial) + finalToString(final);
920  }
921  return "";
922 }
923 
924 bool PinyinEncoder::isFinalLetter(PinyinFinal final) {
925  return final >= PinyinFinal::Letter_A && final <= PinyinFinal::Letter_Z;
926 }
927 
928 PinyinFinal PinyinEncoder::letterToFinal(char c) {
929  if (c >= 'a' && c <= 'z') {
930  return static_cast<PinyinFinal>(
931  static_cast<char>(PinyinFinal::Letter_A) + (c - 'a'));
932  }
933  if (c >= 'A' && c <= 'Z') {
934  return static_cast<PinyinFinal>(
935  static_cast<char>(PinyinFinal::Letter_A) + (c - 'A'));
936  }
937  return PinyinFinal::Invalid;
938 }
939 
940 } // namespace libime
static bool isFinalLetter(PinyinFinal final)
Check if the final is a letter.
static std::vector< char > encodeFullPinyinWithFlags(std::string_view pinyin, PinyinFuzzyFlags flags)
Encode a quote separated pinyin string.
const PinyinMap & pinyinMap() const
Return the updated pinyin map.
Class that holds updated Pinyin correction mapping based on correction mapping.
static std::vector< char > encodeFullPinyin(std::string_view pinyin)
Encode a quote separated pinyin string.