libime
shuangpinprofile.cpp
1 /*
2  * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  */
6 #include "shuangpinprofile.h"
7 #include <algorithm>
8 #include <cassert>
9 #include <cstddef>
10 #include <istream>
11 #include <map>
12 #include <memory>
13 #include <ranges>
14 #include <set>
15 #include <stdexcept>
16 #include <string>
17 #include <string_view>
18 #include <tuple>
19 #include <unordered_map>
20 #include <utility>
21 #include <vector>
22 #include <fcitx-utils/charutils.h>
23 #include <fcitx-utils/macros.h>
24 #include <fcitx-utils/stringutils.h>
25 #include "pinyincorrectionprofile.h"
26 #include "pinyindata.h"
27 #include "pinyinencoder.h"
28 #include "shuangpindata.h"
29 
30 namespace libime {
31 
33 public:
34  ShuangpinProfilePrivate() = default;
35  FCITX_INLINE_DEFINE_DEFAULT_DTOR_COPY_AND_MOVE(ShuangpinProfilePrivate)
36 
37  std::string zeroS_ = "o";
38  std::unordered_multimap<char, PinyinFinal> finalMap_;
39  std::unordered_multimap<char, PinyinInitial> initialMap_;
40  std::unordered_multimap<std::string, std::pair<PinyinInitial, PinyinFinal>>
41  initialFinalMap_;
42  std::set<PinyinFinal> finalSet_;
43  ShuangpinProfile::ValidInputSetType validInputs_;
44  ShuangpinProfile::ValidInputSetType validInitials_;
45  ShuangpinProfile::TableType spTable_;
46 
47  void buildShuangpinTable(const PinyinCorrectionProfile *correctionProfile) {
48  // Set up valid inputs.
49  for (char c = 'a'; c <= 'z'; c++) {
50  validInputs_.insert(c);
51  }
52  for (const auto &p : initialMap_) {
53  validInputs_.insert(p.first);
54  }
55  std::unordered_map<PinyinFinal, char> singleCharFinal;
56  for (const auto &p : finalMap_) {
57  validInputs_.insert(p.first);
58  if (PinyinEncoder::finalToString(p.second).size() == 1) {
59  singleCharFinal[p.second] = p.first;
60  }
61  }
62 
63  for (const auto &p : initialFinalMap_) {
64  for (auto c : p.first) {
65  validInputs_.insert(c);
66  }
67  }
68 
69  std::set<char> initialChars;
70  for (auto zero : zeroS_) {
71  if (zero != '*') {
72  validInputs_.insert(zero);
73  initialChars.insert(zero);
74  }
75  }
76 
77  // Collect all initial and final chars.
78  // Add single char initial to initialChars.
79  for (auto c = PinyinEncoder::firstInitial;
80  c <= PinyinEncoder::lastInitial; c++) {
81  const auto &initialString =
82  PinyinEncoder::initialToString(static_cast<PinyinInitial>(c));
83  if (initialString.size() == 1) {
84  initialChars.insert(initialString[0]);
85  }
86  }
87  // Add char in map to initialChars.
88  for (auto &p : initialMap_) {
89  initialChars.insert(p.first);
90  }
91 
92  // Collect all final chars.
93  // Add single char final to finalChars.
94  std::set<char> finalChars;
95  for (auto c = PinyinEncoder::firstFinal; c <= PinyinEncoder::lastFinal;
96  c++) {
97  auto f = static_cast<PinyinFinal>(c);
98  const auto &finalString = PinyinEncoder::finalToString(f);
99  if (finalString.size() == 1 && !singleCharFinal.contains(f)) {
100  finalChars.insert(finalString[0]);
101  singleCharFinal[f] = finalString[0];
102  }
103  }
104  // Add final in map to finalChars
105  for (auto &p : finalMap_) {
106  finalChars.insert(p.first);
107  }
108 
109  for (const auto &[final, chr] : singleCharFinal) {
110  auto [begin, end] = finalMap_.equal_range(chr);
111  if (std::find_if(begin, end, [final = final](const auto &item) {
112  return item.second == final;
113  }) == end) {
114  finalMap_.emplace(chr, final);
115  }
116  }
117 
118  auto addPinyinToList =
119  [](std::multimap<PinyinSyllable, PinyinFuzzyFlags> &pys,
120  PinyinInitial i, PinyinFinal f, PinyinFuzzyFlags flags) {
121  PinyinSyllable s(i, f);
122  if (flags == PinyinFuzzyFlag::None) {
123 
124  auto iter = pys.find(s);
125  // We replace fuzzy with non-fuzzy.
126  if (iter != pys.end() &&
127  iter->second != PinyinFuzzyFlag::None) {
128  pys.erase(s);
129  iter = pys.end();
130  }
131  if (iter == pys.end()) {
132  pys.emplace(s, flags);
133  }
134  } else {
135  auto iterPair = pys.equal_range(s);
136  // no match
137  if (iterPair.first != iterPair.second) {
138  if (iterPair.first->second == PinyinFuzzyFlag::None) {
139  return;
140  }
141  // check dup
142  for (auto i = iterPair.first; i != iterPair.second;
143  i++) {
144  if (i->second == flags) {
145  return;
146  }
147  }
148  }
149 
150  pys.emplace(s, flags);
151  }
152  };
153 
154  auto addPinyin =
155  [addPinyinToList](
156  std::multimap<PinyinSyllable, PinyinFuzzyFlags> &pys,
157  const std::string &py) {
158  const auto &map = getPinyinMapV2();
159  auto iterPair = map.equal_range(py);
160  if (iterPair.first != iterPair.second) {
161  for (const auto &item : std::ranges::subrange(
162  iterPair.first, iterPair.second)) {
163  // Shuangpin should not consider advanced typo, since
164  // it's swapping character order and will leads to wrong
165  // entry. Common typo also have "ng->gn" is ok.
166  if (item.flags().test(PinyinFuzzyFlag::AdvancedTypo)) {
167  continue;
168  }
169  addPinyinToList(pys, item.initial(), item.final(),
170  item.flags());
171  }
172  }
173  };
174 
175  // Special handling for Ziranma & Xiaohe style.
176  if (zeroS_.find('*') != std::string::npos) {
177  // length 1: aeiou, repeat it once: e.g. aa
178  // length 2: keep same as quanpin
179  // length 3: use the initial of quanpin and the one in the table.
180  for (auto c : finalChars) {
181  // If c is in final map.
182  auto finalIterPair = finalMap_.equal_range(c);
183  for (auto &item : std::ranges::subrange(finalIterPair.first,
184  finalIterPair.second)) {
185  if (PinyinEncoder::isValidInitialFinal(PinyinInitial::Zero,
186  item.second)) {
187  std::string input;
188  const auto &finalString =
189  PinyinEncoder::finalToString(item.second);
190  if (finalString.size() == 1) {
191  input = std::string{c, c};
192  } else {
193  auto final = PinyinEncoder::stringToFinal(
194  std::string{finalString[0]});
195  if (final != PinyinFinal::Invalid) {
196  auto singleCharFinalIter =
197  singleCharFinal.find(final);
198  if (singleCharFinalIter !=
199  singleCharFinal.end()) {
200  input = std::string{
201  singleCharFinalIter->second, c};
202  }
203  }
204  }
205  spTable_[input].emplace(
206  PinyinSyllable{PinyinInitial::Zero, item.second},
207  PinyinFuzzyFlag::None);
208  }
209  }
210  }
211  }
212 
213  // Enumerate the combinition of initial + final
214  for (auto c1 : initialChars) {
215  for (auto c2 : finalChars) {
216  std::string input{c1, c2};
217  auto &pys = spTable_[input];
218 
219  std::vector<PinyinInitial> initials;
220  std::vector<PinyinFinal> finals;
221  auto initialIterPair = initialMap_.equal_range(c1);
222  if (initialIterPair.first != initialIterPair.second) {
223  for (auto &item : std::ranges::subrange(
224  initialIterPair.first, initialIterPair.second)) {
225  initials.push_back(item.second);
226  }
227  }
228  auto initial = PinyinEncoder::stringToInitial(std::string{c1});
229  if (initial != PinyinInitial::Invalid) {
230  initials.push_back(initial);
231  }
232 
233  if (zeroS_.find(c1) != std::string::npos) {
234  initials.push_back(PinyinInitial::Zero);
235  }
236 
237  auto finalIterPair = finalMap_.equal_range(c2);
238  for (auto &item : std::ranges::subrange(finalIterPair.first,
239  finalIterPair.second)) {
240  finals.push_back(item.second);
241  }
242 
243  for (auto i : initials) {
244  for (auto f : finals) {
245  auto py = PinyinEncoder::initialToString(i) +
246  PinyinEncoder::finalToString(f);
247  addPinyin(pys, py);
248  }
249  }
250 
251  if (pys.empty()) {
252  spTable_.erase(input);
253  }
254  }
255  }
256 
257  // Populate initial final map.
258  for (const auto &p : initialFinalMap_) {
259  auto &pys = spTable_[p.first];
260  auto py = PinyinEncoder::initialToString(p.second.first) +
261  PinyinEncoder::finalToString(p.second.second);
262  addPinyin(pys, py);
263  }
264 
265  // Add non-existent 2 char pinyin to the map.
266  for (const auto &p : getPinyinMapV2()) {
267  // Don't add "ng" as two char direct pinyin.
268  if (p.pinyin() == "ng") {
269  continue;
270  }
271 
272  if (p.pinyin().size() == 2 && p.initial() == PinyinInitial::Zero &&
273  (!spTable_.contains(p.pinyin()) ||
274  zeroS_.find('*') != std::string::npos)) {
275  auto &pys = spTable_[p.pinyin()];
276  pys.emplace(PinyinSyllable{p.initial(), p.final()}, p.flags());
277  }
278  }
279 
280  // Add partial pinyin to the table.
281  for (char c : validInputs_) {
282  std::string input{c};
283  auto &pys = spTable_[input];
284  auto initial = PinyinEncoder::stringToInitial(std::string{c});
285  if (initial != PinyinInitial::Invalid) {
286  addPinyinToList(pys, initial, PinyinFinal::Invalid,
287  PinyinFuzzyFlag::None);
288  }
289  auto initialIterPair = initialMap_.equal_range(c);
290  for (auto &item : std::ranges::subrange(initialIterPair.first,
291  initialIterPair.second)) {
292  addPinyinToList(pys, item.second, PinyinFinal::Invalid,
293  PinyinFuzzyFlag::None);
294  }
295 
296  // Add single char final to partial pinyin.
297  auto [begin, end] = finalMap_.equal_range(c);
298  for (auto &item : std::ranges::subrange(begin, end)) {
299  const auto final = item.second;
300  if (PinyinEncoder::finalToString(final).size() == 1 &&
301  PinyinEncoder::isValidInitialFinal(PinyinInitial::Zero,
302  final) &&
303  pys.empty()) {
304  addPinyinToList(pys, PinyinInitial::Zero, final,
305  PinyinFuzzyFlag::None);
306  }
307  }
308 
309  if (pys.empty()) {
310  spTable_.erase(input);
311  }
312  }
313 
314  std::vector<std::tuple<std::string, PinyinSyllable, PinyinFuzzyFlags>>
315  newEntries;
316 
317  if (correctionProfile != nullptr) {
318  const auto &correctionMap = correctionProfile->correctionMap();
319  for (const auto &[input, pys] : spTable_) {
320  // Only apply correction on full shuangpin.
321  if (input.size() < 2) {
322  continue;
323  }
324  for (size_t i = 0; i < input.size(); i++) {
325  auto chr = input[i];
326  auto swap = correctionMap.find(chr);
327  if (swap == correctionMap.end() || swap->second.empty()) {
328  continue;
329  }
330  std::string newInput = input;
331  for (auto sub : swap->second) {
332  newInput[i] = sub;
333  for (const auto &x : pys) {
334  newEntries.emplace_back(
335  newInput, x.first,
336  x.second | PinyinFuzzyFlag::Correction);
337  }
338  newInput[i] = chr;
339  }
340  }
341  }
342  }
343 
344  for (const auto &[input, syllable, flags] : newEntries) {
345  auto &pys = spTable_[input];
346  pys.emplace(syllable, flags);
347  }
348 
349  for (const auto &sp : spTable_) {
350  assert(!sp.first.empty() && sp.first.size() <= 2);
351  validInitials_.insert(sp.first[0]);
352  }
353  }
354 };
355 
356 ShuangpinProfile::ShuangpinProfile(ShuangpinBuiltinProfile profile)
357  : ShuangpinProfile::ShuangpinProfile(profile, nullptr) {}
358 
359 ShuangpinProfile::ShuangpinProfile(std::istream &in)
360  : ShuangpinProfile::ShuangpinProfile(in, nullptr) {}
361 
362 ShuangpinProfile::ShuangpinProfile(
363  ShuangpinBuiltinProfile profile,
364  const PinyinCorrectionProfile *correctionProfile)
365  : d_ptr(std::make_unique<ShuangpinProfilePrivate>()) {
366  FCITX_D();
367  const SP_C *c = nullptr;
368  const SP_S *s = nullptr;
369  switch (profile) {
370  case ShuangpinBuiltinProfile::Ziranma:
371  c = SPMap_C_Ziranma;
372  s = SPMap_S_Ziranma;
373  d->zeroS_ = "o*";
374  break;
375  case ShuangpinBuiltinProfile::MS:
376  c = SPMap_C_MS;
377  s = SPMap_S_MS;
378  break;
379  case ShuangpinBuiltinProfile::Ziguang:
380  c = SPMap_C_Ziguang;
381  s = SPMap_S_Ziguang;
382  break;
383  case ShuangpinBuiltinProfile::ABC:
384  c = SPMap_C_ABC;
385  s = SPMap_S_ABC;
386  break;
387  case ShuangpinBuiltinProfile::Zhongwenzhixing:
388  c = SPMap_C_Zhongwenzhixing;
389  s = SPMap_S_Zhongwenzhixing;
390  break;
391  case ShuangpinBuiltinProfile::PinyinJiajia:
392  c = SPMap_C_PinyinJiaJia;
393  s = SPMap_S_PinyinJiaJia;
394  d->zeroS_ = "o*";
395  break;
396  case ShuangpinBuiltinProfile::Xiaohe:
397  d->zeroS_ = "*";
398  c = SPMap_C_XIAOHE;
399  s = SPMap_S_XIAOHE;
400  break;
401  case ShuangpinBuiltinProfile::GB:
402  d->zeroS_ = "a";
403  c = SPMap_C_GB;
404  s = SPMap_S_GB;
405  break;
406  default:
407  throw std::invalid_argument("Invalid profile");
408  }
409 
410  for (auto i = 0; c[i].cJP; i++) {
411  auto final = PinyinEncoder::stringToFinal(c[i].strQP);
412  d->finalMap_.emplace(c[i].cJP, final);
413  d->finalSet_.insert(final);
414  }
415 
416  for (auto i = 0; s[i].cJP; i++) {
417  d->initialMap_.emplace(s[i].cJP,
418  PinyinEncoder::stringToInitial(s[i].strQP));
419  }
420 
421  d->buildShuangpinTable(correctionProfile);
422 }
423 
424 ShuangpinProfile::ShuangpinProfile(
425  std::istream &in, const PinyinCorrectionProfile *correctionProfile)
426  : d_ptr(std::make_unique<ShuangpinProfilePrivate>()) {
427  FCITX_D();
428  std::string lineBuf;
429  bool isDefault = false;
430  while (std::getline(in, lineBuf)) {
431  auto line = fcitx::stringutils::trimView(lineBuf);
432  if (line.empty() || line.starts_with('#')) {
433  continue;
434  }
435 
436  std::string_view option("方案名称=");
437  if (fcitx::stringutils::consumePrefix(line, option)) {
438  isDefault = (line == "自然码" || line == "微软" || line == "紫光" ||
439  line == "拼音加加" || line == "中文之星" ||
440  line == "智能ABC" || line == "小鹤");
441  continue;
442  }
443 
444  if (isDefault) {
445  continue;
446  }
447 
448  auto tolowerInPlace = [](std::string &s) {
449  std::transform(s.begin(), s.end(), s.begin(),
450  [](char c) { return fcitx::charutils::tolower(c); });
451  };
452 
453  if (line[0] == '=' && line.size() > 1) {
454  d->zeroS_ = std::string(line.substr(1));
455  tolowerInPlace(d->zeroS_);
456  continue;
457  }
458 
459  auto equal = line.find('=');
460  // no '=', or equal at first char, or len(substr after equal) != 1
461  if (equal == std::string_view::npos || equal == 0) {
462  continue;
463  }
464 
465  if (equal + 2 == line.size()) {
466  std::string pinyin{line.substr(0, equal)};
467  auto key = fcitx::charutils::tolower(line[equal + 1]);
468  if (auto final = PinyinEncoder::stringToFinal(pinyin);
469  final != PinyinFinal::Invalid) {
470  d->finalMap_.emplace(key, final);
471  } else if (auto initial = PinyinEncoder::stringToInitial(pinyin);
472  initial != PinyinInitial::Invalid) {
473  d->initialMap_.emplace(key, initial);
474  }
475  } else if (equal + 3 == line.size()) {
476  std::string_view pinyin = line.substr(0, equal);
477  std::string key{line.substr(equal + 1)};
478  tolowerInPlace(key);
479  try {
480  auto result = PinyinEncoder::encodeFullPinyin(pinyin);
481  if (result.size() != 2) {
482  continue;
483  }
484  d->initialFinalMap_.emplace(
485  key, std::make_pair(static_cast<PinyinInitial>(result[0]),
486  static_cast<PinyinFinal>(result[1])));
487  } catch (...) {
488  }
489  }
490  }
491 
492  d->buildShuangpinTable(correctionProfile);
493 }
494 
495 FCITX_DEFINE_DPTR_COPY_AND_DEFAULT_DTOR_AND_MOVE(ShuangpinProfile)
496 
// Deprecated no-op. The definition is retained only to keep the ABI stable;
// the table is built by ShuangpinProfilePrivate::buildShuangpinTable() from
// the constructors instead.
void ShuangpinProfile::buildShuangpinTable() {}
499 
500 const ShuangpinProfile::TableType &ShuangpinProfile::table() const {
501  FCITX_D();
502  return d->spTable_;
503 }
504 
505 const ShuangpinProfile::ValidInputSetType &
506 ShuangpinProfile::validInput() const {
507  FCITX_D();
508  return d->validInputs_;
509 }
510 
511 const ShuangpinProfile::ValidInputSetType &
512 ShuangpinProfile::validInitial() const {
513  FCITX_D();
514  return d->validInitials_;
515 }
516 } // namespace libime
Class that holds updated Pinyin correction mapping based on correction mapping.
static std::vector< char > encodeFullPinyin(std::string_view pinyin)
Encode a quote separated pinyin string.
const std::unordered_map< char, std::vector< char > > & correctionMap() const
Return the correction mapping.