libime
pinyinencoder.h
1 /*
2  * SPDX-FileCopyrightText: 2017-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  */
6 #ifndef _FCITX_LIBIME_PINYIN_PINYINENCODER_H_
7 #define _FCITX_LIBIME_PINYIN_PINYINENCODER_H_
8 
9 #include <cstddef>
10 #include <string>
11 #include <string_view>
12 #include <utility>
13 #include <vector>
14 #include <fcitx-utils/flags.h>
15 #include <fcitx-utils/log.h>
16 #include <fcitx-utils/macros.h>
17 #include <libime/core/segmentgraph.h>
18 #include <libime/pinyin/libimepinyin_export.h>
19 
20 namespace libime {
21 
// Forward declarations: this header only refers to these types through a
// const reference (ShuangpinProfile) or a const pointer
// (PinyinCorrectionProfile), so their full definitions are not needed here.
class ShuangpinProfile;
class PinyinCorrectionProfile;
24 
/**
 * Bit flags selecting which fuzzy / tolerant pinyin matching rules are
 * enabled.
 *
 * Every enumerator other than None occupies its own bit (NG_GN is the only
 * alias), so values can be combined; see the PinyinFuzzyFlags alias built on
 * fcitx::Flags.
 *
 * NOTE(review): the X_Y names appear to list the two spellings that are
 * treated as interchangeable, and the trailing "// n" comments are kept from
 * the original and look like per-group indices - confirm against the
 * implementation.
 */
enum class PinyinFuzzyFlag {
    None = 0,
    CommonTypo = 1 << 0,
    /// Legacy name that shares the value of CommonTypo.
    NG_GN [[deprecated("use PinyinFuzzyFlag::CommonTypo instead")]] =
        CommonTypo,
    V_U = 1 << 1,
    // Pairs of finals.
    AN_ANG = 1 << 2,   // 0
    EN_ENG = 1 << 3,   // 1
    IAN_IANG = 1 << 4, // 2
    IN_ING = 1 << 5,   // 3
    U_OU = 1 << 6,     // 4
    UAN_UANG = 1 << 7, // 5
    // Pairs of initials.
    C_CH = 1 << 8,  // 0
    F_H = 1 << 9,   // 1
    L_N = 1 << 10,  // 2
    S_SH = 1 << 11, // 3
    Z_ZH = 1 << 12, // 4
    VE_UE = 1 << 13,
    Inner = 1 << 14,
    InnerShort = 1 << 15,
    PartialFinal = 1 << 16,
    /**
     * Enable matching partial shuangpin
     *
     * @since 1.0.11
     */
    PartialSp = 1 << 17,
    /**
     * Enable typo that may cause ambiguity.
     *
     * @since 1.1.3
     */
    AdvancedTypo = 1 << 18,
    /**
     * Enable correction based on layout profile.
     *
     * @since 1.1.7
     */
    Correction = 1 << 19,
    /**
     * @since 1.1.11
     */
    L_R = 1 << 20,
    /**
     * Enable matching for lower case single pinyin as English letter.
     */
    Letter = 1 << 21,
};
72 
/// Combinable set of PinyinFuzzyFlag values, backed by fcitx::Flags.
using PinyinFuzzyFlags = fcitx::Flags<PinyinFuzzyFlag>;

/// Log-stream insertion for a set of fuzzy flags; only declared here, the
/// definition lives outside this header. Returns a LogMessageBuilder
/// reference (presumably `log`, so insertions can be chained).
LIBIMEPINYIN_EXPORT fcitx::LogMessageBuilder &
operator<<(fcitx::LogMessageBuilder &log, PinyinFuzzyFlags fuzzy);
77 
/**
 * Initial part of a pinyin syllable, stored in a single char.
 *
 * Invalid is 0. The remaining values start at 'A' (for B) and grow by one
 * per enumerator in declaration order, so the order below fixes the encoded
 * values and must not be changed. PinyinEncoder::firstInitial and
 * PinyinEncoder::lastInitial are defined as B and Zero.
 */
enum class PinyinInitial : char {
    Invalid = '\0',
    // Encoded values begin here at 'A'.
    B = 'A',
    P,
    M,
    F,
    D,
    T,
    N,
    L,
    G,
    K,
    H,
    J,
    Q,
    X,
    ZH,
    CH,
    SH,
    R,
    Z,
    C,
    S,
    Y,
    W,
    // Last enumerator; NOTE(review): presumably "no initial" - confirm.
    Zero,
};
105 
106 inline bool operator<(PinyinInitial l, PinyinInitial r) {
107  return static_cast<char>(l) < static_cast<char>(r);
108 }
109 
110 inline bool operator<=(PinyinInitial l, PinyinInitial r) {
111  return l < r || l == r;
112 }
113 
114 inline bool operator>(PinyinInitial l, PinyinInitial r) { return !(l <= r); }
115 
116 inline bool operator>=(PinyinInitial l, PinyinInitial r) { return !(l < r); }
117 
/// Log-stream insertion for a PinyinInitial; only declared here, the
/// definition lives outside this header. Returns a LogMessageBuilder
/// reference (presumably `log`, so insertions can be chained).
LIBIMEPINYIN_EXPORT fcitx::LogMessageBuilder &
operator<<(fcitx::LogMessageBuilder &log, PinyinInitial initial);
120 
/**
 * Final part of a pinyin syllable, stored in a single char.
 *
 * Invalid is 0. The remaining values start at 'A' (for A) and grow by one
 * per enumerator in declaration order, so the order below fixes the encoded
 * values and must not be changed. PinyinEncoder::firstFinal and
 * PinyinEncoder::lastFinal are defined as A and Zero; the Letter_A ..
 * Letter_Z block follows Zero and is covered by PinyinEncoder::firstLetter
 * and PinyinEncoder::lastLetter instead.
 */
enum class PinyinFinal : char {
    Invalid = '\0',
    // Encoded values begin here at 'A'.
    A = 'A',
    AI,
    AN,
    ANG,
    AO,
    E,
    EI,
    EN,
    ENG,
    ER,
    O,
    ONG,
    OU,
    I,
    IA,
    IE,
    IAO,
    IU,
    IAN,
    IN,
    IANG,
    ING,
    IONG,
    U,
    UA,
    UO,
    UAI,
    UI,
    UAN,
    UN,
    UANG,
    V,
    VE,
    UE,
    NG,
    // Last value inside the firstFinal..lastFinal range.
    Zero,
    // One enumerator per letter, contiguous from Letter_A to Letter_Z.
    Letter_A,
    Letter_B,
    Letter_C,
    Letter_D,
    Letter_E,
    Letter_F,
    Letter_G,
    Letter_H,
    Letter_I,
    Letter_J,
    Letter_K,
    Letter_L,
    Letter_M,
    Letter_N,
    Letter_O,
    Letter_P,
    Letter_Q,
    Letter_R,
    Letter_S,
    Letter_T,
    Letter_U,
    Letter_V,
    Letter_W,
    Letter_X,
    Letter_Y,
    Letter_Z,
};
186 
187 inline bool operator<(PinyinFinal l, PinyinFinal r) {
188  return static_cast<char>(l) < static_cast<char>(r);
189 }
190 
191 inline bool operator<=(PinyinFinal l, PinyinFinal r) { return l < r || l == r; }
192 
193 inline bool operator>(PinyinFinal l, PinyinFinal r) { return !(l <= r); }
194 
195 inline bool operator>=(PinyinFinal l, PinyinFinal r) { return !(l < r); }
196 
/// Log-stream insertion for a PinyinFinal; only declared here, the
/// definition lives outside this header. Returns a LogMessageBuilder
/// reference (presumably `log`, so insertions can be chained).
LIBIMEPINYIN_EXPORT fcitx::LogMessageBuilder &
operator<<(fcitx::LogMessageBuilder &log, PinyinFinal final);
199 
200 struct LIBIMEPINYIN_EXPORT PinyinSyllable {
201 public:
202  PinyinSyllable(PinyinInitial initial, PinyinFinal final)
203  : initial_(initial), final_(final) {}
204  FCITX_INLINE_DEFINE_DEFAULT_DTOR_AND_COPY(PinyinSyllable)
205 
206  PinyinInitial initial() const { return initial_; }
207  PinyinFinal final() const { return final_; }
208 
209  std::string toString() const;
210 
211  bool operator==(const PinyinSyllable &other) const {
212  return initial_ == other.initial_ && final_ == other.final_;
213  }
214 
215  bool operator!=(const PinyinSyllable &other) const {
216  return !(*this == other);
217  }
218  bool operator<(const PinyinSyllable &other) const {
219  return std::make_pair(initial_, final_) <
220  std::make_pair(other.initial_, other.final_);
221  }
222  bool operator<=(const PinyinSyllable &other) const {
223  return *this < other || *this == other;
224  }
225  bool operator>(const PinyinSyllable &other) const {
226  return !(*this <= other);
227  }
228  bool operator>=(const PinyinSyllable &other) const {
229  return !(*this < other);
230  }
231 
232 private:
233  PinyinInitial initial_;
234  PinyinFinal final_;
235 };
236 
/// Log-stream insertion for a PinyinSyllable (taken by value); only declared
/// here, the definition lives outside this header. Returns a
/// LogMessageBuilder reference (presumably `log`, so insertions can be
/// chained).
LIBIMEPINYIN_EXPORT fcitx::LogMessageBuilder &
operator<<(fcitx::LogMessageBuilder &log, PinyinSyllable syl);
239 
/**
 * List of syllables grouped by initial: each entry pairs one PinyinInitial
 * with a list of PinyinFinal values, and every final carries a FuzzyValue
 * payload.
 */
template <typename FuzzyValue>
using FuzzyPinyinSyllables = std::vector<
    std::pair<PinyinInitial, std::vector<std::pair<PinyinFinal, FuzzyValue>>>>;

/// Variant whose per-final payload is a bool.
/// NOTE(review): presumably "matched through a fuzzy rule" - confirm.
using MatchedPinyinSyllables = FuzzyPinyinSyllables<bool>;

/// Variant whose per-final payload is a PinyinFuzzyFlags set.
using MatchedPinyinSyllablesWithFuzzyFlags =
    FuzzyPinyinSyllables<PinyinFuzzyFlags>;
248 
249 class LIBIMEPINYIN_EXPORT PinyinEncoder {
250 public:
251  static SegmentGraph parseUserPinyin(std::string pinyin,
252  PinyinFuzzyFlags flags);
253  static SegmentGraph parseUserPinyin(std::string pinyin,
254  const PinyinCorrectionProfile *profile,
255  PinyinFuzzyFlags flags);
256 
257  static SegmentGraph parseUserShuangpin(std::string pinyin,
258  const ShuangpinProfile &sp,
259  PinyinFuzzyFlags flags);
260 
261  /**
262  * @brief Encode a quote separated pinyin string.
263  *
264  * @param pinyin pinyin string, like ni'hao
265  * @return encoded pinyin.
266  */
267  static std::vector<char> encodeFullPinyin(std::string_view pinyin);
268  /**
269  * @brief Encode a quote separated pinyin string.
270  *
271  * @param pinyin pinyin string, like ni'hao
272  * @param flags fuzzy flags that is acceptable
273  * @return encoded pinyin.
274  *
275  * @since 1.0.17
276  */
277  static std::vector<char> encodeFullPinyinWithFlags(std::string_view pinyin,
278  PinyinFuzzyFlags flags);
279  static std::vector<char> encodeOneUserPinyin(std::string pinyin);
280 
281  static std::string shuangpinToPinyin(std::string_view pinyin,
282  const ShuangpinProfile &sp);
283 
284  static bool isValidUserPinyin(const char *data, size_t size);
285 
286  static bool isValidUserPinyin(const std::vector<char> &v) {
287  return isValidUserPinyin(v.data(), v.size());
288  }
289 
290  static std::string decodeFullPinyin(const std::vector<char> &v) {
291  return decodeFullPinyin(v.data(), v.size());
292  }
293  static std::string decodeFullPinyin(std::string_view s) {
294  return decodeFullPinyin(s.data(), s.size());
295  }
296  static std::string decodeFullPinyin(const char *data, size_t size);
297 
298  static const std::string &initialToString(PinyinInitial initial);
299  static PinyinInitial stringToInitial(const std::string &str);
300  static bool isValidInitial(char c) {
301  return c >= firstInitial && c <= lastInitial;
302  }
303 
304  static const std::string &finalToString(PinyinFinal final);
305  static PinyinFinal stringToFinal(const std::string &str);
306  static bool isValidFinal(char c) {
307  return c >= firstFinal && c <= lastFinal;
308  }
309 
310  static bool isValidInitialFinal(PinyinInitial initial, PinyinFinal final);
311 
312  /**
313  * Check if the final is a letter.
314  * @since 1.1.14
315  */
316  static bool isFinalLetter(PinyinFinal final);
317 
318  static PinyinFinal letterToFinal(char c);
319 
320  // This will use "ΓΌ" when possible.
321  static std::string initialFinalToPinyinString(PinyinInitial initial,
322  PinyinFinal final);
323 
324  static MatchedPinyinSyllables stringToSyllables(std::string_view pinyin,
325  PinyinFuzzyFlags flags);
326 
327  static MatchedPinyinSyllablesWithFuzzyFlags
328  stringToSyllablesWithFuzzyFlags(std::string_view pinyin,
329  const PinyinCorrectionProfile *profile,
330  PinyinFuzzyFlags flags);
331 
332  static MatchedPinyinSyllables
333  shuangpinToSyllables(std::string_view pinyin, const ShuangpinProfile &sp,
334  PinyinFuzzyFlags flags);
335  static MatchedPinyinSyllablesWithFuzzyFlags
336  shuangpinToSyllablesWithFuzzyFlags(std::string_view pinyin,
337  const ShuangpinProfile &sp,
338  PinyinFuzzyFlags flags);
339 
340  static constexpr char firstInitial = static_cast<char>(PinyinInitial::B);
341  static constexpr char lastInitial = static_cast<char>(PinyinInitial::Zero);
342  static constexpr char firstFinal = static_cast<char>(PinyinFinal::A);
343  static constexpr char lastFinal = static_cast<char>(PinyinFinal::Zero);
344  static constexpr char firstLetter =
345  static_cast<char>(PinyinFinal::Letter_A);
346  static constexpr char lastLetter = static_cast<char>(PinyinFinal::Letter_Z);
347 };
348 } // namespace libime
349 
350 #endif // _FCITX_LIBIME_PINYIN_PINYINENCODER_H_
Class that holds the Pinyin mapping updated according to a correction mapping.