Fcitx
stringutils.cpp
1 /*
2  * SPDX-FileCopyrightText: 2015-2017 CSSlayer <wengxt@gmail.com>
3  *
4  * SPDX-License-Identifier: LGPL-2.1-or-later
5  *
6  */
7 #include "stringutils.h"
8 #include <array>
9 #include <cassert>
10 #include <climits>
11 #include <cstdint>
12 #include <cstring>
13 #include <initializer_list>
14 #include <limits>
15 #include <optional>
16 #include <string>
17 #include <string_view>
18 #include <utility>
19 #include <vector>
20 #include <fcitx-utils/fcitxutils_export.h>
21 #include "charutils.h"
22 #include "macros.h"
23 
24 namespace fcitx::stringutils {
25 namespace details {
26 
// Concatenate every (pointer, length) piece in list into one string.
// The total length is computed up front so the result buffer is reserved once.
std::string
concatPieces(std::initializer_list<std::pair<const char *, std::size_t>> list) {
    std::size_t total = 0;
    for (const auto &piece : list) {
        total += piece.second;
    }

    std::string joined;
    joined.reserve(total);
    for (const auto &piece : list) {
        const char *data = piece.first;
        const std::size_t length = piece.second;
        joined.append(data, length);
    }
    assert(joined.size() == total);
    return joined;
}
41 
// Join (pointer, length) pieces into a path, inserting a single '/' between
// neighbouring pieces. If the first piece already ends with '/', no separator
// is inserted between the first and the second piece.
//
// Returns an empty string for an empty list. An empty first piece is treated
// as "does not end with '/'" instead of reading the byte in front of it.
std::string concatPathPieces(
    std::initializer_list<std::pair<const char *, std::size_t>> list) {
    if (list.size() == 0) {
        return {};
    }

    const auto &head = *list.begin();
    // Guard the length first: indexing [size - 1] on a zero-length piece would
    // wrap around and read outside of the buffer.
    const bool firstPieceEndsWithSlash =
        head.second > 0 && head.first[head.second - 1] == '/';

    // One separator per gap between pieces ...
    std::size_t size = list.size() - 1;
    for (const auto &pair : list) {
        size += pair.second;
    }
    // ... except for the gap after a first piece that already ends with '/'.
    if (list.size() > 1 && firstPieceEndsWithSlash) {
        size -= 1;
    }

    std::string result;
    result.reserve(size);
    bool first = true;
    bool skipSeparator = firstPieceEndsWithSlash;
    for (const auto &pair : list) {
        if (first) {
            first = false;
        } else if (skipSeparator) {
            // Only the gap right after the first piece is skipped.
            skipSeparator = false;
        } else {
            result += '/';
        }

        result.append(pair.first, pair.second);
    }
    assert(result.size() == size);
    return result;
}
82 } // namespace details
83 
namespace {

// Number of distinct byte values. The lookup tables below are indexed by a
// byte (0..255), so they need max() + 1 entries; with only max() entries the
// byte 0xFF would index one past the end.
constexpr std::size_t escapeTableSize =
    static_cast<std::size_t>(std::numeric_limits<uint8_t>::max()) + 1;

// Maps a raw byte to the character that follows the backslash in its escaped
// form, or '\0' if the byte does not need escaping.
// The table is value-initialized (all '\0') and, being the initializer of a
// constexpr variable, is always built at compile time.
constexpr std::array<char, escapeTableSize> escapeMap = [] {
    std::array<char, escapeTableSize> table{};
    table['\\'] = '\\';
    table['"'] = '"';
    table['\n'] = 'n';
    table['\f'] = 'f';
    table['\r'] = 'r';
    table['\t'] = 't';
    table['\v'] = 'v';
    return table;
}();

// Inverse of escapeMap: maps the character following a backslash to the byte
// it stands for, or '\0' if the escape sequence is not recognized.
constexpr std::array<char, escapeTableSize> unescapeMap = [] {
    std::array<char, escapeTableSize> table{};
    table['\\'] = '\\';
    table['"'] = '"';
    table['n'] = '\n';
    table['f'] = '\f';
    table['r'] = '\r';
    table['t'] = '\t';
    table['v'] = '\v';
    return table;
}();

} // namespace
115 
116 FCITXUTILS_DEPRECATED_EXPORT bool startsWith(const std::string &str,
117  const std::string &prefix) {
118  return str.starts_with(prefix);
119 }
120 
// Check whether str begins with prefix. An empty prefix always matches.
bool startsWith(std::string_view str, std::string_view prefix) {
    if (str.size() < prefix.size()) {
        return false;
    }
    return str.compare(0, prefix.size(), prefix) == 0;
}
124 
125 FCITXUTILS_DEPRECATED_EXPORT bool endsWith(const std::string &str,
126  const std::string &suffix) {
127  return str.ends_with(suffix);
128 }
129 
// Check whether str ends with suffix. An empty suffix always matches.
bool endsWith(std::string_view str, std::string_view suffix) {
    if (str.size() < suffix.size()) {
        return false;
    }
    const auto tailStart = str.size() - suffix.size();
    return str.compare(tailStart, suffix.size(), suffix) == 0;
}
133 
134 inline std::pair<std::string::size_type, std::string::size_type>
135 trimInplaceImpl(std::string_view str) {
136  auto start = str.find_first_not_of(FCITX_WHITESPACE);
137  if (start == std::string::npos) {
138  return {str.size(), str.size()};
139  }
140 
141  auto end = str.size();
142  while (end > start && charutils::isspace(str[end - 1])) {
143  --end;
144  }
145 
146  return {start, end};
147 }
148 
149 FCITXUTILS_DEPRECATED_EXPORT
150 std::pair<std::string::size_type, std::string::size_type>
151 trimInplace(const std::string &str) {
152  return trimInplaceImpl(str);
153 }
154 
155 std::pair<std::string::size_type, std::string::size_type>
156 trimInplace(std::string_view str) {
157  return trimInplaceImpl(str);
158 }
159 
160 FCITXUTILS_DEPRECATED_EXPORT
161 std::string trim(const std::string &str) { return trim(std::string_view(str)); }
162 
163 std::string trim(std::string_view str) {
164  auto pair = trimInplaceImpl(str);
165  return {str.begin() + pair.first, str.begin() + pair.second};
166 }
167 
168 std::string_view trimView(std::string_view str) {
169  auto pair = trimInplace(str);
170  return str.substr(pair.first, pair.second - pair.first);
171 }
172 
173 FCITXUTILS_DEPRECATED_EXPORT
174 std::vector<std::string> split(const std::string &str, const std::string &delim,
175  SplitBehavior behavior) {
176  return split(std::string_view(str), std::string_view(delim), behavior);
177 }
178 
179 std::vector<std::string> split(std::string_view str, std::string_view delim,
180  SplitBehavior behavior) {
181  std::vector<std::string> strings;
182  std::string::size_type lastPos;
183  std::string::size_type pos;
184  if (behavior == SplitBehavior::SkipEmpty) {
185  lastPos = str.find_first_not_of(delim, 0);
186  } else {
187  lastPos = 0;
188  }
189  pos = str.find_first_of(delim, lastPos);
190 
191  while (std::string::npos != pos || std::string::npos != lastPos) {
192  strings.push_back(std::string(str.substr(lastPos, pos - lastPos)));
193  if (behavior == SplitBehavior::SkipEmpty) {
194  lastPos = str.find_first_not_of(delim, pos);
195  } else {
196  if (pos == std::string::npos) {
197  break;
198  }
199  lastPos = pos + 1;
200  }
201  pos = str.find_first_of(delim, lastPos);
202  }
203 
204  return strings;
205 }
206 
207 FCITXUTILS_DEPRECATED_EXPORT std::vector<std::string>
208 split(const std::string &str, const std::string &delim) {
209  return split(std::string_view(str), std::string_view(delim));
210 }
211 
212 std::vector<std::string> split(std::string_view str, std::string_view delim) {
213  return split(str, delim, SplitBehavior::SkipEmpty);
214 }
215 
// Replace every non-overlapping occurrence of `before` in `str` with `after`.
//
// Occurrences are searched left to right; the search resumes right after each
// match, so replaced text is never re-scanned. If `before` is empty or does
// not occur, `str` is returned unchanged.
//
// The result may legitimately be empty (e.g. replaceAll("aa", "a", "")), so
// "nothing was replaced" is detected from the first search rather than from
// the length of the result.
std::string replaceAll(std::string str, const std::string &before,
                       const std::string &after) {
    if (before.empty()) {
        return str;
    }

    std::string::size_type matchPos = str.find(before);
    if (matchPos == std::string::npos) {
        return str;
    }

    std::string newString;
    // Exact when the sizes match, and a reasonable starting point otherwise.
    newString.reserve(str.size());

    std::string::size_type copyFrom = 0;
    do {
        // Untouched text between the previous match and this one.
        newString.append(str, copyFrom, matchPos - copyFrom);
        newString.append(after);
        copyFrom = matchPos + before.size();
        matchPos = str.find(before, copyFrom);
    } while (matchPos != std::string::npos);

    // Remaining tail after the last match.
    newString.append(str, copyFrom, std::string::npos);
    return newString;
}
294 
// Search backwards for `needle` (length `ol`) inside `haystack` (length `l`).
//
// The search starts at offset `from` (clamped to the last offset at which the
// needle still fits) and moves towards offset 0. Returns a pointer to the
// first match found that way, or nullptr if there is none, if the needle is
// longer than the haystack, or if `from` is larger than `l`.
// An empty needle matches at the (clamped) start offset.
//
// A rolling hash over the current window is used as a cheap pre-filter;
// std::memcmp confirms every candidate. The hash is computed over unsigned
// byte values in unsigned arithmetic and the window is tracked by index, so
// no pointer is moved in front of the buffer and no signed value is shifted.
const char *backwardSearch(const char *haystack, size_t l, const char *needle,
                           size_t ol, size_t from) {
    if (ol > l || from > l) {
        return nullptr;
    }
    // Last offset at which the needle fits entirely.
    const size_t delta = l - ol;
    if (from > delta) {
        from = delta;
    }
    if (ol == 0) {
        // Nothing to compare; avoids touching haystack[l] and needle[-1].
        return haystack + from;
    }

    constexpr size_t hashBits = sizeof(unsigned int) * CHAR_BIT;
    // Shift applied to the last byte of the window inside the hash.
    const size_t topShift = ol - 1;
    const auto byteAt = [](const char *p) {
        return static_cast<unsigned int>(static_cast<unsigned char>(*p));
    };

    // hash(window) = sum(window[k] << k), modulo 2^hashBits.
    unsigned int hashNeedle = 0;
    unsigned int hashHaystack = 0;
    for (size_t idx = ol; idx-- > 0;) {
        hashNeedle = (hashNeedle << 1) + byteAt(needle + idx);
        hashHaystack = (hashHaystack << 1) + byteAt(haystack + from + idx);
    }

    for (size_t pos = from;; --pos) {
        if (hashHaystack == hashNeedle &&
            std::memcmp(needle, haystack + pos, ol) == 0) {
            return haystack + pos;
        }
        if (pos == 0) {
            break;
        }
        // Slide the window one byte to the left: drop the contribution of the
        // last byte (already shifted out of the hash when topShift is too
        // large), scale, then add the new first byte.
        if (topShift < hashBits) {
            hashHaystack -= byteAt(haystack + pos + topShift) << topShift;
        }
        hashHaystack <<= 1;
        hashHaystack += byteAt(haystack + pos - 1);
    }
    return nullptr;
}
336 
337 char *backwardSearch(char *haystack, size_t l, const char *needle, size_t ol,
338  size_t from) {
339  return const_cast<char *>(backwardSearch(
340  static_cast<const char *>(haystack), l, needle, ol, from));
341 }
342 
343 size_t backwardSearch(const std::string &haystack, const std::string &needle,
344  size_t from) {
345  const auto *cstr = haystack.c_str();
346  const auto *result = backwardSearch(cstr, haystack.size(), needle.c_str(),
347  needle.size(), from);
348  if (result) {
349  return result - cstr;
350  }
351  return std::string::npos;
352 }
353 
354 enum class UnescapeState { NORMAL, ESCAPE };
355 
356 bool unescape(std::string &str, bool unescapeQuote) {
357  if (str.empty()) {
358  return true;
359  }
360 
361  size_t i = 0;
362  size_t j = 0;
363  UnescapeState state = UnescapeState::NORMAL;
364  do {
365  switch (state) {
366  case UnescapeState::NORMAL:
367  if (str[i] == '\\') {
368  state = UnescapeState::ESCAPE;
369  } else {
370  str[j] = str[i];
371  j++;
372  }
373  break;
374  case UnescapeState::ESCAPE:
375  if (auto c = unescapeMap[str[i]];
376  c && (unescapeQuote || c != '"')) {
377  str[j] = c;
378  j++;
379  state = UnescapeState::NORMAL;
380  break;
381  }
382  // invalid escape sequence
383  return false;
384  }
385  } while (str[i++]);
386  str.resize(j - 1);
387  return true;
388 }
389 
390 std::optional<std::string> unescapeForValue(std::string_view str) {
391  // having quote at beginning and end, escape
392  if (str.size() >= 2 && str.front() == '"' && str.back() == '"') {
393  std::string result;
394  auto originLength = str.size();
395  auto consumed = consumeMaybeEscapedValue(str, "", &result);
396  if (consumed.size() == originLength) {
397  return result;
398  }
399  return std::nullopt;
400  }
401  return std::string{str};
402 }
403 
404 std::string escapeForValue(std::string_view str) {
405  std::string value;
406  value.reserve(str.size());
407  const bool needEscape =
408  str.find_first_of("\f\r\t\v \"\\\n") != std::string::npos;
409  if (needEscape) {
410  value.push_back('"');
411  }
412  for (char c : str) {
413  if (auto escape = escapeMap[static_cast<uint8_t>(c)]) {
414  value.push_back('\\');
415  value.push_back(escape);
416  } else {
417  value.push_back(c);
418  }
419  }
420  if (needEscape) {
421  value.push_back('"');
422  }
423 
424  return value;
425 }
426 
// If str begins with prefix, strip the prefix from str and return true.
// Otherwise leave str untouched and return false.
bool consumePrefix(std::string_view &str, std::string_view prefix) {
    // substr() clamps to the length of str, so a shorter str never matches.
    if (str.substr(0, prefix.size()) != prefix) {
        return false;
    }
    str.remove_prefix(prefix.size());
    return true;
}
434 
435 std::string_view consumeMaybeEscapedValue(std::string_view &input,
436  std::string_view skip,
437  std::string *output) {
438  auto start = input.find_first_not_of(skip);
439  if (start == std::string_view::npos) {
440  input = std::string_view();
441  if (output) {
442  output->clear();
443  }
444  return {};
445  }
446 
447  input = input.substr(start);
448  assert(!input.empty());
449  const bool maybeQuoted = input.front() == '"';
450  if (maybeQuoted) {
451  std::string result;
452  UnescapeState state = UnescapeState::NORMAL;
453  size_t end = 0;
454  for (size_t i = 1; i < input.size(); i++) {
455  const char c = input[i];
456  switch (state) {
457  case UnescapeState::NORMAL:
458  if (c == '\\') {
459  state = UnescapeState::ESCAPE;
460  } else if (c == '"') {
461  end = i + 1;
462  break;
463  } else {
464  result.push_back(c);
465  }
466  break;
467  case UnescapeState::ESCAPE:
468  if (auto c = unescapeMap[input[i]]) {
469  result.push_back(c);
470  } else {
471  // invalid escape sequence
472  // and treat it as normal character.
473  result.push_back(input[i]);
474  }
475  state = UnescapeState::NORMAL;
476  break;
477  }
478  if (end) {
479  break;
480  }
481  }
482  if (end > 0) {
483  auto consumed = input.substr(0, end);
484  input = input.substr(end);
485  if (output) {
486  *output = std::move(result);
487  }
488  return consumed;
489  }
490  }
491  auto end = input.find_first_of(skip, 1);
492  auto consumed = input.substr(0, end);
493  input =
494  end == std::string_view::npos ? std::string_view() : input.substr(end);
495  if (output) {
496  *output = std::string(consumed);
497  }
498  return consumed;
499 }
500 
501 } // namespace fcitx::stringutils
std::vector< std::string > split(std::string_view str, std::string_view delim)
Split the string by delim.
std::optional< std::string > unescapeForValue(std::string_view str)
unescape a string if it is quoted, otherwise return the original string.
bool unescape(std::string &str, bool unescapeQuote)
Unescape, in place, a string that contains backslashes, newlines and, optionally, quotes.
bool consumePrefix(std::string_view &str, std::string_view prefix)
Remove the given prefix from str and return true if str starts with it; otherwise return false.
size_t length(Iter start, Iter end)
Return the number of UTF-8 characters in the string iterator range.
Definition: utf8.h:33
std::pair< std::string::size_type, std::string::size_type > trimInplace(std::string_view str)
Trim the whitespace by returning the start and end positions of the first and last non-whitespace characters...
std::string_view consumeMaybeEscapedValue(std::string_view &input, std::string_view skip, std::string *output)
Consume a value that is potentially quoted and escaped.
std::string_view trimView(std::string_view str)
Trim the white space in string view.
bool endsWith(std::string_view str, std::string_view suffix)
Check if a string ends with a suffix.
std::string replaceAll(std::string str, const std::string &before, const std::string &after)
Replace all occurrences of the substring before with after.
bool startsWith(std::string_view str, std::string_view prefix)
Check if a string starts with a prefix.
String handle utilities.
std::string escapeForValue(std::string_view str)
escape a string if str contains certain characters.
size_t backwardSearch(const std::string &haystack, const std::string &needle, size_t from)
Fast backward substring search.
std::string trim(std::string_view str)
Trim the white space in str.
Locale-independent API to detect character type.