Fleet  0.0.9
Inference in the LOT
Strings.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <sstream>
4 #include <atomic>
5 #include <string>
6 
7 #include "Miscellaneous.h"
8 #include "Numerics.h"
9 #include "Random.h"
10 #include "Vector3D.h"
11 
12 // This holds our conversions to and from strings
13 #include "str.h"
14 
15 // This constant is occasionally useful, especially in grammar inference where we might
16 // want a reference to an empty input
17 const std::string EMPTY_STRING = "";
18 
19 const std::string LAMBDA_STRING = "\u03BB";
20 const std::string LAMBDAXDOT_STRING = LAMBDA_STRING+"x.";
21 
22 template <typename T>
23 std::string to_string_with_precision(const T a_value, const int n = 14) {
24  // https://stackoverflow.com/questions/16605967/set-precision-of-stdto-string-when-converting-floating-point-values
25  std::ostringstream out;
26  out.precision(n);
27  out << std::fixed << a_value;
28  return out.str();
29 }
30 
37 /* If x is a prefix of y -- works for strings and vectors */
38 template<typename T>
39 bool is_prefix(const T& prefix, const T& x) {
47  if(prefix.size() > x.size()) return false;
48  if(prefix.size() == 0) return true;
49 
50  return std::equal(prefix.begin(), prefix.end(), x.begin());
51 }
52 
53 bool contains(const std::string& s, const std::string& x) {
54  return s.find(x) != std::string::npos;
55 }
56 
57 bool contains(const std::string& s, const char x) {
58  return s.find(x) != std::string::npos;
59 }
60 
68 void replace_all(std::string& s, const std::string& x, const std::string& y, int add=0) {
69  auto pos = s.find(x);
70  while(pos != std::string::npos) {
71  s.replace(pos,x.length()+add,y);
72  pos = s.find(x);
73  }
74 }
75 
76 
82 std::string remove_characters(std::string s, const std::string& rem) {
83  // to optimize you know we could load rem into a map
84  for(auto it=s.begin(); it != s.end();) {
85  if(contains(rem, *it)) {
86  it = s.erase(it);
87  }
88  else {
89  ++it;
90  }
91  }
92  return s;
93 }
94 
95 
106  template<const float& add_p, const float& del_p, typename T=std::string>
107 inline double p_delete_append(const T& x, const T& y, const float log_alphabet) {
118  // all of these get precomputed at compile time
119  static const float log_add_p = log(add_p);
120  static const float log_del_p = log(del_p);
121  static const float log_1madd_p = log(1.0-add_p);
122  static const float log_1mdel_p = log(1.0-del_p);
123 
124 
125  // Well we can always delete the whole thing and add on the remainder
126  float lp = log_del_p*x.size() + // we don't add log_1mdel_p again here since we can't delete past the beginning
127  (log_add_p-log_alphabet)*y.size() + log_1madd_p;
128 
129  // now as long as they are equal, we can take only down that far if we want
130  // here we index over mi, the length of the string that so far is equal
131  for(size_t mi=1;mi<=std::min(x.size(),y.size());mi++){
132  if(x.at(mi-1) == y.at(mi-1)) {
133  lp = logplusexp(lp, log_del_p*(x.size()-mi) + log_1mdel_p +
134  (log_add_p-log_alphabet)*(y.size()-mi) + log_1madd_p);
135  }
136  else {
137  break;
138  }
139  }
140 
141  return lp;
142 }
143 
144 std::pair<std::string, std::string> divide(const std::string& s, const char delimiter) {
145  // divide this string into two pieces at the first occurance of delimiter
146  auto k = s.find(delimiter);
147  assert(k != std::string::npos && "*** Cannot divide a string without delimiter");
148  return std::make_pair(s.substr(0,k), s.substr(k+1));
149 }
150 
151 
152 
153 size_t count(const std::string& str, const std::string& sub) {
161  // https://www.rosettacode.org/wiki/Count_occurrences_of_a_substring#C.2B.2B
162 
163  if (sub.length() == 0) return 0;
164 
165  size_t count = 0;
166  for (size_t offset = str.find(sub);
167  offset != std::string::npos;
168  offset = str.find(sub, offset + sub.length())) {
169  ++count;
170  }
171  return count;
172 }
173 
174 
175 size_t count(const std::string& str, const char x) {
176  size_t count = 0;
177  for(auto& a : str) {
178  if(a == x)
179  count++;
180  }
181  return count;
182 }
183 
184 
185 std::string reverse(std::string x) {
186  return std::string(x.rbegin(),x.rend());
187 }
188 
189 
190 std::string QQ(const std::string& x) {
197  return std::string("\"") + x + std::string("\"");
198 }
199 std::string Q(const std::string& x) {
206  return std::string("\'") + x + std::string("\'");
207 }
208 
209 
215 void check_alphabet(const std::string& s, const std::string& a) {
216  for(char c: s) {
217  if(a.find(c) == std::string::npos) {
218  std::cerr << "*** Character '" << c << "' in " << s << " not in alphabet=" << a << std::endl;
219  assert(false);
220  }
221  }
222 }
223 
224 void check_alphabet(std::vector<std::string> t, const std::string& a){
225  for(auto& x : t) {
226  check_alphabet(x,a);
227  }
228 }
229 
236 unsigned int levenshtein_distance(const std::string& s1, const std::string& s2) {
237 
238  // From https://en.wikibooks.org/wiki/Algorithm_Implementation/Strings/Levenshtein_distance#C++
239 
240  const std::size_t len1 = s1.size(), len2 = s2.size();
241  std::vector<std::vector<unsigned int>> d(len1 + 1, std::vector<unsigned int>(len2 + 1));
242 
243  d[0][0] = 0;
244  for(unsigned int i = 1; i <= len1; ++i) d[i][0] = i;
245  for(unsigned int i = 1; i <= len2; ++i) d[0][i] = i;
246 
247  for(unsigned int i = 1; i <= len1; ++i)
248  for(unsigned int j = 1; j <= len2; ++j)
249  d[i][j] = std::min(d[i - 1][j] + 1, std::min(d[i][j - 1] + 1, d[i - 1][j - 1] + (s1[i - 1] == s2[j - 1] ? 0 : 1) ));
250 
251  return d[len1][len2];
252 }
253 
254 
269 double p_KashyapOommen1984_edit(const std::string x, const std::string y, const double perr, const size_t nalphabet) {
270  // From Kashyap & Oommen, 1984
271  // perr gets used here to define the probability of each of these operations
272  // BUT Note here that we use the logs of these
273 
274  const int m = x.length(); // these must be ints or some negatives below go bad
275  const int n = y.length();
276  const int T = std::max(m,n); // max insertions allowed
277  const int E = std::max(m,n); // max deletions allowed
278  const int R = std::min(m,n);
279 
280  const double lp_insert = -log(nalphabet); // P(y_t|lambda) - probability of generating a single character from nothing
281  const double lp_delete = log(perr); // P(phi|d_x) -- probability of deleting
282 
283  Vector3D<double> mytable(T+1,E+1,T+1);
284 
285  // set to 1 in log space
286  mytable(0,0,0) = 0.0;
287 
288  // indexes into y and x, and returns the swap probability
289  // when they are different. Also corrects so i,j are 0-indexed
290  // instead of 1-indexed, as they are below.
291  // Note that when we are correct, we sum up both ways of getting the answer
292  const auto Sab = [&](int i, int j) -> double {
293  return log(perr / nalphabet + (1.0-perr)*(y[i-1]==x[j-1] ? 1 : 0));
294  };
295 
296  for(int t=1;t<=T;t++)
297  mytable(t,0,0) = mytable(t-1,0,0) + lp_insert;
298 
299  for(int d=1;d<=E;d++)
300  mytable(0,d,0) = mytable(0,d-1,0) + lp_delete;
301 
302  for(int s=1;s<=R;s++)
303  mytable(0,0,s) = mytable(0,0,s-1) + Sab(s-1,s-1);
304 
305  for(int t=1;t<=T;t++)
306  for(int d=1;d<=E;d++)
307  mytable(t,d,0) = logplusexp(mytable(t-1,d,0)+lp_insert,
308  mytable(t,d-1,0)+lp_delete);
309 
310  for(int t=1;t<=T;t++)
311  for(int s=1;s<=n-t;s++)
312  mytable(t,0,s) = logplusexp(mytable(t-1,0,s)+lp_insert,
313  mytable(t,0,s-1)+Sab(s+t-1,s-1));
314 
315  for(int d=1;d<=E;d++)
316  for(int s=1;s<=m-d;s++)
317  mytable(0,d,s) = logplusexp(mytable(0,d-1,s)+lp_delete,
318  mytable(0,d,s-1)+Sab(s-1,s+d-1));
319 
320  for(int t=1;t<=T;t++)
321  for(int d=1;d<=E;d++)
322  for(int s=1;s<=std::min(n-t,m-d);s++)
323  mytable(t,d,s) = logplusexp(mytable(t-1,d,s)+lp_insert,
324  logplusexp(mytable(t,d-1,s)+lp_delete,
325  mytable(t,d,s-1)+Sab(t+s-1,d+s-1))); // shoot me in the face
326 
327  // now we sum up
328  double lp_yGx = -infinity; // p(Y|X)
329  for(int t=std::max(0,n-m); t<=std::min(T,E+n-m);t++){
330  double qt = geometric_lpdf(t+1,1.0-perr); // probability of t insertions -- but must use t+1 to allow t=0 (geometric is defined on 1,2,3,...)
331  lp_yGx = logplusexp(lp_yGx, mytable(t,m-n+t,n-t) + qt + lfactorial(m)+lfactorial(t)-lfactorial(m+t));
332  }
333  return lp_yGx;
334 
335 }
336 
std::string QQ(const std::string &x)
Definition: Strings.h:190
std::pair< std::string, std::string > divide(const std::string &s, const char delimiter)
Definition: Strings.h:144
const std::string LAMBDA_STRING
Definition: Strings.h:19
void check_alphabet(const std::string &s, const std::string &a)
Check that s only uses characters from a. On failure, we print the string and assert false...
Definition: Strings.h:215
double p_KashyapOommen1984_edit(const std::string x, const std::string y, const double perr, const size_t nalphabet)
The string probability model from Kashyap & Oommen, 1983, basically giving a string edit distance tha...
Definition: Strings.h:269
size_t count(const std::string &str, const std::string &sub)
Definition: Strings.h:153
std::string reverse(std::string x)
Definition: Strings.h:185
double geometric_lpdf(size_t k, double p)
Definition: Random.h:144
std::string to_string_with_precision(const T a_value, const int n=14)
Definition: Strings.h:23
void replace_all(std::string &s, const std::string &x, const std::string &y, int add=0)
Replace all occurances of x with y in s.
Definition: Strings.h:68
std::string remove_characters(std::string s, const std::string &rem)
Remove all characters in rem from s. TODO: Not very optimized here yeah.
Definition: Strings.h:82
std::string str(BindingTree *t)
Definition: BindingTree.h:195
unsigned int levenshtein_distance(const std::string &s1, const std::string &s2)
Compute levenshtein distiance between two strings (NOTE: Or O(N^2))
Definition: Strings.h:236
constexpr double infinity
Definition: Numerics.h:20
T logplusexp(const T a, const T b)
Definition: Numerics.h:131
std::string Q(const std::string &x)
Definition: Strings.h:199
Like Vector2D, but one dimension bigger. TODO: Replace this and Vector2D with a template please...
Definition: Vector3D.h:13
double p_delete_append(const T &x, const T &y, const float log_alphabet)
Probability of converting x into y by deleting some number (each with del_p, then stopping with prob ...
Definition: Strings.h:107
double lfactorial(double x)
Definition: Numerics.h:230
const std::string LAMBDAXDOT_STRING
Definition: Strings.h:20
bool is_prefix(const T &prefix, const T &x)
Check if prefix is a prefix of x – works with iterables, including strings and vectors.
Definition: Strings.h:39
const std::string EMPTY_STRING
Definition: Strings.h:17
This is a thread_local rng whose first object is used to see others (in other threads). This way, we can have thread_local rngs that all are seeded deterministcally in Fleet via –seed=X.
bool contains(const std::string &s, const std::string &x)
Definition: Strings.h:53