Fleet  0.0.9
Inference in the LOT
CharacterNGram.h
Go to the documentation of this file.
1 #pragma once
2 
3 #include <map>
4 #include <string>
5 
15 protected:
// Length of the n-grams being counted; the constructor asserts that it is positive.
16  int n;
// How many times each n-character substring was seen by train()/masked_train().
17  std::map<std::string, size_t> count;
// How many times each context (an n-gram without its last character) was seen;
// this is the denominator of the conditional probability.
18  std::map<std::string, size_t> count_nm1; // for computing conditional
19 
20 public:
21 
22  CharacterNGram(size_t _n) : n(_n) {
23  assert(n > 0);
24  }
25 
26  void train(std::string s) {
27 
28  // let's see, in a string like abcdefg
29  // 0123456
30  // we want to compute for a bigram P(b|a)
31  for(size_t i=0;i<=s.size()-n;i++) {
32  auto x = s.substr(i,n);
33  auto y = s.substr(i,n-1);
34  count[x] = get(count, x, 0) + 1;
35  count_nm1[y] = get(count_nm1, y, 0) + 1;
36  }
37  }
38 
39  void masked_train(const std::string& s, char mask) {
40  // this is the same as train except that we skip *anything* containing msk
41  // within the n-gram
42 
43  for(size_t i=0;i<=s.size()-n;i++) {
44  auto x = s.substr(i,n);
45  if(not contains(x, mask)) {
46  auto y = s.substr(i,n-1);
47 // print("TRAINING", n, x, y);
48  count[x] = get(count, x, 0) + 1;
49  count_nm1[y] = get(count_nm1, y, 0) + 1;
50  }
51  }
52  }
53 
// Score the n-gram passed in. Pure virtual: each subclass supplies its own
// estimator on top of the counts gathered by train()/masked_train().
54  virtual double probability(const std::string&) = 0;
55 };
56 
58 
// Multiplies lambda in the denominator of probability().
59  int alphabet_size; // needed for smoothing
// Constant added to the n-gram count in the numerator of probability().
60  double lambda;
61 
62 public:
63 
64  AddLambdaSmoothedNGram(int _n, int a, double l) : CharacterNGram(_n), alphabet_size(a), lambda(l) {
65 
66  }
67 
68  virtual double probability(const std::string& x) override {
69  // probability of the LAST character of x, given the previous,
70  // using add_l smoothing, with alphabet size a
71  // NOTE: returns prob, not logprob!
72 // print(n, "GETTING", QQ(x), get(count, x, 0), QQ(x.substr(0,x.size()-1)), get(count_nm1, x.substr(0,x.size()-1), 0));
73  return (get(count, x, 0) + lambda) / (get(count_nm1, x.substr(0,x.size()-1), 0) + lambda * alphabet_size);
74  }
75 };
int n
Definition: CharacterNGram.h:16
std::map< std::string, size_t > count_nm1
Definition: CharacterNGram.h:18
CharacterNGram(size_t _n)
Definition: CharacterNGram.h:22
Definition: CharacterNGram.h:14
void train(std::string s)
Definition: CharacterNGram.h:26
std::map< std::string, size_t > count
Definition: CharacterNGram.h:17
virtual double probability(const std::string &)=0
virtual double probability(const std::string &x) override
Definition: CharacterNGram.h:68
void masked_train(const std::string &s, char mask)
Definition: CharacterNGram.h:39
AddLambdaSmoothedNGram(int _n, int a, double l)
Definition: CharacterNGram.h:64
bool contains(const std::string &s, const std::string &x)
Definition: Strings.h:53
Definition: CharacterNGram.h:57