Fleet  0.0.9
Inference in the LOT
Data.h
Go to the documentation of this file.
1 #pragma once
2 
#include <algorithm>
#include <cstddef>
#include <cstring>
#include <map>
#include <set>
#include <string>
#include <utility>
#include <vector>

#include "DiscreteDistribution.h"
#include "FleetArgs.h"
10 
11 template<typename datum_t>
12 void load_data_file(std::vector<datum_t> &data, const char* datapath) {
19  for(auto [s, cnt] : read_csv<2>(datapath, false, '\t')) {
20  data.emplace_back(std::string(""), s, NaN, std::stod(cnt));
21  }
22 }
23 
24 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
25 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
26 // ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
27 // TODO: Put all this into another library
28 
/**
 * @brief Collect the (at most) N entries of `m` with the largest `count` into
 *        a map from `output` to `count`.
 *
 * Only the leading min(N, m.size()) positions are ordered (descending by
 * `count`); the rest of the working copy is left unsorted because it is never
 * read. Entries with equal counts are ordered arbitrarily, so which of several
 * tied entries makes the cut is unspecified. If two selected entries share the
 * same `output`, the later one in the ordering overwrites the earlier one, so
 * the result can hold fewer than N keys.
 *
 * @tparam T      Key type of the returned map; must be assignable from `TDATA::output`.
 * @tparam TDATA  Datum type exposing `.output` and `.count`.
 * @param m       Data to scan; it is copied, never modified.
 * @param N       Maximum number of entries to take.
 * @return        Map of output -> count for the selected entries (empty if N == 0
 *                or `m` is empty).
 */
template<typename T, typename TDATA>
std::map<T, double> highest(const std::vector<TDATA>& m, unsigned long N) {
	// Name the type explicitly: std::min(N, m.size()) fails template deduction on
	// platforms where unsigned long and std::size_t are different types.
	const std::size_t k = std::min<std::size_t>(N, m.size());

	std::map<T, double> out;
	if(k == 0) return out;

	// Work on a copy so the caller's ordering is untouched.
	std::vector<TDATA> v = m;

	using diff_t = typename std::vector<TDATA>::difference_type;
	const auto top_end = v.begin() + static_cast<diff_t>(k);

	// Compare by reference: taking the arguments by value copies two whole
	// data records on every comparison.
	std::partial_sort(v.begin(), top_end, v.end(),
	                  [](const auto& x, const auto& y){ return x.count > y.count; });

	for(std::size_t i = 0; i < k; i++) {
		out[v[i].output] = v[i].count; // remember: must be output since that's what we're modeling
	}
	return out;
}
44 
45 
46 
47 template<typename TDATA>
48 std::pair<double, double> get_precision_and_recall(DiscreteDistribution<std::string>& model, std::vector<TDATA>& data, unsigned long N) {
49  // How many of the top N generated strings appear *anywhere* in the data
50  // And how many of the top N data appear *anywhere* in the generated strings
51  // Note: This is a little complicated if the data has fewer strings that the model, since we don't
52  // know the "true" precision and recall
53 
54  auto A = model.best(N, true);
55  auto B = highest<std::string,TDATA>(data, std::min(N,data.size()) );
56 
57  std::set<std::string> mdata; // make a map of all observed output strings
58  for(auto v : data) mdata.insert(v.output);
59 
60  unsigned long nprec = 0;
61  for(auto a: A) {
62  if(mdata.count(a))
63  nprec++;
64  }
65 
66  unsigned long nrec = 0;
67  for(auto b : B){
68  if(model.count(b.first))
69  nrec++;
70  }
71 
72  return std::make_pair(double(nprec)/A.size(), double(nrec)/B.size());
73 }
74 
75 
Definition: DiscreteDistribution.h:25
void load_data_file(std::vector< datum_t > &data, const char *datapath)
Definition: Data.h:12
size_t count(T x) const
Definition: DiscreteDistribution.h:201
constexpr double NaN
Definition: Numerics.h:21
This stores a distribution from values of T to log probabilities. It is used as the return value from...
std::map< T, double > highest(const std::vector< TDATA > &m, unsigned long N)
Definition: Data.h:30
std::vector< T > best(size_t n, bool include_equal) const
Definition: DiscreteDistribution.h:166
std::pair< double, double > get_precision_and_recall(DiscreteDistribution< std::string > &model, std::vector< TDATA > &data, unsigned long N)
Definition: Data.h:48