JASSv2
quantize.h
Go to the documentation of this file.
1 /*
2  QUANTIZE.H
3  ----------
4  Copyright (c) 2018 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
13 #pragma once
14 
15 #include <math.h>
16 
17 #include <iostream>
18 
19 #include "index_manager.h"
22 
23 namespace JASS
24  {
25  /*
26  CLASS QUANTIZE
27  --------------
28  */
41  template <typename RANKER>
43  {
44  private:
45  double largest_rsv;
46  double smallest_rsv;
47  std::shared_ptr<RANKER> ranker;
50 
51  public:
52  /*
53  QUANTIZE::QUANTIZE()
54  --------------------
55  */
61  quantize(size_t documents, std::shared_ptr<RANKER> ranker) :
62  index_manager::delegate(documents),
63  largest_rsv((std::numeric_limits<decltype(largest_rsv)>::min)()),
64  smallest_rsv((std::numeric_limits<decltype(smallest_rsv)>::max)()),
65  ranker(ranker),
66  documents_in_collection(static_cast<compress_integer::integer>(documents))
67  {
68  /* Nothing. */
69  }
70 
71  /*
72  QUANTIZE::~QUANTIZE()
73  ---------------------
74  */
78  virtual ~quantize()
79  {
80 // std::cout << "RSVmin:" << smallest_rsv << '\n';
81 // std::cout << "RSVmax:" << largest_rsv << '\n';
82  }
83 
84  /*
85  QUANTIZE::FINISH()
86  ------------------
87  */
91  virtual void finish(void)
92  {
93  /* Nothing */
94  }
95 
96  /*
97  QUANTIZE::OPERATOR()()
98  ----------------------
99  */
108  virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
109  {
110  /*
111  Compute the IDF component
112  */
113  ranker->compute_idf_component(document_frequency, documents_in_collection);
114 
115  /*
116  Compute the document / term score and keep a tally of the smallest and largest (for quantization)
117  */
118  auto end = document_ids + document_frequency;
119  auto current_tf = term_frequencies;
120  for (compress_integer::integer *current_id = document_ids; current_id < end; current_id++, current_tf++)
121  {
122  /*
123  Compute the term / document score
124  */
125  ranker->compute_tf_component(*current_tf);
126  auto score = ranker->compute_score(*current_id, *current_tf);
127 
128  /*
129  Keep a running tally of the largest and smallest rsv we've seen so far
130  */
131  if (score < smallest_rsv)
132  smallest_rsv = score;
133  if (score > largest_rsv)
134  largest_rsv = score;
135  }
136  }
137 
138  /*
139  QUANTIZE::OPERATOR()()
140  ----------------------
141  */
147  virtual void operator()(size_t document_id, const slice &primary_key)
148  {
149  /* Nothing. */
150  }
151 
152  /*
153  QUANTIZE::OPERATOR()()
154  ----------------------
155  */
165  virtual void operator()(index_manager::delegate &writer, const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
166  {
167  /*
168  Compute the IDF component
169  */
170  ranker->compute_idf_component(document_frequency, documents_in_collection);
171 
172  /*
173  Compute the document / term score and quantize it.
174  */
175  auto end = document_ids + document_frequency;
176  auto current_tf = term_frequencies;
177  for (compress_integer::integer *current_id = document_ids; current_id < end; current_id++, current_tf++)
178  {
179  /*
180  Compute the term / document score
181  */
182  ranker->compute_tf_component(*current_tf);
183  double score = ranker->compute_score(*current_id, *current_tf);
184 
185  /*
186  Quantize using uniform quantization, and write back as the new term frequency (which is now an impact score).
187  This uses Uniform Quantiization as defined by Anh et al. in:
188  Vo Ngoc Anh, Owen de Kretser, and Alistair Moffat. 2001. Vector-space ranking with effective early termination. In Proceedings of the 24th annual international ACM SIGIR conference on Research and development in information retrieval (SIGIR '01). ACM, New York, NY, USA, 35-42. DOI: https://doi.org/10.1145/383952.383957
189  */
190  index_postings_impact::impact_type impact = static_cast<index_postings_impact::impact_type>(((score - smallest_rsv) / (largest_rsv - smallest_rsv)) * impact_range) + index_postings_impact::smallest_impact;
191  *current_tf = impact;
192  }
193 
194  /*
195  Pass the quantized list to the writer.
196  */
197  writer(term, postings, document_frequency, document_ids, term_frequencies);
198  }
199 
200  /*
201  QUANTIZE::OPERATOR()()
202  ----------------------
203  */
210  virtual void operator()(index_manager::delegate &writer, size_t document_id, const slice &primary_key)
211  {
212  writer(document_id, primary_key);
213  }
214 
215  /*
216  QUANTIZE::GET_BOUNDS()
217  ----------------------
218  */
224  void get_bounds(double &smallest, double &largest)
225  {
226  smallest = smallest_rsv;
227  largest = largest_rsv;
228  }
229 
230  /*
231  QUANTIZE::SERIALISE_INDEX()
232  ---------------------------
233  */
239  void serialise_index(index_manager &index, std::vector<std::unique_ptr<index_manager::delegate>> &serialisers)
240  {
241  for (auto &outputter : serialisers)
242  {
243  index.iterate(*this, *outputter);
244  outputter->finish();
245  }
246  }
247 
248  /*
249  QUANTIZE::UNITTEST()
250  --------------------
251  */
255  static void unittest(void)
256  {
257  /*
258  Build an index.
259  */
262 
263  /*
264  Quantize the index.
265  */
266  std::shared_ptr<ranking_function_atire_bm25> ranker(new ranking_function_atire_bm25(0.9, 0.4, index.get_document_length_vector()));
268  index.iterate(quantizer);
269 
270  double smallest;
271  double largest;
272  quantizer.get_bounds(smallest, largest);
273 
274  JASS_assert(static_cast<int>(smallest) == 0);
275  JASS_assert(static_cast<int>(largest) == 2);
276 
277  puts("quantize::PASSED");
278  }
279  };
280  }
Quantize an index.
Definition: quantize.h:42
Non-thread-safe object that accumulates a single postings list during indexing.
Definition: index_postings.h:40
virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function for each postings list is operator().
Definition: quantize.h:108
double smallest_rsv
The smallest score seen for any document/term pair.
Definition: quantize.h:46
Non-thread-Safe indexer object.
C++ slices (string-descriptors)
Definition: slice.h:27
Base class for the indexer object that stored the actual index during indexing.
Compression codexes for integer sequences.
Definition: compress_integer.h:34
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
virtual void operator()(index_manager::delegate &writer, size_t document_id, const slice &primary_key)
The callback function for primary keys (external document ids) is operator(). Not needed for quantiza...
Definition: quantize.h:210
The ATIRE verison of BM25.
Definition: ranking_function_atire_bm25.h:36
virtual void iterate(delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager.h:314
#define JASS_assert(expression)
Drop in replacement for assert() that aborts in Release as well as Debug.
Definition: asserts.h:33
virtual void iterate(index_manager::delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager_sequential.h:297
virtual std::vector< compress_integer::integer > & get_document_length_vector(void)
Return a reference to the document length vector.
Definition: index_manager.h:271
static constexpr double impact_range
The number of values in the impact ordering range (normally 255).
Definition: quantize.h:49
static std::string ten_documents
Ten TREC formatted documents with ten terms (ten .. one) where each term occurs it&#39;s count number of ...
Definition: unittest_data.h:25
static void unittest_build_index(index_manager_sequential &index, const std::string &document_collection)
Build and index for the 10 sample documents. This is used by several unit tests that need a valid ind...
Definition: index_manager_sequential.h:368
virtual void finish(void)
Do any final cleaning up.
Definition: quantize.h:91
double largest_rsv
The largest score seen for any document/term pair.
Definition: quantize.h:45
static constexpr size_t largest_impact
The largest allowable immpact score (255 is an good value).
Definition: index_postings_impact.h:42
Base class for the callback function called by iterate.
Definition: index_manager.h:120
delegate(size_t documents)
Destructor.
Definition: index_manager.h:60
compress_integer::integer documents_in_collection
The number of documents in the collection.
Definition: quantize.h:48
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
Non-thread-Safe indexer object.
Definition: index_manager_sequential.h:38
quantize(size_t documents, std::shared_ptr< RANKER > ranker)
Constructor.
Definition: quantize.h:61
size_t documents
The number of documents in the collection.
Definition: index_manager.h:50
virtual void operator()(size_t document_id, const slice &primary_key)
The callback function for primary keys (external document ids) is operator(). Not needed for quantiza...
Definition: quantize.h:147
static void unittest(void)
Unit test this class.
Definition: quantize.h:255
Base class for holding the index during indexing.
Definition: index_manager.h:33
Base class for the callback function called by iterate.
Definition: index_manager.h:47
Definition: compress_integer_elias_delta_simd.c:23
virtual void operator()(index_manager::delegate &writer, const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function for each postings list is operator().
Definition: quantize.h:165
void get_bounds(double &smallest, double &largest)
Get the smallest and largest term / document influence (should be called after the first round of the...
Definition: quantize.h:224
std::shared_ptr< RANKER > ranker
The ranker to use for quantization.
Definition: quantize.h:47
compress_integer::integer get_highest_document_id(void) const
Return the number of documents that have been successfully indexed or are in the process of being ind...
Definition: index_manager.h:345
virtual ~quantize()
Destructor.
Definition: quantize.h:78
The ATIRE verison of the BM25 ranking function.
void serialise_index(index_manager &index, std::vector< std::unique_ptr< index_manager::delegate >> &serialisers)
Given the index and a serialiser, serialise the index to disk.
Definition: quantize.h:239
static constexpr size_t smallest_impact
The smallest allowable impact score (normally 1)
Definition: index_postings_impact.h:43