41 template <
typename RANKER>
63 largest_rsv((
std::numeric_limits<decltype(largest_rsv)>::min)()),
64 smallest_rsv((
std::numeric_limits<decltype(smallest_rsv)>::max)()),
113 ranker->compute_idf_component(document_frequency, documents_in_collection);
118 auto end = document_ids + document_frequency;
119 auto current_tf = term_frequencies;
125 ranker->compute_tf_component(*current_tf);
126 auto score = ranker->compute_score(*current_id, *current_tf);
131 if (score < smallest_rsv)
132 smallest_rsv = score;
133 if (score > largest_rsv)
170 ranker->compute_idf_component(document_frequency, documents_in_collection);
175 auto end = document_ids + document_frequency;
176 auto current_tf = term_frequencies;
182 ranker->compute_tf_component(*current_tf);
183 double score = ranker->compute_score(*current_id, *current_tf);
191 *current_tf = impact;
197 writer(term, postings, document_frequency, document_ids, term_frequencies);
212 writer(document_id, primary_key);
241 for (
auto &outputter : serialisers)
243 index.
iterate(*
this, *outputter);
272 quantizer.get_bounds(smallest, largest);
277 puts(
"quantize::PASSED");
Quantize an index.
Definition: quantize.h:42
Non-thread-safe object that accumulates a single postings list during indexing.
Definition: index_postings.h:40
virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function for each postings list is operator().
Definition: quantize.h:108
double smallest_rsv
The smallest score seen for any document/term pair.
Definition: quantize.h:46
Non-thread-safe indexer object.
C++ slices (string-descriptors)
Definition: slice.h:27
Base class for the indexer object that stored the actual index during indexing.
Compression codecs for integer sequences.
Definition: compress_integer.h:34
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
virtual void operator()(index_manager::delegate &writer, size_t document_id, const slice &primary_key)
The callback function for primary keys (external document ids) is operator(). Not needed for quantiza...
Definition: quantize.h:210
The ATIRE version of BM25.
Definition: ranking_function_atire_bm25.h:36
virtual void iterate(delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager.h:314
#define JASS_assert(expression)
Drop in replacement for assert() that aborts in Release as well as Debug.
Definition: asserts.h:33
virtual void iterate(index_manager::delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager_sequential.h:297
virtual std::vector< compress_integer::integer > & get_document_length_vector(void)
Return a reference to the document length vector.
Definition: index_manager.h:271
static constexpr double impact_range
The number of values in the impact ordering range (normally 255).
Definition: quantize.h:49
static std::string ten_documents
Ten TREC formatted documents with ten terms (ten .. one) where each term occurs its count number of ...
Definition: unittest_data.h:25
static void unittest_build_index(index_manager_sequential &index, const std::string &document_collection)
Build an index for the 10 sample documents. This is used by several unit tests that need a valid ind...
Definition: index_manager_sequential.h:368
virtual void finish(void)
Do any final cleaning up.
Definition: quantize.h:91
double largest_rsv
The largest score seen for any document/term pair.
Definition: quantize.h:45
static constexpr size_t largest_impact
The largest allowable impact score (255 is a good value).
Definition: index_postings_impact.h:42
Base class for the callback function called by iterate.
Definition: index_manager.h:120
delegate(size_t documents)
Constructor.
Definition: index_manager.h:60
compress_integer::integer documents_in_collection
The number of documents in the collection.
Definition: quantize.h:48
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
Non-thread-safe indexer object.
Definition: index_manager_sequential.h:38
quantize(size_t documents, std::shared_ptr< RANKER > ranker)
Constructor.
Definition: quantize.h:61
size_t documents
The number of documents in the collection.
Definition: index_manager.h:50
virtual void operator()(size_t document_id, const slice &primary_key)
The callback function for primary keys (external document ids) is operator(). Not needed for quantiza...
Definition: quantize.h:147
static void unittest(void)
Unit test this class.
Definition: quantize.h:255
Base class for holding the index during indexing.
Definition: index_manager.h:33
Base class for the callback function called by iterate.
Definition: index_manager.h:47
Definition: compress_integer_elias_delta_simd.c:23
virtual void operator()(index_manager::delegate &writer, const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function for each postings list is operator().
Definition: quantize.h:165
void get_bounds(double &smallest, double &largest)
Get the smallest and largest term / document influence (should be called after the first round of the...
Definition: quantize.h:224
std::shared_ptr< RANKER > ranker
The ranker to use for quantization.
Definition: quantize.h:47
compress_integer::integer get_highest_document_id(void) const
Return the number of documents that have been successfully indexed or are in the process of being ind...
Definition: index_manager.h:345
virtual ~quantize()
Destructor.
Definition: quantize.h:78
The ATIRE version of the BM25 ranking function.
void serialise_index(index_manager &index, std::vector< std::unique_ptr< index_manager::delegate >> &serialisers)
Given the index and a serialiser, serialise the index to disk.
Definition: quantize.h:239
static constexpr size_t smallest_impact
The smallest allowable impact score (normally 1)
Definition: index_postings_impact.h:43