JASSv2
ranking_function_atire_bm25.h
Go to the documentation of this file.
1 /*
2  RANKING_FUNCTION_ATIRE_BM25.H
3  -----------------------------
4  Copyright (c) 2018 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
14 #pragma once
15 
16 #include <math.h>
17 #include <stdio.h>
18 #include <stdint.h>
19 
20 #include <vector>
21 
22 #include "forceinline.h"
23 #include "compress_integer.h"
24 #include "index_postings_impact.h"
25 
26 
27 namespace JASS
28  {
29  /*
30  CLASS RANKING_FUNCTION_ATIRE_BM25
31  ---------------------------------
32  */
37  {
38  private:
39  double idf;
40  double top_row;
41  double k1_plus_1;
43  std::vector<float> length_correction;
44 
45  public:
46  /*
47  RANKING_FUNCTION_ATIRE_BM25::RANKING_FUNCTION_ATIRE_BM25()
48  ----------------------------------------------------------
49  */
56  ranking_function_atire_bm25(double k1, double b, std::vector<compress_integer::integer> &document_lengths):
57  idf(0),
58  top_row(0),
59  k1_plus_1(k1 + 1.0),
60  mean_document_length(0),
61  length_correction(document_lengths.size())
62  {
63  double one_minus_b = 1.0 - b;
64 
65  uint64_t sum = 0;
66  for (auto length : document_lengths)
67  sum += length;
68 
69  mean_document_length = static_cast<double>(sum) / static_cast<double>(document_lengths.size() - 1); // -1 because ID 0 is not used (and should be 0)
70 
71  auto correction = &length_correction[0]; // recall that we count from 1, not from 0
72  for (auto length : document_lengths)
73  *correction++ = k1 * (one_minus_b + b * static_cast<double>(length) / mean_document_length);
74  }
75 
76  /*
77  RANKING_FUNCTION_ATIRE_BM25::~RANKING_FUNCTION_ATIRE_BM25()
78  ----------------------------------------------------------
79  */
84  {
85  /* Nothing */
86  }
87 
88  /*
89  RANKING_FUNCTION_ATIRE_BM25::COMPUTE_IDF_COMPONENT()
90  ----------------------------------------------------
91  */
97  forceinline void compute_idf_component(compress_integer::integer document_frequency, compress_integer::integer documents_in_collection)
98  {
99  /*
100  N
101  IDF = log -
102  n
103 
104  This variant of IDF is better than log((N - n + 0.5) / (n + 0.5)) on the 70 INEX 2008 Wikipedia topics. Also, it is always positive.
105  */
106  idf = log((double)documents_in_collection / (double)document_frequency);
107  }
108 
109  /*
110  RANKING_FUNCTION_ATIRE_BM25::COMPUTE_TF_COMPONENT()
111  ---------------------------------------------------
112  */
118  {
119  top_row = term_frequency * k1_plus_1;
120  }
121 
122  /*
123  RANKING_FUNCTION_ATIRE_BM25::COMPUTE_SCORE()
124  --------------------------------------------
125  */
133  {
134  /*
135  tf(td) * (k1 + 1)
136  rsv = ----------------------------------- * IDF
137  len(d)
138  tf(td) + k1 * (1 - b + b * --------)
139  av_len_d
140 
141  In this implementation we ignore k3 and the number of times the term occurs in the query.
142  */
143  double tf = term_frequency;
144  return idf * (top_row / (tf + length_correction[document_id]));
145  }
146 
147  /*
148  RANKING_FUNCTION_ATIRE_BM25::UNITTEST()
149  ---------------------------------------
150  */
154  static void unittest(void)
155  {
156  double rsv;
157  std::vector<compress_integer::integer> lengths{30, 40, 50, 60, 70}; // the lengths of the documents in this pseudo-index
158  ranking_function_atire_bm25 ranker(0.9, 0.4, lengths); // k1=0.9, b=0.4
159 
160  ranker.compute_idf_component(2, static_cast<uint32_t>(lengths.size())); // this term occurs in 2 of 5 documents
161  ranker.compute_tf_component(12); // it occurs in this document 12 times
162  rsv = ranker.compute_score(1, 12); // it occurs in document 1 a total of 12 times;
163 
164  JASS_assert(static_cast<uint32_t>(rsv * 1000) == 1635);
165  puts("ranking_function_atire_bm25::PASSED");
166  }
167  };
168  }
~ranking_function_atire_bm25()
Destructor.
Definition: ranking_function_atire_bm25.h:83
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
The ATIRE verison of BM25.
Definition: ranking_function_atire_bm25.h:36
std::vector< float > length_correction
most of the bottom row of BM25 (k1 * ((1 - b) + b * length / mean_document_length)) for the current t...
Definition: ranking_function_atire_bm25.h:43
#define JASS_assert(expression)
Drop in replacement for assert() that aborts in Release as well as Debug.
Definition: asserts.h:33
Compression codexes for integer sequences.
Holder class for an impact ordered postings list.
operating system and compiler independant definition of forceinline
forceinline void compute_tf_component(index_postings_impact::impact_type term_frequency)
Compute and store internally the term-frequency based component of the ranking function (useful when ...
Definition: ranking_function_atire_bm25.h:117
double k1_plus_1
k1 + 1
Definition: ranking_function_atire_bm25.h:41
static void unittest(void)
Unit test this class.
Definition: ranking_function_atire_bm25.h:154
double idf
the IDF of the term being processed
Definition: ranking_function_atire_bm25.h:39
forceinline double compute_score(compress_integer::integer document_id, index_postings_impact::impact_type term_frequency)
Compute BM25 from the given document, assuming pieces have already been computed. ...
Definition: ranking_function_atire_bm25.h:132
ranking_function_atire_bm25(double k1, double b, std::vector< compress_integer::integer > &document_lengths)
Constructor.
Definition: ranking_function_atire_bm25.h:56
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
double mean_document_length
the mean of the document lengths
Definition: ranking_function_atire_bm25.h:42
double top_row
the top-row of the ranking function for the term being processed (tf(td) * (k1 + 1)) ...
Definition: ranking_function_atire_bm25.h:40
Definition: compress_integer_elias_delta_simd.c:23
forceinline void compute_idf_component(compress_integer::integer document_frequency, compress_integer::integer documents_in_collection)
Called once per term (per query). Computes the IDF component of the ranking function and stores it in...
Definition: ranking_function_atire_bm25.h:97