JASSv2
index_postings_impact.h
Go to the documentation of this file.
1 /*
2  INDEX_POSTINGS_IMPACT.H
3  -----------------------
4  Copyright (c) 2017 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
13 #pragma once
14 
15 #include <vector>
16 #include <limits>
17 #include <sstream>
18 
19 #include "allocator.h"
20 #include "compress_integer.h"
21 
22 namespace JASS
23  {
24  /*
25  CLASS INDEX_POSTINGS_IMPACT
26  ---------------------------
27  */
32  {
33  friend class index_postings;
34 
35  public:
36 #ifdef NEVER
37  typedef uint8_t impact_type;
38  static constexpr size_t largest_impact = std::numeric_limits<impact_type>::max();
39  static constexpr size_t smallest_impact = 1;
40 #else
41  typedef uint16_t impact_type;
42  static constexpr size_t largest_impact = 1024;
43  static constexpr size_t smallest_impact = 1;
44 #endif
45 
46  public:
47  /*
48  CLASS INDEX_POSTINGS_IMPACT::IMPACT
49  -----------------------------------
50  */
54  class impact
55  {
56  public:
57  impact_type impact_score;
60 
61  /*
62  INDEX_POSTINGS_IMPACT::IMPACT::IMPACT
63  -------------------------------------
64  */
68  impact() :
69  impact_score(0),
70  start(nullptr),
71  finish(nullptr)
72  {
73  /* Nothing */
74  }
75 
76  public:
77  /*
78  INDEX_POSTINGS_IMPACT::IMPACT::BEGIN()
79  --------------------------------------
80  */
86  {
87  return start;
88  }
89 
90  /*
91  INDEX_POSTINGS_IMPACT::IMPACT::END()
92  ------------------------------------
93  */
99  {
100  return finish;
101  }
102 
103  /*
104  INDEX_POSTINGS_IMPACT::IMPACT::SIZE()
105  -------------------------------------
106  */
112  {
113  return static_cast<compress_integer::integer>(finish - start);
114  }
115  };
116 
117  private:
118  /*
119  CLASS INDEX_POSTINGS_IMPACT::REVERSE_ITERATOR
120  ---------------------------------------------
121  */
126  {
127  private:
129 
130  public:
131  /*
132  INDEX_POSTINGS_IMPACT::REVERSE_ITERATOR:REVERSE_ITERATOR
133  ---------------------------------------------------------
134  */
140  address(address)
141  {
142  /* Nothing */
143  }
144 
145  /*
146  INDEX_POSTINGS_IMPACT::REVERSE_ITERATOR:OPERATOR!=()
147  -----------------------------------------------------
148  */
154  bool operator!=(const reverse_iterator &other) const
155  {
156  return address != other.address;
157  }
158 
159  /*
160  INDEX_POSTINGS_IMPACT::REVERSE_ITERATOR:OPERATOR*()
161  ----------------------------------------------------
162  */
167  {
168  return *address;
169  }
170 
171  /*
172  INDEX_POSTINGS_IMPACT::REVERSE_ITERATOR:OPERATOR++()
173  -----------------------------------------------------
174  */
179  {
180  address--;
181  return *this;
182  }
183  };
184 
185  protected:
188  impact impacts[largest_impact + 1];
193  size_t temporary_size;
194  uint8_t *temporary;
195 
196  public:
197  /*
198  INDEX_POSTINGS_IMPACT::INDEX_POSTINGS_IMPACT()
199  ----------------------------------------------
200  */
208  index_postings_impact(size_t document_count, allocator &memory):
209  memory(memory),
210  number_of_impacts(0),
211  number_of_postings(document_count),
212  postings(static_cast<decltype(postings)>(memory.malloc((document_count + largest_impact + 1) * sizeof(*postings)))), // longest length is total_postings + all impacts + 1
213  document_ids((decltype(document_ids))memory.malloc(document_count * sizeof(*document_ids))),
214  term_frequencies((decltype(term_frequencies))memory.malloc(document_count * sizeof(*term_frequencies))),
215  temporary_size(document_count * (sizeof(*document_ids) / 7 + 1) * sizeof(*temporary)),
216  temporary((decltype(temporary))memory.malloc(temporary_size)) // enough space to decompress variable-byte encodings
217  {
218  /* Nothing */
219  }
220 
221  /*
222  INDEX_POSTINGS_IMPACT::SET_IMPACT_COUNT()
223  -----------------------------------------
224  */
230  void set_impact_count(size_t number_of_impacts)
231  {
232  this->number_of_impacts = number_of_impacts;
233  }
234 
235  /*
236  INDEX_POSTINGS_IMPACT::GET_POSTINGS()
237  -------------------------------------
238  */
244  {
245  return postings;
246  }
247 
248  /*
249  INDEX_POSTINGS_IMPACT::OPERATOR[]()
250  -----------------------------------
251  */
260  {
261  return postings[index];
262  }
263 
264  /*
265  INDEX_POSTINGS_IMPACT::HEADER()
266  -------------------------------
267  */
276  void header(size_t index, impact_type score, compress_integer::integer *postings_start, compress_integer::integer *postings_end)
277  {
278  impacts[index].impact_score = score;
279  impacts[index].start = postings_start;
280  impacts[index].finish = postings_end;
281  }
282 
283  /*
284  INDEX_POSTINGS_IMPACT::IMPACT_SIZE()
285  ------------------------------------
286  */
291  size_t impact_size(void) const
292  {
293  return number_of_impacts;
294  }
295 
296  /*
297  INDEX_POSTINGS_IMPACT::BEGIN()
298  ------------------------------
299  */
304  const impact *begin(void) const
305  {
306  return &impacts[0];
307  }
308 
309  /*
310  INDEX_POSTINGS_IMPACT::END()
311  ----------------------------
312  */
317  const impact *end(void) const
318  {
319  return &impacts[number_of_impacts];
320  }
321 
322  /*
323  INDEX_POSTINGS_IMPACT::RBEGIN()
324  -------------------------------
325  */
330  const auto rbegin(void) const
331  {
332  return reverse_iterator((impact *)(&impacts[number_of_impacts - 1]));
333  }
334 
335  /*
336  INDEX_POSTINGS_IMPACT::REND()
337  ----------------------------
338  */
343  const auto rend(void) const
344  {
345  return reverse_iterator((impact *)(&impacts[0] - 1));
346  }
347 
348  /*
349  INDEX_POSTINGS_IMPACT::TEXT_RENDER()
350  ------------------------------------
351  */
356  void text_render(std::ostream &stream)
357  {
358  stream << "{[";
359  for (const auto &header : *this)
360  {
361  stream << header.impact_score << ":";
362  for (const auto posting : header)
363  stream << posting << " ";
364  }
365 #ifdef NEVER
366  stream << "]\n[";
367  for (size_t index = 0; index < number_of_postings; index++)
368  stream << postings[index] << " ";
369  stream << "]}";
370 #else
371  stream << "]}";
372 #endif
373  }
374 
375  /*
376  INDEX_POSTINGS_IMPACT::UNITTEST()
377  ---------------------------------
378  */
382  static void unittest(void)
383  {
385  index_postings_impact postings(7, memory);
386 
387  /*
388  Set up some postings
389  */
390  postings[0] = 255; // impact
391  postings[1] = 10; // docid
392  postings[2] = 0; // termiator
393  postings[3] = 128; // impact
394  postings[4] = 2; // docid
395  postings[5] = 5; // docis
396  postings[6] = 0; // terminator
397 
398  /*
399  Set up the headers
400  */
401  postings.header(0, 255, &postings[1], &postings[2]);
402  postings.header(1, 128, &postings[4], &postings[6]);
403  postings.set_impact_count(2);
404 
405  /*
406  Check the data got into the right places.
407  */
408  const compress_integer::integer answer[] = {255, 10, 0, 128, 2, 5, 0};
409  JASS_assert(memcmp(&postings[0], answer, sizeof(answer)) == 0);
410 
411  JASS_assert(postings.impacts[0].impact_score == 255);
412  JASS_assert(postings.impacts[0].start == &postings.postings[1]);
413  JASS_assert(postings.impacts[0].finish == &postings.postings[2]);
414 
415  JASS_assert(postings.impacts[1].impact_score == 128);
416  JASS_assert(postings.impacts[1].start == &postings.postings[4]);
417  JASS_assert(postings.impacts[1].finish == &postings.postings[6]);
418 
419  JASS_assert(postings.number_of_postings == 7);
420  JASS_assert(postings.impact_size() == 2);
421 
422  /*
423  Check iteration
424  */
425  std::string serialised_answer = "255:10 \n128:2 5 \n";
426  std::stringstream output;
427  for (const auto &header : postings)
428  {
429  output << static_cast<int>(header.impact_score) << ":";
430  for (const auto &document_id : header)
431  output << document_id << " ";
432  output << '\n';
433  }
434 
435  JASS_assert(output.str() == serialised_answer);
436 
437  puts("index_postings_impact::PASSED");
438  }
439  };
440 
441  /*
442  OPERATOR<<()
443  ------------
444  */
451  inline std::ostream &operator<<(std::ostream &stream, index_postings_impact &data)
452  {
453  data.text_render(stream);
454  return stream;
455  }
456  }
Non-thread-safe object that accumulates a single postings list during indexing.
Definition: index_postings.h:40
impact_type impact_score
The impact score.
Definition: index_postings_impact.h:57
void header(size_t index, impact_type score, compress_integer::integer *postings_start, compress_integer::integer *postings_end)
Set the value of the impact header object at the given index.
Definition: index_postings_impact.h:276
void set_impact_count(size_t number_of_impacts)
Tell this object how many impacts it holds.
Definition: index_postings_impact.h:230
A <docid, tf> tuple.
Definition: posting.h:28
const impact * begin(void) const
Return a pointer to the first impact header (for use in an iterator).
Definition: index_postings_impact.h:304
static void unittest(void)
Unit test this class.
Definition: index_postings_impact.h:382
void text_render(std::ostream &stream)
Dump a human-readable version of the postings list down the stream.
Definition: index_postings_impact.h:356
const auto rbegin(void) const
Return a pointer to the last impact header (for use in an reverse iterator).
Definition: index_postings_impact.h:330
size_t number_of_postings
The length of the postings array measured in size_t.
Definition: index_postings_impact.h:189
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
#define JASS_assert(expression)
Drop in replacement for assert() that aborts in Release as well as Debug.
Definition: asserts.h:33
impact * address
Which header we&#39;re pointing at.
Definition: index_postings_impact.h:128
const auto rend(void) const
Return a pointer to one element before the first impact headers (for use in reverse iteration iterato...
Definition: index_postings_impact.h:343
Compression codexes for integer sequences.
compress_integer::integer * postings
The list of document IDs, strung together for each postings segment.
Definition: index_postings_impact.h:190
Each impact is represented as an impact score, and a start and end pointer.
Definition: index_postings_impact.h:54
A reverse iterator for iterating over impact headers from highet to lowest.
Definition: index_postings_impact.h:125
impact & operator*()
Return a reference to the element pointed to by this iterator.
Definition: index_postings_impact.h:166
compress_integer::integer * end(void) const
Return a pointer to the end of the document identifiers array (for use as an iterator).
Definition: index_postings_impact.h:98
impact impacts[largest_impact+1]
List of impact pointers (the impact header).
Definition: index_postings_impact.h:188
index_postings_impact(size_t document_count, allocator &memory)
Constructor.
Definition: index_postings_impact.h:208
Holder class for an impact ordered postings list.
Definition: index_postings_impact.h:31
size_t temporary_size
The number of bytes in temporary.
Definition: index_postings_impact.h:193
static constexpr size_t largest_impact
The largest allowable immpact score (255 is an good value).
Definition: index_postings_impact.h:42
compress_integer::integer * begin(void) const
Return a pointer to the start of the document identifiers array (for use as an iterator).
Definition: index_postings_impact.h:85
uint8_t * temporary
Temporary buffer - cannot be used to store anything between calls.
Definition: index_postings_impact.h:194
const impact * end(void) const
Return a pointer to one element past the end of the impact headers (for use in an iterator)...
Definition: index_postings_impact.h:317
Simple block-allocator that internally allocates a large chunk then allocates smaller blocks from thi...
Definition: allocator_pool.h:61
Virtual base class for C style allocators.
Definition: allocator.h:33
compress_integer::integer * start
Pointer into the postings of the start of the document list for this impact score.
Definition: index_postings_impact.h:58
const reverse_iterator & operator++()
Increment this iterator.
Definition: index_postings_impact.h:178
compress_integer::integer * finish
Pointer into the postings of the end of the document list for this impact score.
Definition: index_postings_impact.h:59
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
bool operator!=(const reverse_iterator &other) const
Compare two iterator objects for non-equality.
Definition: index_postings_impact.h:154
allocator & memory
All allocation happens in this arena.
Definition: index_postings_impact.h:186
Base class for different allocators.
Definition: compress_integer_elias_delta_simd.c:23
compress_integer::integer * document_ids
The re-used buffer storing decoded document ids - used while impact ordering.
Definition: index_postings_impact.h:191
static std::ostream & operator<<(std::ostream &output, JASS_anytime_stats &data)
Dump a human readable version of the data down an output stream.
Definition: JASS_anytime_stats.h:62
compress_integer::integer size(void) const
Return the numner of postings with this impact score.
Definition: index_postings_impact.h:111
reverse_iterator(impact *address)
Constructor.
Definition: index_postings_impact.h:139
index_postings_impact::impact_type * term_frequencies
The re-used buffer storing the term frequencies - used while impact ordering.
Definition: index_postings_impact.h:192
size_t impact_size(void) const
Return the number of unique impact scores in the postings list.
Definition: index_postings_impact.h:291
impact()
Constructor.
Definition: index_postings_impact.h:68
size_t number_of_impacts
The number of impact objects in the impacts array.
Definition: index_postings_impact.h:187
compress_integer::integer * get_postings(void)
Return a pointer to the buffer containing the postings.
Definition: index_postings_impact.h:243
compress_integer::integer & operator[](size_t index) const
return a reference to the size_t at position index in the postings array.
Definition: index_postings_impact.h:259
static constexpr size_t smallest_impact
The smallest allowable impact score (normally 1)
Definition: index_postings_impact.h:43