JASSv2
index_manager_sequential.h
Go to the documentation of this file.
1 /*
2  INDEX_MANAGER_SEQUENTIAL.H
3  --------------------------
4  Copyright (c) 2016 Andrew Trotman
5  Released under the 2-clause BSD license (See:https://en.wikipedia.org/wiki/BSD_licenses)
6 */
13 #pragma once
14 
15 #include <vector>
16 #include <sstream>
17 
18 #include "parser.h"
19 #include "posting.h"
20 #include "hash_table.h"
21 #include "index_manager.h"
22 #include "unittest_data.h"
23 #include "index_postings.h"
24 #include "instream_memory.h"
25 #include "instream_document_trec.h"
26 
27 namespace JASS
28  {
29  /*
30  CLASS INDEX_MANAGER_SEQUENTIAL
31  ------------------------------
32  */
39  {
40  private:
44 
45  /*
46  Each of these buffers is re-used in the serialisation process
47  */
50  size_t temporary_size;
51  uint8_t *temporary;
52 
53  public:
54  /*
55  CLASS INDEX_MANAGER_SEQUENTIAL::DELEGATE
56  ----------------------------------------
57  */
62  {
63  private:
64  std::ostream &postings_out;
65  std::ostream &primary_keys_out;
66 
67  public:
68  /*
69  INDEX_MANAGER_SEQUENTIAL::DELEGATE::OPERATOR()()
70  ------------------------------------------------
71  */
78  delegate(size_t documents_in_collection, std::ostream &postings, std::ostream &primary_keys) :
79  index_manager::delegate(documents_in_collection),
80  postings_out(postings),
81  primary_keys_out(primary_keys)
82  {
83  /* Nothing */
84  }
85 
86  /*
87  INDEX_MANAGER_SEQUENTIAL::DELEGATE::FINISH()
88  --------------------------------------------
89  */
93  virtual void finish(void)
94  {
95  /* Nothing */
96  }
97 
98  /*
99  INDEX_MANAGER_SEQUENTIAL::DELEGATE::OPERATOR()()
100  ------------------------------------------------
101  */
105  virtual ~delegate()
106  {
107  /* Nothing */
108  }
109 
110  /*
111  INDEX_MANAGER_SEQUENTIAL::DELEGATE::OPERATOR()()
112  ------------------------------------------------
113  */
122  virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
123  {
124  postings_out << term << "->" << postings << std::endl;
125  }
126 
127  /*
128  INDEX_MANAGER_SEQUENTIAL::DELEGATE::OPERATOR()()
129  ------------------------------------------------
130  */
136  virtual void operator()(size_t document_id, const slice &primary_key)
137  {
138  primary_keys_out << document_id << "->" << primary_key << std::endl;
139  }
140  };
141  protected:
142  /*
143  INDEX_MANAGER_SEQUENTIAL::MAKE_SPACE()
144  --------------------------------------
145  */
149  void make_space(void)
150  {
151  auto new_temporary_size = get_highest_document_id() * (sizeof(*document_ids) / 7 + 1) * sizeof(*temporary);
152 
153  if (new_temporary_size > temporary_size)
154  {
155  temporary_size = new_temporary_size;
156  /*
157  we don't delete the old buffers because the memory object doesn't support allow us to do so
158  but, this would only be needed if the called serialises then adds then serialises without
159  deleting this object and newing another.
160  */
161  document_ids = reinterpret_cast<decltype(document_ids)>(memory.malloc(get_highest_document_id() * sizeof(*document_ids)));
162  term_frequencies = reinterpret_cast<decltype(term_frequencies)>(memory.malloc(get_highest_document_id() * sizeof(*term_frequencies)));
163  temporary = reinterpret_cast<decltype(temporary)>(memory.malloc(temporary_size));
164  }
165  }
166 
167  public:
168  /*
169  INDEX_MANAGER_SEQUENTIAL::INDEX_MANAGER_SEQUENTIAL()
170  ----------------------------------------------------
171  */
176  index_manager(),
177  index(memory),
178  primary_key(memory, 1000, 1.5),
179  document_ids(nullptr),
180  term_frequencies(nullptr),
181  temporary_size(0),
182  temporary(nullptr)
183  {
184  /* Nothing */
185  }
186 
187  /*
188  INDEX_MANAGER_SEQUENTIAL::~INDEX_MANAGER_SEQUENTIAL()
189  -----------------------------------------------------
190  */
195  {
196  /* Nothing */
197  }
198 
199 
200  /*
201  INDEX_MANAGER_SEQUENTIAL::BEGIN_DOCUMENT()
202  ------------------------------------------
203  */
208  virtual void begin_document(const slice &document_primary_key)
209  {
210  /*
211  Tell index_manager that we have started a new document
212  */
213  index_manager::begin_document(document_primary_key);
214  primary_key.push_back(slice(memory, document_primary_key));
215  }
216 
217 
218  /*
219  INDEX_MANAGER_SEQUENTIAL::SET_PRIMARY_KEYS()
220  --------------------------------------------
221  */
229  virtual void set_primary_keys(const std::vector<slice> &keys)
230  {
231  for (auto key : keys)
232  primary_key.push_back(key);
233  }
234 
235  /*
236  INDEX_MANAGER_SEQUENTIAL::TERM()
237  --------------------------------
238  */
243  virtual void term(const parser::token &term)
244  {
245  index[term.lexeme].push_back(get_highest_document_id(), term.count);
246  }
247 
248  /*
249  INDEX_MANAGER_SEQUENTIAL::TERM()
250  --------------------------------
251  */
257  virtual void term(const parser::token &term, const std::vector<posting> &postings_list)
258  {
259  index[term.lexeme].push_back(postings_list);
260  }
261 
262  /*
263  INDEX_MANAGER_SEQUENTIAL::TERM()
264  --------------------------------
265  */
271  virtual void term(const parser::token &term, compress_integer::integer docid)
272  {
273  index[term.lexeme].push_back(docid);
274  }
275 
276  /*
277  INDEX_MANAGER_SEQUENTIAL::TEXT_RENDER()
278  ---------------------------------------
279  */
284  virtual void text_render(std::ostream &stream) const
285  {
286  stream << index;
287  }
288 
289  /*
290  INDEX_MANAGER_SEQUENTIAL::ITERATE()
291  -----------------------------------
292  */
297  virtual void iterate(index_manager::delegate &callback)
298  {
299  /*
300  Make sure we have allocated the memory necessary for iteration (i.e. to linearize the postings lists).
301  */
302  make_space();
303 
304  /*
305  Iterate over the hash table calling the callback function with each term->postings pair.
306  */
307  for (const auto &[key, value] : index)
308  {
309  auto document_frequency = value.linearize(temporary, temporary_size, document_ids, term_frequencies, get_highest_document_id());
310  callback(key, value, document_frequency, document_ids, term_frequencies);
311  }
312 
313  /*
314  Iterate over the primary keys calling the callback function with each docid->key pair.
315  Note that the search engine counts documents from 1, not from 0.
316  */
317  size_t instance = 0;
318  callback(instance, slice("-"));
319  for (const auto &term : primary_key)
320  callback(++instance, term);
321  }
322 
323  /*
324  INDEX_MANAGER_SEQUENTIAL::ITERATE()
325  -----------------------------------
326  */
333  {
334  /*
335  Make sure we have allocated the memory necessary for iteration (i.e. to linearize the postings lists).
336  */
337  make_space();
338 
339  /*
340  Iterate over the hash table calling the callback function with each term->postings pair.
341  */
342  for (const auto &[term, postings] : index)
343  {
344  auto document_frequency = postings.linearize(temporary, temporary_size, document_ids, term_frequencies, get_highest_document_id());
345  quantizer(callback, term, postings, document_frequency, document_ids, term_frequencies);
346  }
347 
348  /*
349  Iterate over the primary keys calling the callback function with each docid->key pair.
350  Note that the search engine counts documents from 1, not from 0.
351  */
352  size_t instance = 0;
353  quantizer(callback, instance, slice("-"));
354  for (const auto &term : primary_key)
355  quantizer(callback, ++instance, term);
356  }
357 
358  /*
359  INDEX_MANAGER_SEQUENTIAL::UNITTEST_BUILD_INDEX()
360  ------------------------------------------------
361  Build a index from the standard 10-document collection.
362  */
368  static void unittest_build_index(index_manager_sequential &index, const std::string &document_collection)
369  {
370  class parser parser; // We need a parser
371  document document; // That creates documents
372  std::shared_ptr<instream> file(new instream_memory(document_collection.c_str(), document_collection.size())); // From this stream (the standard 10 document stream).
373  instream_document_trec source(file); // Set up the instream
374 
375  /*
376  Build an index
377  */
378  do
379  {
380  compress_integer::integer document_length = 0;
381  /*
382  Read the next document and give it to the parser (until eof).
383  */
384  document.rewind();
385  source.read(document);
386  if (document.isempty())
387  break;
388  parser.set_document(document);
389 
390  /*
391  Mark the start of a new document.
392  */
393  index.begin_document(document.primary_key);
394 
395  /*
396  For each document, read the token stream and add to the hash table.
397  */
398  bool finished = false;
399  do
400  {
401  /*
402  Get the next token.
403  */
404  const auto &token = parser.get_next_token();
405 
406  /*
407  Different behaviour based on its type.
408  */
409  switch (token.type)
410  {
412  document_length++;
413  finished = true; // finish up.
414  break;
417  document_length++;
418  index.term(token); // add it to the index.
419  break;
420  default:
421  break; // ignore.
422  }
423  }
424  while (!finished);
425 
426  /*
427  Mark that we're at the end of the document
428  */
429  index.end_document(document_length);
430  }
431  while (!document.isempty());
432  }
433 
434  /*
435  INDEX_MANAGER_SEQUENTIAL::UNITTEST()
436  ------------------------------------
437  */
441  static void unittest(void)
442  {
443  /*
444  This is the postings answer that is expected
445  */
446  std::string answer
447  (
448  "6-><6,1>\n"
449  "1-><1,1>\n"
450  "4-><4,1>\n"
451  "5-><5,1>\n"
452  "3-><3,1>\n"
453  "8-><8,1>\n"
454  "7-><7,1>\n"
455  "2-><2,1>\n"
456  "9-><9,1>\n"
457  "10-><10,1>\n"
458  "four-><7,1><8,1><9,1><10,1>\n"
459  "eight-><3,1><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n"
460  "five-><6,1><7,1><8,1><9,1><10,1>\n"
461  "seven-><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n"
462  "two-><9,1><10,1>\n"
463  "six-><5,1><6,1><7,1><8,1><9,1><10,1>\n"
464  "three-><8,1><9,1><10,1>\n"
465  "one-><10,1>\n"
466  "nine-><2,1><3,1><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n"
467  "ten-><1,1><2,1><3,1><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n"
468  );
469  /*
470  This is the docid to primary_key answer
471  */
472  std::string primary_key_answer
473  (
474  "0->-\n"
475  "1->1\n"
476  "2->2\n"
477  "3->3\n"
478  "4->4\n"
479  "5->5\n"
480  "6->6\n"
481  "7->7\n"
482  "8->8\n"
483  "9->9\n"
484  "10->10\n"
485  );
486 
487  /*
488  Build the index for the standard 10 document collection
489  */
492 
493  /*
494  Serialise it
495  */
496  std::ostringstream computed_result;
497  computed_result << index;
498 
499  /*
500  Check it produced the expected index
501  */
502  JASS_assert(computed_result.str() == answer);
503 
504  /*
505  Test the iterating callback mechanism
506  */
507  std::ostringstream postings_result;
508  std::ostringstream primary_key_result;
509  delegate callback(10, postings_result, primary_key_result);
510  index.iterate(callback);
511 
512  JASS_assert(postings_result.str() == answer);
513  JASS_assert(primary_key_result.str() == primary_key_answer);
514 
515  /*
516  Done
517  */
518  puts("index_manager_sequential::PASSED");
519  }
520  };
521  }
virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function to serialise the postings (given the term) is operator().
Definition: index_manager_sequential.h:122
Non-thread-safe object that accumulates a single postings list during indexing.
Definition: index_postings.h:40
virtual void operator()(size_t document_id, const slice &primary_key)
The callback function to serialise the primary keys (external document ids) is operator().
Definition: index_manager_sequential.h:136
virtual void term(const parser::token &term, compress_integer::integer docid)
Hand a new term with a pre-computed postings list to this object.
Definition: index_manager_sequential.h:271
Child class of instream for creating documents from TREC pre-web (i.e. news articles) data...
allocator_pool memory
All memory in allocatged from this allocator.
Definition: index_manager_sequential.h:41
C++ slices (string-descriptors)
Definition: slice.h:27
Non-thread-Safe object that holds a single postings list during indexing.
Base class for the callback function called by iterate.
Definition: index_manager_sequential.h:61
Simple, but fast, XML parser.
Definition: parser.h:39
Container class representing a document through the indexing pipeline.
Definition: document.h:31
Base class for the indexer object that stored the actual index during indexing.
virtual void * malloc(size_t bytes, size_t alignment=alignment_boundary)
Allocate a small chunk of memory from the internal block and return a pointer to the caller...
Definition: allocator_pool.cpp:90
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
virtual void end_document(compress_integer::integer document_length)
Tell this object that you&#39;ve finished with the current document (and are about to move on to the next...
Definition: index_manager.h:258
numeric token
Definition: parser.h:68
#define JASS_assert(expression)
Drop in replacement for assert() that aborts in Release as well as Debug.
Definition: asserts.h:33
slice lexeme
The token itself, stored as a slice (pointer / length pair)
Definition: parser.h:85
virtual ~index_manager_sequential()
Destructor.
Definition: index_manager_sequential.h:194
static void unittest(void)
Unit test this class.
Definition: index_manager_sequential.h:441
virtual void term(const parser::token &term)
Hand a new term from the token stream to this object.
Definition: index_manager_sequential.h:243
void make_space(void)
make sure all the internal buffers needed for iteration have been allocated
Definition: index_manager_sequential.h:149
std::ostream & primary_keys_out
The unit test iterates into this.
Definition: index_manager_sequential.h:65
virtual void iterate(index_manager::delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager_sequential.h:297
virtual void text_render(std::ostream &stream) const
Dump a human-readable version of the index down the stream.
Definition: index_manager_sequential.h:284
index_postings_impact::impact_type * term_frequencies
The re-used buffer storing the term frequencies.
Definition: index_manager_sequential.h:49
Simple XML parser that does&#39;t do either attributes or entities.
void push_back(const TYPE &element)
Add an element to the end of the array.
Definition: dynamic_array.h:261
static std::string ten_documents
Ten TREC formatted documents with ten terms (ten .. one) where each term occurs it&#39;s count number of ...
Definition: unittest_data.h:25
hash_table< slice, index_postings, 24 > index
The index is a hash table of index_postings keyed on the term (a slice).
Definition: index_manager_sequential.h:42
virtual void begin_document(const slice &primary_key)
Tell this object that you&#39;re about to start indexing a new object.
Definition: index_manager.h:219
virtual void begin_document(const slice &document_primary_key)
Tell this object that you&#39;re about to start indexing a new object.
Definition: index_manager_sequential.h:208
static void unittest_build_index(index_manager_sequential &index, const std::string &document_collection)
Build and index for the 10 sample documents. This is used by several unit tests that need a valid ind...
Definition: index_manager_sequential.h:368
delegate(size_t documents_in_collection, std::ostream &postings, std::ostream &primary_keys)
Constructor.
Definition: index_manager_sequential.h:78
Subclass of instream for reading data from a memory buffer.
Definition: instream_memory.h:28
virtual void set_document(const class document &document)
Start parsing from the start of this document.
Definition: parser.h:196
virtual void term(const parser::token &term, const std::vector< posting > &postings_list)
Hand a new term with a pre-computed postings list to this object.
Definition: index_manager_sequential.h:257
Tuple for a posting in a traditions <d,tf> postings list.
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token.
Definition: parser.cpp:79
Simple block-allocator that internally allocates a large chunk then allocates smaller blocks from thi...
Definition: allocator_pool.h:61
Base class for the callback function called by iterate.
Definition: index_manager.h:120
compress_integer::integer * document_ids
The re-used buffer storing decoded document ids.
Definition: index_manager_sequential.h:48
std::ostream & postings_out
The unit test iterates into this.
Definition: index_manager_sequential.h:64
The final token is marked as an EOF token (and has no content).
Definition: parser.h:78
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
Data that can be used and re-used for unittests (and other purposes).
Non-thread-Safe indexer object.
Definition: index_manager_sequential.h:38
size_t temporary_size
The number of bytes in temporary.
Definition: index_manager_sequential.h:50
A token as returned by the parser.
Definition: parser.h:58
Child class of instream for creating documents from TREC pre-web (i.e. news articles) data...
Definition: instream_document_trec.h:35
Thread-safe hash table (without delete).
Definition: hash_table.h:41
Thread-safe grow-only dynamic array using the thread-safe allocator.
Definition: allocator_pool.h:32
File based I/O methods including whole file and partial files.
Definition: file.h:45
virtual void finish(void)
Any final clean up.
Definition: index_manager_sequential.h:93
Thread-safe hash table (without delete).
Base class for holding the index during indexing.
Definition: index_manager.h:33
Base class for the callback function called by iterate.
Definition: index_manager.h:47
Subclass of instream for reading data from a memory buffer.
Definition: compress_integer_elias_delta_simd.c:23
virtual ~delegate()
Destructor.
Definition: index_manager_sequential.h:105
uint8_t * temporary
Temporary buffer - cannot be used to store anything between calls.
Definition: index_manager_sequential.h:51
alphabetic token
Definition: parser.h:67
compress_integer::integer get_highest_document_id(void) const
Return the number of documents that have been successfully indexed or are in the process of being ind...
Definition: index_manager.h:345
index_postings_impact::impact_type count
The number of times the token is seen (normally 1, but if parsing a forward index it might be known t...
Definition: parser.h:87
virtual void read(document &buffer)
Read the next document from the source instream into document.
Definition: instream_document_trec.cpp:83
dynamic_array< slice > primary_key
The list of primary keys (i.e. external document identifiers) allocated in memory.
Definition: index_manager_sequential.h:43
index_manager_sequential()
Constructor.
Definition: index_manager_sequential.h:175
virtual void iterate(index_manager::quantizing_delegate &quantizer, index_manager::delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager_sequential.h:332
virtual void set_primary_keys(const std::vector< slice > &keys)
Add a list of primary keys to the current list. Normally used to set it without actually indexing (wa...
Definition: index_manager_sequential.h:229