78 delegate(
size_t documents_in_collection, std::ostream &postings, std::ostream &primary_keys) :
80 postings_out(postings),
81 primary_keys_out(primary_keys)
124 postings_out << term <<
"->" << postings << std::endl;
138 primary_keys_out << document_id <<
"->" << primary_key << std::endl;
153 if (new_temporary_size > temporary_size)
155 temporary_size = new_temporary_size;
163 temporary =
reinterpret_cast<decltype(temporary)
>(memory.
malloc(temporary_size));
178 primary_key(memory, 1000, 1.5),
179 document_ids(nullptr),
180 term_frequencies(nullptr),
231 for (
auto key : keys)
259 index[term.
lexeme].push_back(postings_list);
273 index[term.
lexeme].push_back(docid);
307 for (
const auto &[key, value] : index)
309 auto document_frequency = value.linearize(temporary, temporary_size, document_ids, term_frequencies,
get_highest_document_id());
310 callback(key, value, document_frequency, document_ids, term_frequencies);
318 callback(instance,
slice(
"-"));
319 for (
const auto &
term : primary_key)
320 callback(++instance,
term);
342 for (
const auto &[
term, postings] : index)
344 auto document_frequency = postings.linearize(temporary, temporary_size, document_ids, term_frequencies,
get_highest_document_id());
345 quantizer(callback,
term, postings, document_frequency, document_ids, term_frequencies);
353 quantizer(callback, instance,
slice(
"-"));
354 for (
const auto &
term : primary_key)
355 quantizer(callback, ++instance,
term);
372 std::shared_ptr<instream>
file(
new instream_memory(document_collection.c_str(), document_collection.size()));
385 source.
read(document);
386 if (document.isempty())
398 bool finished =
false;
431 while (!document.isempty());
458 "four-><7,1><8,1><9,1><10,1>\n" 459 "eight-><3,1><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n" 460 "five-><6,1><7,1><8,1><9,1><10,1>\n" 461 "seven-><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n" 463 "six-><5,1><6,1><7,1><8,1><9,1><10,1>\n" 464 "three-><8,1><9,1><10,1>\n" 466 "nine-><2,1><3,1><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n" 467 "ten-><1,1><2,1><3,1><4,1><5,1><6,1><7,1><8,1><9,1><10,1>\n" 472 std::string primary_key_answer
496 std::ostringstream computed_result;
497 computed_result <<
index;
507 std::ostringstream postings_result;
508 std::ostringstream primary_key_result;
509 delegate callback(10, postings_result, primary_key_result);
510 index.iterate(callback);
513 JASS_assert(primary_key_result.str() == primary_key_answer);
518 puts(
"index_manager_sequential::PASSED");
virtual void operator()(const slice &term, const index_postings &postings, compress_integer::integer document_frequency, compress_integer::integer *document_ids, index_postings_impact::impact_type *term_frequencies)
The callback function to serialise the postings (given the term) is operator().
Definition: index_manager_sequential.h:122
Non-thread-safe object that accumulates a single postings list during indexing.
Definition: index_postings.h:40
virtual void operator()(size_t document_id, const slice &primary_key)
The callback function to serialise the primary keys (external document ids) is operator().
Definition: index_manager_sequential.h:136
virtual void term(const parser::token &term, compress_integer::integer docid)
Hand a new term with a pre-computed postings list to this object.
Definition: index_manager_sequential.h:271
Child class of instream for creating documents from TREC pre-web (i.e. news articles) data...
allocator_pool memory
All memory is allocated from this allocator.
Definition: index_manager_sequential.h:41
C++ slices (string-descriptors)
Definition: slice.h:27
Non-thread-Safe object that holds a single postings list during indexing.
Base class for the callback function called by iterate.
Definition: index_manager_sequential.h:61
Simple, but fast, XML parser.
Definition: parser.h:39
Container class representing a document through the indexing pipeline.
Definition: document.h:31
Base class for the indexer object that stored the actual index during indexing.
virtual void * malloc(size_t bytes, size_t alignment=alignment_boundary)
Allocate a small chunk of memory from the internal block and return a pointer to the caller...
Definition: allocator_pool.cpp:90
uint32_t integer
This class and descendants will work on integers of this size. Do not change without also changing JA...
Definition: compress_integer.h:40
virtual void end_document(compress_integer::integer document_length)
Tell this object that you've finished with the current document (and are about to move on to the next...
Definition: index_manager.h:258
numeric token
Definition: parser.h:68
#define JASS_assert(expression)
Drop in replacement for assert() that aborts in Release as well as Debug.
Definition: asserts.h:33
slice lexeme
The token itself, stored as a slice (pointer / length pair)
Definition: parser.h:85
virtual ~index_manager_sequential()
Destructor.
Definition: index_manager_sequential.h:194
static void unittest(void)
Unit test this class.
Definition: index_manager_sequential.h:441
virtual void term(const parser::token &term)
Hand a new term from the token stream to this object.
Definition: index_manager_sequential.h:243
void make_space(void)
make sure all the internal buffers needed for iteration have been allocated
Definition: index_manager_sequential.h:149
std::ostream & primary_keys_out
The unit test iterates into this.
Definition: index_manager_sequential.h:65
virtual void iterate(index_manager::delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager_sequential.h:297
virtual void text_render(std::ostream &stream) const
Dump a human-readable version of the index down the stream.
Definition: index_manager_sequential.h:284
index_postings_impact::impact_type * term_frequencies
The re-used buffer storing the term frequencies.
Definition: index_manager_sequential.h:49
Simple XML parser that doesn't do either attributes or entities.
void push_back(const TYPE &element)
Add an element to the end of the array.
Definition: dynamic_array.h:261
static std::string ten_documents
Ten TREC formatted documents with ten terms (ten .. one) where each term occurs its count number of ...
Definition: unittest_data.h:25
hash_table< slice, index_postings, 24 > index
The index is a hash table of index_postings keyed on the term (a slice).
Definition: index_manager_sequential.h:42
virtual void begin_document(const slice &primary_key)
Tell this object that you're about to start indexing a new object.
Definition: index_manager.h:219
virtual void begin_document(const slice &document_primary_key)
Tell this object that you're about to start indexing a new object.
Definition: index_manager_sequential.h:208
static void unittest_build_index(index_manager_sequential &index, const std::string &document_collection)
Build an index for the 10 sample documents. This is used by several unit tests that need a valid ind...
Definition: index_manager_sequential.h:368
delegate(size_t documents_in_collection, std::ostream &postings, std::ostream &primary_keys)
Constructor.
Definition: index_manager_sequential.h:78
Subclass of instream for reading data from a memory buffer.
Definition: instream_memory.h:28
virtual void set_document(const class document &document)
Start parsing from the start of this document.
Definition: parser.h:196
virtual void term(const parser::token &term, const std::vector< posting > &postings_list)
Hand a new term with a pre-computed postings list to this object.
Definition: index_manager_sequential.h:257
Tuple for a posting in a traditional <d,tf> postings list.
virtual const class parser::token & get_next_token(void)
Continue parsing the input looking for the next token.
Definition: parser.cpp:79
Simple block-allocator that internally allocates a large chunk then allocates smaller blocks from thi...
Definition: allocator_pool.h:61
Base class for the callback function called by iterate.
Definition: index_manager.h:120
compress_integer::integer * document_ids
The re-used buffer storing decoded document ids.
Definition: index_manager_sequential.h:48
std::ostream & postings_out
The unit test iterates into this.
Definition: index_manager_sequential.h:64
The final token is marked as an EOF token (and has no content).
Definition: parser.h:78
Definition: document_id.h:16
uint16_t impact_type
An impact value (i.e. a term frequency value) is of this type.
Definition: index_postings_impact.h:41
Data that can be used and re-used for unittests (and other purposes).
Non-thread-Safe indexer object.
Definition: index_manager_sequential.h:38
size_t temporary_size
The number of bytes in temporary.
Definition: index_manager_sequential.h:50
A token as returned by the parser.
Definition: parser.h:58
Child class of instream for creating documents from TREC pre-web (i.e. news articles) data...
Definition: instream_document_trec.h:35
Thread-safe hash table (without delete).
Definition: hash_table.h:41
Thread-safe grow-only dynamic array using the thread-safe allocator.
Definition: allocator_pool.h:32
File based I/O methods including whole file and partial files.
Definition: file.h:45
virtual void finish(void)
Any final clean up.
Definition: index_manager_sequential.h:93
Thread-safe hash table (without delete).
Base class for holding the index during indexing.
Definition: index_manager.h:33
Base class for the callback function called by iterate.
Definition: index_manager.h:47
Subclass of instream for reading data from a memory buffer.
Definition: compress_integer_elias_delta_simd.c:23
virtual ~delegate()
Destructor.
Definition: index_manager_sequential.h:105
uint8_t * temporary
Temporary buffer - cannot be used to store anything between calls.
Definition: index_manager_sequential.h:51
alphabetic token
Definition: parser.h:67
compress_integer::integer get_highest_document_id(void) const
Return the number of documents that have been successfully indexed or are in the process of being ind...
Definition: index_manager.h:345
index_postings_impact::impact_type count
The number of times the token is seen (normally 1, but if parsing a forward index it might be known t...
Definition: parser.h:87
virtual void read(document &buffer)
Read the next document from the source instream into document.
Definition: instream_document_trec.cpp:83
dynamic_array< slice > primary_key
The list of primary keys (i.e. external document identifiers) allocated in memory.
Definition: index_manager_sequential.h:43
index_manager_sequential()
Constructor.
Definition: index_manager_sequential.h:175
virtual void iterate(index_manager::quantizing_delegate &quantizer, index_manager::delegate &callback)
Iterate over the index calling callback.operator() with each postings list.
Definition: index_manager_sequential.h:332
virtual void set_primary_keys(const std::vector< slice > &keys)
Add a list of primary keys to the current list. Normally used to set it without actually indexing (wa...
Definition: index_manager_sequential.h:229