169 num_postings_lists(0),
171 total_postings_lists(0),
173 total_terms_in_collection(0),
174 average_doclength(0),
189 std::cout <<
"version:" << version <<
"\n";
190 std::cout <<
"num_postings_lists:" << num_postings_lists <<
"\n";
191 std::cout <<
"num_docs:" << num_docs <<
"\n";
192 std::cout <<
"total_postings_lists:" << total_postings_lists <<
"\n";
193 std::cout <<
"total_docs:" << total_docs <<
"\n";
194 std::cout <<
"total_terms_in_collection:" << total_terms_in_collection <<
"\n";
195 std::cout <<
"average_doclength:" << average_doclength <<
"\n";
196 std::cout <<
"description:" << description <<
"\n";
229 while (stream < stream_end)
236 uint64_t value = protobuf::get_uint64_t(stream);
238 into.
docid =
static_cast<uint32_t
>(value);
264 while (stream < stream_end)
271 uint64_t value = protobuf::get_uint64_t(stream);
282 slice term = protobuf::get_blob(stream);
288 const uint8_t *here =
reinterpret_cast<const uint8_t *
> (term.
address());
290 if (get_next_postings_pair(d_tf_pair, here, here + term.
size()) ==
FAIL)
311 document_frequency = 0;
312 collection_frequency = 0;
356 collection_docid.
clear();
369 size_t length = protobuf::get_uint64_t(stream);
370 const uint8_t *stream_end = stream + length;
372 while (stream < stream_end)
379 uint64_t value = protobuf::get_uint64_t(stream);
382 into.
docid = (uint32_t)value;
390 slice term = protobuf::get_blob(stream);
431 stream(source.stream),
461 int64_t postings_list_length = protobuf::get_uint64_t(stream);
469 which = (std::numeric_limits<decltype(which)>::max)();
486 bool answer = which <= with.
which;
564 stream(source.stream),
587 which = (std::numeric_limits<decltype(which)>::max)();
618 return which <= with.
which;
687 size_t length = protobuf::get_uint64_t(stream);
688 const uint8_t *stream_end = stream + length;
690 while (stream < stream_end)
697 uint64_t value = protobuf::get_uint64_t(stream);
700 header.
version = (uint32_t)value;
716 double value = protobuf::get_double(stream);
724 slice term = protobuf::get_blob(stream);
748 source_file(source_file),
const uint8_t * source_file
The CIFF file in memory.
Definition: ciff_lin.h:667
uint64_t collection_frequency
The number of times the term occurs in the collection.
Definition: ciff_lin.h:212
error_code status
OK or FAIL (FAIL only on error in input stream)
Definition: ciff_lin.h:672
const uint8_t * stream
Where in the source stream we are reading from.
Definition: ciff_lin.h:547
postings_list_iterator(ciff_lin &of, size_t which)
Constructor.
Definition: ciff_lin.h:429
void clear(void)
removes all content from the postings list
Definition: ciff_lin.h:308
A <docid, tf> tuple.
Definition: posting.h:28
error_code read_header(header &header)
Read the CIFF header containing details about how many postings lists, etc.
Definition: ciff_lin.h:684
size_t which
Which doc record we are currently on (counting from 0)
Definition: ciff_lin.h:548
ciff_lin::docrecords_iterator end()
Return an iterator to the end of this object.
Definition: ciff_lin.h:660
iterator class for iterating over an index
Definition: ciff_lin.h:543
uint64_t document_frequency
The number of documents containing the term.
Definition: ciff_lin.h:211
C++ slices (string-descriptors)
Definition: slice.h:27
bool operator!=(docrecords_iterator &with)
Compare two iterators.
Definition: ciff_lin.h:616
const uint8_t * stream
Where in the CIFF we currently are.
Definition: ciff_lin.h:668
void * address(void) const
Extract the pointer value from the slice.
Definition: slice.h:269
header ciff_header
The header from the CIFF file.
Definition: ciff_lin.h:669
Method completed successfully.
Definition: ciff_lin.h:136
std::vector< posting > postings
The postings list, a vector of <d,tf> tuples.
Definition: ciff_lin.h:213
size_t which
Which postings list we are currently on (counting from 0)
Definition: ciff_lin.h:416
Variable byte encoded integer of up-to 64 bits, signed or unsigned.
Definition: protobuf.h:44
static error_code get_next_postings_pair(posting &into, const uint8_t *&stream, const uint8_t *stream_end)
read a <d,tf> pair from a protobuf encoded stream
Definition: ciff_lin.h:227
slice term
The term as a <pointer,length> tuple.
Definition: ciff_lin.h:210
docrecords_foreach docrecords(void)
Return an object capable of being an iterator for document details. Assumes the "file pointer" is in ...
Definition: ciff_lin.h:777
bool operator!=(postings_list_iterator &with)
Compare two iterators.
Definition: ciff_lin.h:484
Hand crafted methods to read protocol buffer (protobuf) encoded files.
ciff_lin & source
The ciff_lin being iterated over.
Definition: ciff_lin.h:546
slice collection_docid
The primary key as a <pointer,length> tuple.
Definition: ciff_lin.h:328
A postings list with a term, df, cf, and postings list of <d,tf> pairs.
Definition: ciff_lin.h:207
Method did not complete successfully.
Definition: ciff_lin.h:137
ciff_lin::postings_list_iterator end()
Return an iterator to the end of this object.
Definition: ciff_lin.h:530
doc_record & operator*()
Return the most recently constructed postings list.
Definition: ciff_lin.h:603
docrecords_iterator & operator++()
Move on to the next postings list by constructing and storing the current one. On error move to the e...
Definition: ciff_lin.h:578
void clear(void)
Clear the contents of this doc record.
Definition: ciff_lin.h:353
static error_code get_next_postings(postings_list &into, const uint8_t *&stream, const uint8_t *stream_end)
read a full postings list (including term, df, cf, etc.) from a protobuf encoded stream ...
Definition: ciff_lin.h:262
ciff_lin & source
The ciff_lin being iterated over.
Definition: ciff_lin.h:414
void clear(void)
Construct an empty slice with a pool allocator.
Definition: slice.h:223
ciff_lin::docrecords_iterator begin()
Return an iterator to the start of this object.
Definition: ciff_lin.h:648
Tuple for a posting in a traditional <d,tf> postings list.
ciff_lin::postings_list_iterator begin()
Return an iterator to the start of this object.
Definition: ciff_lin.h:518
postings_list postings
The postings list we just made.
Definition: ciff_lin.h:417
Little endian 64-bit number, integer or floating point types.
Definition: protobuf.h:45
const uint8_t * stream
Where in the source stream we are reading from.
Definition: ciff_lin.h:415
postings_foreach postings(void)
Return an object capable of being an iterator for postings lists. Assumes the "file pointer" is in th...
Definition: ciff_lin.h:764
postings_list_iterator & operator++()
Move on to the next postings list by constructing and storing the current one. On error move to the e...
Definition: ciff_lin.h:445
postings_list & operator*()
Return the most recently constructed postings list.
Definition: ciff_lin.h:459
static error_code get_next(doc_record &into, const uint8_t *&stream)
Get the next doc record from the stream.
Definition: ciff_lin.h:367
An object used to allow iteration over document records.
Definition: ciff_lin.h:629
Reader for Jimmy Lin's shared index format.
Definition: ciff_lin.h:127
header & get_header(void)
Return the header object.
Definition: ciff_lin.h:790
compress_integer::integer docid
The Document identifier.
Definition: posting.h:31
error_code
success or failure.
Definition: ciff_lin.h:134
Definition: compress_integer_elias_delta_simd.c:23
iterator class for iterating over an index
Definition: ciff_lin.h:411
static uint8_t get_type_and_field(wire_type &type, const uint8_t *&stream)
Extract the field number and its type from the stream.
Definition: protobuf.h:259
ciff_lin(const uint8_t *source_file)
Constructor.
Definition: ciff_lin.h:747
docrecords_iterator(ciff_lin &of, size_t which)
Constructor.
Definition: ciff_lin.h:562
int32_t docid
The docid in the postings lists.
Definition: ciff_lin.h:327
size_t size(void) const
Return the length of this slice.
Definition: slice.h:256
int32_t doclength
The length of the document.
Definition: ciff_lin.h:329
doc_record current
The doc record list we just made.
Definition: ciff_lin.h:549
wire_type
The known protobuf types for protobuf 3.
Definition: protobuf.h:42
compress_integer::integer term_frequency
The number of times the term occurs in document with id docid.
Definition: posting.h:32
a document record object containing document lengths and primary keys
Definition: ciff_lin.h:324
doc_record()
Constructor.
Definition: ciff_lin.h:339
An object used to allow iteration over postings lists.
Definition: ciff_lin.h:499
Blob or string. length VARINT then data.
Definition: protobuf.h:46